// gam 0.3.112 - Docs.rs

use crate::cache::Fingerprinter;
use crate::faer_ndarray::FaerEigh;
use crate::faer_ndarray::{FaerCholesky, fast_atb, fast_av};
use crate::matrix::{
    DesignMatrix, EmbeddedColumnBlock, LinearOperator, SignedWeightsView, SymmetricMatrix,
    dense_rowwise_kronecker,
};
use crate::pirls::{LinearInequalityConstraints, solve_newton_directionwith_lower_bounds};
use crate::resource::{DerivativeStorageMode, ResourcePolicy};
use crate::solver::active_set::{
    project_stationarity_residual_on_constraint_cone, solve_quadratic_with_linear_constraints,
};
use crate::solver::estimate::reml::penalty_logdet::PenaltyPseudologdet;
use crate::solver::estimate::reml::unified::{
    BlockCoupledOperator, ContractedPsiSecondOrder, ContractedPsiSecondOrderFn,
    DenseSpectralOperator, DispersionHandling, DriftDerivResult, FixedDriftDerivFn,
    HessianDerivativeProvider, HessianOperator, HyperCoord, HyperCoordDrift, HyperCoordPair,
    HyperOperator, MatrixFreeSpdOperator, PenaltySubspaceTrace, ProjectedKktResidual,
    StochasticTraceState, compute_block_penalty_logdet_derivs, exact_pseudo_logdet,
    positive_eigenvalue_threshold, spectral_epsilon, spectral_regularize,
};
use crate::solver::estimate::{
    EstimationError, FitGeometry, ensure_finite_scalar_estimation, validate_all_finite_estimation,
};
use crate::solver::persistent_warm_start::{
    PersistentBlockInnerSummary, PersistentBlockWarmStartRecord, load_block_record,
    store_block_record,
};
use crate::types::{RidgeDeterminantMode, RidgePolicy};
use faer::Side;
use ndarray::{Array1, Array2, ArrayView1, ArrayViewMut1, s};
use std::any::{Any, type_name};
use std::cell::RefCell;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::ops::Range;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, OnceLock, Weak};
use thiserror::Error;

pub use crate::solver::estimate::reml::unified::{EvalMode, PseudoLogdetMode};

/// A penalty matrix that may be stored in Kronecker-factored form.
///
/// For tensor-product terms (e.g. time-varying survival covariates), the penalty
/// has the structure `S = left ⊗ right` (Kronecker product). Keeping this
/// factored avoids materializing (p_left × p_right)² dense entries and enables
/// exact log-determinant computation via `log|A ⊗ B| = n_B log|A| + n_A log|B|`.
///
/// Dense penalties are stored as-is.  Callers that need a raw `Array2<f64>` can
/// call `to_dense()` (always returns an owned matrix) or `as_dense_cow()`
/// (borrows when possible, otherwise materializes).
#[derive(Clone, Debug)]
pub enum PenaltyMatrix {
    /// Fully materialized square penalty matrix.
    Dense(Array2<f64>),
    /// Factored penalty `left ⊗ right` of dimension
    /// `left.nrows() * right.nrows()`.  In the dense layout used by the
    /// accumulation helpers, entry
    /// `(i_l * p_right + i_r, j_l * p_right + j_r)` equals
    /// `left[i_l, j_l] * right[i_r, j_r]`.
    KroneckerFactored {
        left: Array2<f64>,
        right: Array2<f64>,
    },
    /// Block-local penalty: `local` is `block_dim × block_dim`, embedded at
    /// `col_range` in the full parameter space of dimension `total_dim`.
    /// Avoids materializing the full `total_dim × total_dim` matrix.
    Blockwise {
        local: Array2<f64>,
        col_range: std::ops::Range<usize>,
        total_dim: usize,
    },
    /// Wrapper assigning this penalty component to a user-visible precision
    /// label. Components with the same label share one smoothing parameter.
    Labeled {
        label: String,
        inner: Box<PenaltyMatrix>,
    },
    /// Wrapper fixing this penalty component at a physical log-precision.
    /// Fixed components remain in the block-local physical penalty layout but
    /// are removed from the REML outer coordinate vector.
    Fixed {
        log_lambda: f64,
        inner: Box<PenaltyMatrix>,
    },
}

impl PenaltyMatrix {
    /// Number of rows (= number of columns, since penalties are square).
    pub fn dim(&self) -> usize {
        match self {
            Self::Dense(m) => m.nrows(),
            Self::KroneckerFactored { left, right } => left.nrows() * right.nrows(),
            Self::Blockwise { total_dim, .. } => *total_dim,
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.dim(),
        }
    }

    /// Returns (nrows, ncols) like Array2::dim().
    pub fn shape(&self) -> (usize, usize) {
        let d = self.dim();
        (d, d)
    }

    /// Materialize the full dense matrix.
    pub fn to_dense(&self) -> Array2<f64> {
        match self {
            Self::Dense(m) => m.clone(),
            Self::KroneckerFactored { left, right } => {
                crate::terms::construction::kronecker_product(left, right)
            }
            Self::Blockwise {
                local,
                col_range,
                total_dim,
            } => {
                let mut g = Array2::zeros((*total_dim, *total_dim));
                g.slice_mut(ndarray::s![
                    col_range.start..col_range.end,
                    col_range.start..col_range.end
                ])
                .assign(local);
                g
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.to_dense(),
        }
    }

    /// Borrow the inner dense matrix if Dense, otherwise materialize.
    pub fn as_dense_cow(&self) -> std::borrow::Cow<'_, Array2<f64>> {
        match self {
            Self::Dense(m) => std::borrow::Cow::Borrowed(m),
            Self::KroneckerFactored { .. }
            | Self::Blockwise { .. }
            | Self::Labeled { .. }
            | Self::Fixed { .. } => std::borrow::Cow::Owned(self.to_dense()),
        }
    }

    /// Returns a reference to the inner matrix if this is a Dense variant.
    pub fn as_dense_ref(&self) -> Option<&Array2<f64>> {
        match self {
            Self::Dense(m) => Some(m),
            Self::Fixed { inner, .. } => inner.as_dense_ref(),
            Self::KroneckerFactored { .. } | Self::Blockwise { .. } | Self::Labeled { .. } => None,
        }
    }

    pub fn with_precision_label(self, label: impl Into<String>) -> Self {
        Self::Labeled {
            label: label.into(),
            inner: Box::new(self),
        }
    }

    pub fn precision_label(&self) -> Option<&str> {
        match self {
            Self::Labeled { label, .. } => Some(label.as_str()),
            Self::Fixed { .. } => None,
            _ => None,
        }
    }

    pub fn with_fixed_log_lambda(self, log_lambda: f64) -> Self {
        Self::Fixed {
            log_lambda,
            inner: Box::new(self),
        }
    }

    pub fn fixed_log_lambda(&self) -> Option<f64> {
        match self {
            Self::Fixed { log_lambda, .. } => Some(*log_lambda),
            Self::Labeled { inner, .. } => inner.fixed_log_lambda(),
            _ => None,
        }
    }

    /// Compute S * v using the Kronecker vec trick when factored:
    ///   (A ⊗ B) vec(V) = vec(B V Aᵀ)
    /// where V = reshape(v, (p_right, p_left)).
    pub fn dot(&self, v: &Array1<f64>) -> Array1<f64> {
        match self {
            Self::Dense(m) => m.dot(v),
            Self::KroneckerFactored { left, right } => {
                let p_left = left.nrows();
                let p_right = right.nrows();
                // v is (p_left * p_right,).  Reshape as (p_right, p_left).
                let v_mat =
                    ndarray::ArrayView2::from_shape((p_right, p_left), v.as_slice().unwrap())
                        .unwrap();
                // result = B V A' then flatten.
                let bv = right.dot(&v_mat);
                let bva = bv.dot(&left.t());
                Array1::from_iter(bva.iter().copied())
            }
            Self::Blockwise {
                local,
                col_range,
                total_dim,
            } => {
                let mut out = Array1::zeros(*total_dim);
                let v_block = v.slice(ndarray::s![col_range.clone()]);
                let result_block = local.dot(&v_block);
                out.slice_mut(ndarray::s![col_range.clone()])
                    .assign(&result_block);
                out
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.dot(v),
        }
    }

    /// Add λ * self to a mutable dense accumulator.
    pub fn add_scaled_to(&self, lambda: f64, target: &mut Array2<f64>) {
        match self {
            Self::Dense(m) => {
                target.scaled_add(lambda, m);
            }
            Self::KroneckerFactored { left, right } => {
                let p_left = left.nrows();
                let p_right = right.nrows();
                for i1 in 0..p_left {
                    for j1 in 0..p_left {
                        let a_ij = left[[i1, j1]];
                        if a_ij == 0.0 {
                            continue;
                        }
                        let scaled_a = lambda * a_ij;
                        for i2 in 0..p_right {
                            let row = i1 * p_right + i2;
                            for j2 in 0..p_right {
                                let col = j1 * p_right + j2;
                                target[[row, col]] += scaled_a * right[[i2, j2]];
                            }
                        }
                    }
                }
            }
            Self::Blockwise {
                local, col_range, ..
            } => {
                target
                    .slice_mut(ndarray::s![col_range.clone(), col_range.clone()])
                    .scaled_add(lambda, local);
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => {
                inner.add_scaled_to(lambda, target)
            }
        }
    }

    /// Add λ * diag(self) to a mutable diagonal accumulator.
    pub fn add_scaled_diag_to(&self, lambda: f64, target: &mut Array1<f64>) {
        match self {
            Self::Dense(m) => {
                let p = m.nrows().min(target.len());
                for j in 0..p {
                    target[j] += lambda * m[[j, j]];
                }
            }
            Self::KroneckerFactored { left, right } => {
                let p_left = left.nrows();
                let p_right = right.nrows();
                assert_eq!(target.len(), p_left * p_right);
                for i_left in 0..p_left {
                    let left_diag = left[[i_left, i_left]];
                    if left_diag == 0.0 {
                        continue;
                    }
                    let scaled_left = lambda * left_diag;
                    for i_right in 0..p_right {
                        target[i_left * p_right + i_right] +=
                            scaled_left * right[[i_right, i_right]];
                    }
                }
            }
            Self::Blockwise {
                local, col_range, ..
            } => {
                let width = local.nrows().min(col_range.len());
                for local_idx in 0..width {
                    target[col_range.start + local_idx] += lambda * local[[local_idx, local_idx]];
                }
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => {
                inner.add_scaled_diag_to(lambda, target)
            }
        }
    }

    /// Compute the quadratic form β' S β.
    pub fn quadratic_form(&self, beta: &Array1<f64>) -> f64 {
        match self {
            Self::Dense(m) => beta.dot(&m.dot(beta)),
            Self::KroneckerFactored { .. } => {
                let sv = self.dot(beta);
                beta.dot(&sv)
            }
            Self::Blockwise {
                local, col_range, ..
            } => {
                let beta_block = beta.slice(ndarray::s![col_range.clone()]);
                let sv = local.dot(&beta_block);
                beta_block.dot(&sv)
            }
            Self::Labeled { inner, .. } | Self::Fixed { inner, .. } => inner.quadratic_form(beta),
        }
    }

    /// Access dimensions like an Array2.
    pub fn nrows(&self) -> usize {
        self.dim()
    }

    pub fn ncols(&self) -> usize {
        self.dim()
    }
}

impl From<Array2<f64>> for PenaltyMatrix {
    fn from(m: Array2<f64>) -> Self {
        Self::Dense(m)
    }
}

/// Per-subject channel Hessian provider for multi-output families.
///
/// Multi-output families decompose the Fisher information as
///
/// ```text
/// I(β) = Σ_i  J_iᵀ W_i J_i
/// ```
///
/// with `J_i` the channel-stacked Jacobian of subject `i` (shape
/// `n_outputs × p`) and `W_i` the `n_outputs × n_outputs` per-subject channel
/// Hessian of the row negative log-likelihood (the second-derivative block of
/// `−log L_i(u_i)` at a pilot β, PSD-clamped).
///
/// A single-output family reduces this to the scalar IRLS weight; multi-output
/// families (survival marginal-slope: `n_outputs = 4`; location-scale:
/// `n_outputs = 2`) carry full cross-channel curvature.
///
/// The identifiability canonicalisation step uses the `n_outputs`-channel
/// weighted joint design `W_joint = Σ_i sqrt(W_i) ⊗ J_i` to detect
/// block-against-block aliasing.  When this trait is present on
/// `ParameterBlockSpec::channel_hessian`, `canonicalize_for_identifiability`
/// routes through `audit_identifiability_channel_aware`; when absent it falls
/// back to the scalar-weight flat audit.
///
/// # W-metric rank theorem
///
/// The canonicalisation computes `rank(J^T W J)` where `W_blkdiag` is the
/// block-diagonal matrix of the per-subject `W_i`.  This rank equals
///
/// ```text
/// rank(J) − dim(range(J) ∩ ker(W_blkdiag))
/// ```
///
/// so columns of `J` lying in the kernel of `W_blkdiag` (flat directions in
/// the curvature landscape at the pilot β) are identified as
/// curvature-redundant and may be dropped.
pub trait FamilyChannelHessian: Send + Sync {
    /// Number of output channels `n_outputs` (= K in the row Jacobian).
    fn n_outputs(&self) -> usize;

    /// Number of subjects (rows).
    fn n_subjects(&self) -> usize;

    /// Write subject `i`'s `n_outputs × n_outputs` channel Hessian `W_i` into
    /// `out`, row-major, with `out.len() == n_outputs * n_outputs`.
    /// Implementations must clamp negative eigenvalues to zero (PSD
    /// projection) before or inside this call.
    fn fill_subject(&self, i: usize, out: &mut [f64]);

    /// Materialise the full `(n_subjects × n_outputs × n_outputs)` tensor.
    /// The default implementation calls `fill_subject` once per subject,
    /// reusing a single scratch buffer.
    fn evaluate_full(&self) -> ndarray::Array3<f64> {
        let subjects = self.n_subjects();
        let channels = self.n_outputs();
        let mut tensor = ndarray::Array3::<f64>::zeros((subjects, channels, channels));
        let mut scratch = vec![0.0_f64; channels * channels];
        for subject in 0..subjects {
            self.fill_subject(subject, &mut scratch);
            // `scratch` is row-major, so flat index `a * channels + b` maps to
            // entry `(a, b)`.  The loop body never runs when `channels == 0`.
            for (flat, &value) in scratch.iter().enumerate() {
                tensor[[subject, flat / channels, flat % channels]] = value;
            }
        }
        tensor
    }

    /// Return a refreshed W evaluated at `beta` using `family_scalars` when
    /// those scalars carry the per-row primary state at the current β.
    ///
    /// # Fisher information identity
    ///
    /// `I(β) = J(β)^T W(β) J(β)`. T8 originally froze W at β=0; T34 refreshes
    /// both J and W at the current β so the audit's rank verdict reflects the
    /// actual local identifiability.
    ///
    /// # Default implementation (β-independent W)
    ///
    /// Families whose W does not depend on β (e.g. Gaussian-identity where
    /// `W = prior_w`) get a snapshot of `evaluate_full()` wrapped in a
    /// [`TensorChannelHessian`]. Nothing is recomputed. `beta` is only checked
    /// for NaN, and `family_scalars` only for being supplied together with an
    /// empty `beta`; both conditions yield `Err`.
    ///
    /// # Override (β-dependent W)
    ///
    /// Families with β-dependent W (e.g. survival marginal-slope where
    /// `W_i(β)` depends on `(q0_i, q1_i, qd1_i, g_i)`) must override this
    /// method and recompute W from the current primary state.
    ///
    /// When `beta` is non-zero in a way that affects W (i.e. `g_i != 0`),
    /// `family_scalars` MUST be `Some(..)`. Return `Err` if scalars are
    /// missing in that case (same error-message style as T26's contract).
    fn channel_hessian_at(
        &self,
        beta: &[f64],
        family_scalars: Option<&std::sync::Arc<dyn std::any::Any + Send + Sync>>,
    ) -> Result<Arc<dyn FamilyChannelHessian>, String> {
        // Reject garbage state up front so callers get an Err instead of a
        // silently stale W.  The default path does not need family_scalars;
        // family-specific overrides may.
        if beta.iter().copied().any(f64::is_nan) {
            return Err("channel_hessian_at: beta contains NaN".to_string());
        }
        if beta.is_empty() && family_scalars.is_some() {
            return Err(
                "channel_hessian_at: family_scalars supplied but beta is empty".to_string(),
            );
        }
        // W is β-independent here: snapshot the frozen tensor.
        Ok(Arc::new(TensorChannelHessian {
            h: self.evaluate_full(),
        }))
    }
}

/// A [`FamilyChannelHessian`] backed directly by a pre-computed
/// `(n × K × K)` tensor. Used by the default `channel_hessian_at`
/// implementation and by tests.
///
/// This is the β-independent path: `fill_subject` reads from the frozen
/// tensor without any recomputation.
pub struct TensorChannelHessian {
    /// Frozen channel Hessians: axis 0 indexes subjects, axes 1 and 2 index
    /// the output channels of each per-subject `K × K` block.
    pub h: ndarray::Array3<f64>,
}

impl FamilyChannelHessian for TensorChannelHessian {
    /// Channel count `K`, read from axis 1 of the stored tensor.
    fn n_outputs(&self) -> usize {
        let (_, channels, _) = self.h.dim();
        channels
    }

    /// Subject count, read from axis 0 of the stored tensor.
    fn n_subjects(&self) -> usize {
        let (subjects, _, _) = self.h.dim();
        subjects
    }

    /// Copy subject `i`'s `K × K` block into `out` in row-major order.
    ///
    /// Panics when `out.len() != K * K`.
    fn fill_subject(&self, i: usize, out: &mut [f64]) {
        let channels = self.n_outputs();
        assert_eq!(out.len(), channels * channels);
        // Flat index `a * K + b` holds entry `(a, b)`; `out` is empty when
        // `K == 0`, so the division below is never reached in that case.
        for (flat, slot) in out.iter_mut().enumerate() {
            *slot = self.h[[i, flat / channels, flat % channels]];
        }
    }

    /// The tensor is already materialised, so hand back a copy of it.
    fn evaluate_full(&self) -> ndarray::Array3<f64> {
        self.h.to_owned()
    }
}

/// β-linearization state passed to [`BlockEffectiveJacobian::effective_jacobian_at`].
///
/// At pre-fit initialization, pass `beta = &[]` / zeros and `family_scalars = None`.
/// Families that need β-dependent scalars (e.g. survival marginal-slope's q0, q1,
/// g, c, z) store them in `family_scalars` as a concrete type behind
/// `Arc<dyn Any + Send + Sync>` and downcast inside their impl.
pub struct FamilyLinearizationState<'a> {
    /// Coefficients at the linearization point.  May be empty at pre-fit
    /// initialization; the Jacobian callbacks in this file reject NaN entries.
    pub beta: &'a [f64],
    /// Optional family-shared scalars at this β linearization.
    /// Downcast via `state.family_scalars.as_ref().and_then(|a| a.downcast_ref::<T>())`.
    pub family_scalars: Option<Arc<dyn std::any::Any + Send + Sync>>,
    /// Optional per-subject channel Hessian for multi-output families.
    /// When `Some`, the identifiability canonicalisation step and the Gram
    /// builder use the channel-stacked Fisher information instead of the
    /// scalar-weight approximation.  Single-output families leave this `None`.
    pub channel_hessian: Option<Arc<dyn FamilyChannelHessian>>,
    /// Probit frailty scale factor `s_f = 1/√(1+σ²)`.
    ///
    /// For survival marginal-slope families the logslope η contribution is
    /// `s_f · g · z`, so any Jacobian callback that depends on g or z must
    /// read `s_f` from here rather than from a captured-at-construction value.
    /// When σ = 0 (no frailty) or for non-frailty families, set this to 1.0.
    ///
    /// Since σ is always **fixed** (not jointly optimised with β) in the
    /// survival family, `s_f` is a static scalar for the entire inner fit;
    /// `∂s_f/∂σ` never appears in the β-Jacobian.  The field is nonetheless
    /// carried through state so that Jacobian callbacks are not required to
    /// capture `s_f` at spec-construction time — they can read it at
    /// evaluation time and thus stay correct across outer-loop σ updates.
    pub probit_frailty_scale: f64,
}

/// β-dependent Jacobian callback for a parameter block.
///
/// Principled long-term contract for expressing how a block contributes to
/// the stacked linear predictor at a given β:
///
/// ```text
/// J(β) ∈ ℝ^{n_rows · n_outputs × p_block}
/// ```
///
/// - Single-output linear block: returns `design.clone()`.
/// - Row-scaled block (`RowScaledJacobian`): returns `diag(eta_scaling) · design` (still linear in β).
/// - Multi-output block (e.g. survival marginal-slope with η0, η1, ad1):
///   stacks `∂eta_r/∂β_k` for `r ∈ 0..n_outputs`, one channel after another
///   (channel-major; see [`Self::effective_jacobian_at`] for the exact row
///   layout).
///
/// The default impl on [`ParameterBlockSpec::effective_jacobian_at`] is:
/// - `jacobian_callback = None` → `design.clone()`.
/// - `jacobian_callback = Some(cb)` → delegates to `cb.effective_jacobian_at`.
pub trait BlockEffectiveJacobian: Send + Sync {
    /// Stacked multi-output Jacobian at the current β.
    ///
    /// Shape: `(n_rows * n_outputs, p_block)`, **channel-major**: rows
    /// `r * n_rows .. (r + 1) * n_rows` carry output channel `r`'s row
    /// Jacobian, so `stacked[r * n_rows + i, j]` is observation `i`'s row at
    /// output `r` and coefficient column `j`.  Every consumer that destacks
    /// this matrix (audit, canonicaliser, fit) relies on this layout — see
    /// `BlockJacobianAsRowOp::from_callback` for the destacking transpose.
    /// For `n_outputs = 1` this is identical to the `(n_rows, p_block)` effective
    /// design used by the flat identifiability audit.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when the Jacobian cannot be
    /// evaluated at `state`; the implementations in this file do so for a
    /// NaN entry in `state.beta` or inconsistent block dimensions.
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String>;

    /// Number of stacked output channels. 1 for most blocks.
    fn n_outputs(&self) -> usize {
        1
    }

    /// Returns the per-row scaling vector when this callback is a simple
    /// diagonal-scaling block (`RowScaledJacobian`).  Used by the
    /// identifiability audit's skewness-aware bias correction (T25).
    ///
    /// Returns `None` for all blocks except `RowScaledJacobian`.
    fn eta_row_scaling_for_skewness(&self) -> Option<Arc<[f64]>> {
        None
    }
}

/// A [`BlockEffectiveJacobian`] for any block that contributes linearly to
/// exactly one output of a multi-output family.
///
/// `own_output` is the zero-based output index that this block drives.
/// `n_family_outputs` is the total number of outputs (e.g. 2 for location-scale).
/// `design` is the block's effective design matrix (n × p_block).
///
/// The returned Jacobian has shape `(n_family_outputs * n, p_block)`:
/// rows `own_output * n .. (own_output + 1) * n` contain `design`,
/// all other rows are zero.
pub struct AdditiveBlockJacobian {
    /// Effective design of the block, one row per observation.
    pub design: Array2<f64>,
    /// Zero-based channel this block feeds; expected to be
    /// `< n_family_outputs`.
    pub own_output: usize,
    /// Total number of output channels of the family.
    pub n_family_outputs: usize,
}

impl BlockEffectiveJacobian for AdditiveBlockJacobian {
    /// Build the channel-major stacked Jacobian: `design` occupies rows
    /// `own_output * n .. (own_output + 1) * n`, every other row is zero.
    ///
    /// # Errors
    ///
    /// Returns `Err` when `state.beta` contains NaN, or when the rows
    /// reserved for `own_output` do not fit inside the
    /// `n_family_outputs * n` stacked rows (i.e. `own_output` is out of
    /// range) — the latter used to panic inside the slice.
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String> {
        let n = self.design.nrows();
        let p = self.design.ncols();
        // Additive (linear) block: the Jacobian does not depend on
        // `state.beta`; the NaN scan is only a sanity check on caller state
        // (an empty β trivially passes).
        if state.beta.iter().any(|v| v.is_nan()) {
            return Err(
                "AdditiveBlockJacobian::effective_jacobian_at: beta contains NaN".to_string(),
            );
        }
        let total_rows = self.n_family_outputs * n;
        let row_start = self.own_output * n;
        let row_end = row_start + n;
        if row_end > total_rows {
            return Err(format!(
                "AdditiveBlockJacobian::effective_jacobian_at: own_output {} is out of range \
                 for n_family_outputs {}",
                self.own_output, self.n_family_outputs,
            ));
        }
        let mut jac = Array2::<f64>::zeros((total_rows, p));
        jac.slice_mut(ndarray::s![row_start..row_end, ..])
            .assign(&self.design);
        Ok(jac)
    }

    /// Number of stacked channels, i.e. `n_family_outputs`.
    fn n_outputs(&self) -> usize {
        self.n_family_outputs
    }
}

/// A [`BlockEffectiveJacobian`] for a single-output block whose contribution
/// to the linear predictor is `diag(eta_scaling) · design` (row-wise scaling).
///
/// This is the canonical replacement for the former `eta_row_scaling` field on
/// [`ParameterBlockSpec`].  The identifiability audit's skewness-aware bias
/// correction can recover the scaling vector via
/// [`BlockEffectiveJacobian::eta_row_scaling_for_skewness`].
pub struct RowScaledJacobian {
    /// Unscaled design matrix, shared behind an `Arc`.
    pub design: Arc<Array2<f64>>,
    /// Per-row multipliers; the length must equal `design.nrows()`, otherwise
    /// `effective_jacobian_at` returns `Err`.
    pub eta_scaling: Arc<[f64]>,
}

impl BlockEffectiveJacobian for RowScaledJacobian {
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String> {
        let n = self.design.nrows();
        if self.eta_scaling.len() != n {
            return Err(format!(
                "RowScaledJacobian: eta_scaling length {} != design nrows {}",
                self.eta_scaling.len(),
                n,
            ));
        }
        // Row-scaled blocks are β-linear; verify the linearization point
        // contains no NaN when β is provided (sanity check on caller state).
        if !state.beta.is_empty() && state.beta.iter().any(|v| v.is_nan()) {
            return Err(
                "RowScaledJacobian::effective_jacobian_at: state.beta contains NaN".to_string(),
            );
        }
        let mut scaled = self.design.as_ref().clone();
        for i in 0..n {
            let s = self.eta_scaling[i];
            for j in 0..scaled.ncols() {
                scaled[[i, j]] *= s;
            }
        }
        Ok(scaled)
    }

    fn eta_row_scaling_for_skewness(&self) -> Option<Arc<[f64]>> {
        Some(Arc::clone(&self.eta_scaling))
    }
}

/// Static specification for one parameter block in a custom family.
///
/// `design` and `stacked_design` are two structurally distinct operators:
///
/// * `design` is the **canonical, single-channel, n-observation operator**.
///   `design.nrows()` ALWAYS equals `n_obs` (one row per training
///   observation).  This is the matrix the identifiability audit, the
///   shape policy, and every "what shape is this block?" reader inspect.
///   For most blocks `design` is also the eta-producing operator used by
///   the solver — see [`Self::solver_design`].
/// * `stacked_design`, when `Some`, is the **multi-channel eta-producing
///   operator** used by the solver.  Survival time-varying blocks stack
///   `[exit; entry; deriv]` into a `(3·n × p)` operator here so the
///   solver can produce a `3·n`-long `eta` in one mat-vec; the audit
///   never sees this matrix.  When `None`, the solver uses `design` (the
///   single-channel default).
///
/// The single contract that downstream code can rely on:
/// `design.nrows() == n_obs`.  No more dual semantics on `design`.
///
/// Read access:
/// * Audit / canonicalize / "n_obs is the row count" code → `&spec.design`.
/// * Eta-producing solver code → [`Self::solver_design`].
#[derive(Clone)]
pub struct ParameterBlockSpec {
    /// Block name (the first field printed by the `Debug` impl).
    pub name: String,
    /// Canonical single-channel design with `n_obs` rows; see the
    /// struct-level documentation.
    pub design: DesignMatrix,
    /// Offset paired with `design`; [`Self::solver_offset`] returns it when no
    /// `stacked_offset` is set.
    pub offset: Array1<f64>,
    /// Block-local penalty matrices (all p_block x p_block).
    pub penalties: Vec<PenaltyMatrix>,
    /// Structural nullspace dimension of each penalty matrix (same length as `penalties`).
    /// Used by the penalty pseudo-logdet to determine rank without numerical thresholds.
    /// If empty, falls back to eigenvalue-based rank detection.
    pub nullspace_dims: Vec<usize>,
    /// Initial log-smoothing parameters for this block (same length as `penalties`).
    pub initial_log_lambdas: Array1<f64>,
    /// Optional initial coefficients (defaults to zeros if omitted).
    pub initial_beta: Option<Array1<f64>>,
    /// Gauge ownership priority. Higher = more likely to retain a
    /// redundant direction during canonical-gauge reparameterisation.
    /// Defaults to 100. Set higher for blocks that should "own" shared
    /// affine/null-space directions (e.g. baseline time in survival).
    pub gauge_priority: u8,
    /// Full β-dependent Jacobian callback.  When `Some`, this is the
    /// authoritative source for `effective_jacobian_at`.  For simple
    /// single-output row-scaled blocks use [`RowScaledJacobian`].
    pub jacobian_callback: Option<Arc<dyn BlockEffectiveJacobian>>,
    /// Optional multi-channel eta-producing operator used by the solver.
    ///
    /// When `Some`, the solver consumes this matrix (typically
    /// `(k·n × p)` for `k` stacked channels — e.g. survival
    /// `[exit; entry; deriv]` with `k = 3`) to evaluate `eta = stacked · β + stacked_offset`.
    /// The audit and shape policy NEVER read this field; they only ever
    /// inspect `design` (which always has `n_obs` rows).
    ///
    /// When `None`, the solver falls back to `design` — the correct
    /// behavior for every single-channel block (i.e. every block other than
    /// the survival time-varying blocks).
    ///
    /// Read this field via [`Self::solver_design`], never directly.
    ///
    /// Invariant: when `stacked_design = Some(_)`, `stacked_offset` MUST
    /// also be `Some(_)` and its length MUST equal `stacked_design.nrows()`.
    pub stacked_design: Option<DesignMatrix>,
    /// Optional offset paired with [`Self::stacked_design`]. Same Option
    /// state as `stacked_design` (both `Some` or both `None`).
    /// Read via [`Self::solver_offset`].
    pub stacked_offset: Option<Array1<f64>>,
}

/// Manual `Debug`: the Jacobian callback is a trait object without a `Debug`
/// bound, so it is rendered as a placeholder.  Every other field — including
/// the solver-side `stacked_design` / `stacked_offset` pair — is printed.
impl std::fmt::Debug for ParameterBlockSpec {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `Some("<BlockEffectiveJacobian>")` when a callback is installed,
        // `None` otherwise.
        let callback_marker = self
            .jacobian_callback
            .as_ref()
            .map(|_| "<BlockEffectiveJacobian>");
        f.debug_struct("ParameterBlockSpec")
            .field("name", &self.name)
            .field("design", &self.design)
            .field("offset", &self.offset)
            .field("penalties", &self.penalties)
            .field("nullspace_dims", &self.nullspace_dims)
            .field("initial_log_lambdas", &self.initial_log_lambdas)
            .field("initial_beta", &self.initial_beta)
            .field("gauge_priority", &self.gauge_priority)
            .field("jacobian_callback", &callback_marker)
            .field("stacked_design", &self.stacked_design)
            .field("stacked_offset", &self.stacked_offset)
            .finish()
    }
}

impl ParameterBlockSpec {
    /// Builds a spec whose every field holds a neutral default: empty name,
    /// a dense `0 × 0` design, empty offset / penalties / log-lambdas, no
    /// initial beta, gauge priority `100`, and no callback or stacked
    /// operator.
    ///
    /// Intended for struct-update syntax
    /// (`..ParameterBlockSpec::defaults()`), so that literals written before
    /// a field was added keep compiling.
    pub fn defaults() -> Self {
        let empty_design = DesignMatrix::Dense(crate::linalg::matrix::DenseDesignMatrix::from(
            ndarray::Array2::<f64>::zeros((0, 0)),
        ));
        Self {
            name: String::new(),
            design: empty_design,
            offset: ndarray::Array1::<f64>::zeros(0),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: ndarray::Array1::<f64>::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    }

    /// The operator the solver uses to form `eta = D · β`.
    ///
    /// Yields `stacked_design` when it is set (the multi-channel operator,
    /// e.g. `(3n × p)` for survival time-varying blocks) and `design`
    /// otherwise.
    ///
    /// Solver code MUST go through this accessor. Reading `self.design`
    /// directly silently breaks multi-channel (survival LS time-varying)
    /// blocks, because `self.design.nrows()` is always `n_obs` and never
    /// `3·n_obs`.
    pub fn solver_design(&self) -> &DesignMatrix {
        match &self.stacked_design {
            Some(stacked) => stacked,
            None => &self.design,
        }
    }

    /// The offset that pairs with [`Self::solver_design`]: `stacked_offset`
    /// when it is set, `offset` otherwise.
    pub fn solver_offset(&self) -> &Array1<f64> {
        match &self.stacked_offset {
            Some(stacked) => stacked,
            None => &self.offset,
        }
    }

    /// Effective design `D_eff` of this block evaluated at β = 0 with no
    /// family scalars, no channel Hessian, and a probit frailty scale of
    /// `1.0`.
    ///
    /// This is a convenience wrapper around [`Self::effective_jacobian_at`]
    /// for the single-output (n_outputs = 1) case. Callers needing
    /// multi-output Jacobians or β-dependent scalars should call
    /// `effective_jacobian_at` with their own state.
    ///
    /// Returns `Err` if the design cannot be densified.
    pub fn effective_design(&self, caller: &str) -> Result<ndarray::Array2<f64>, String> {
        // One zero per column of `design`.
        let zero_beta = vec![0.0f64; self.design.ncols()];
        self.effective_jacobian_at(
            caller,
            &FamilyLinearizationState {
                beta: &zero_beta,
                family_scalars: None,
                channel_hessian: None,
                probit_frailty_scale: 1.0,
            },
        )
    }

    /// β-dependent stacked Jacobian `J(β)` of this block.
    ///
    /// Shape: `(n_rows * n_outputs, p_block)`. For most blocks
    /// `n_outputs = 1`, giving the usual `(n_rows, p_block)` effective
    /// design.
    ///
    /// Dispatch:
    ///   * with a `jacobian_callback`, the result is whatever
    ///     `cb.effective_jacobian_at(state)` returns;
    ///   * without one, the result is a dense copy of `design`, and `state`
    ///     is not consulted.
    ///
    /// Returns `Err` if the design cannot be densified; `caller` and the
    /// block name are passed along as the densification context.
    pub fn effective_jacobian_at(
        &self,
        caller: &str,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<ndarray::Array2<f64>, String> {
        match self.jacobian_callback.as_ref() {
            Some(callback) => callback.effective_jacobian_at(state),
            None => {
                let context = format!(
                    "{caller}::effective_jacobian_at block '{}'",
                    self.name
                );
                self.design
                    .try_to_dense_arc(&context)
                    .map(|dense| dense.as_ref().clone())
            }
        }
    }
}

/// Identifies the parameter block a coefficient lives in, either by the
/// block's name or by its position in the slice of block specs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CoefficientBlockSelector {
    /// Selects the first block whose `name` equals this string.
    Name(String),
    /// Selects the block at this zero-based position in the spec slice.
    Index(usize),
}

/// Addresses a single coefficient as "column `column` of block `block`".
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CoefficientLabel {
    /// Which parameter block the coefficient belongs to.
    pub block: CoefficientBlockSelector,
    /// Zero-based column index within that block's design.
    pub column: usize,
}

impl CoefficientLabel {
    /// Creates a label that selects its block by name.
    ///
    /// `block` is converted into an owned `String` and wrapped in
    /// [`CoefficientBlockSelector::Name`]; `column` is stored unchanged.
    pub fn by_block_name(block: impl Into<String>, column: usize) -> Self {
        let block = CoefficientBlockSelector::Name(block.into());
        Self { block, column }
    }
}

/// Free-function shorthand for [`CoefficientLabel::by_block_name`]: labels
/// column `column` of the block named `block`.
pub fn coefficient_label(block: impl Into<String>, column: usize) -> CoefficientLabel {
    let block_name: String = block.into();
    CoefficientLabel::by_block_name(block_name, column)
}

/// Prior that a coefficient group may declare for its precision coordinate.
///
/// Each variant converts one-to-one into a [`crate::types::RhoPrior`] via
/// [`CoefficientGroupPrior::to_rho_prior`].
#[derive(Debug, Clone, PartialEq)]
pub enum CoefficientGroupPrior {
    /// No prior contribution; converts to `RhoPrior::Flat`.
    Flat,
    /// Normal prior on the log precision; converts to `RhoPrior::Normal`.
    /// Validation requires a finite `mean` and a finite `sd > 0`.
    NormalLogPrecision {
        mean: f64,
        sd: f64,
    },
    /// Gamma prior on the precision; converts to `RhoPrior::GammaPrecision`.
    /// Validation requires a finite `shape > 0` and a finite `rate >= 0`.
    GammaPrecision {
        shape: f64,
        rate: f64,
    },
    /// Penalized-complexity prior calibrated by `P(exp(-ρ/2) > upper) =
    /// tail_prob`; see [`crate::types::RhoPrior::PenalizedComplexity`].
    /// Validation requires a finite `upper > 0` and `tail_prob` in `(0, 1)`.
    PenalizedComplexity {
        upper: f64,
        tail_prob: f64,
    },
}

impl CoefficientGroupPrior {
    /// Converts this group prior into the matching scalar
    /// [`crate::types::RhoPrior`] variant, copying the hyperparameters
    /// unchanged.
    pub fn to_rho_prior(&self) -> crate::types::RhoPrior {
        match self {
            Self::Flat => crate::types::RhoPrior::Flat,
            Self::NormalLogPrecision { mean, sd } => crate::types::RhoPrior::Normal {
                mean: *mean,
                sd: *sd,
            },
            Self::GammaPrecision { shape, rate } => crate::types::RhoPrior::GammaPrecision {
                shape: *shape,
                rate: *rate,
            },
            Self::PenalizedComplexity { upper, tail_prob } => {
                crate::types::RhoPrior::PenalizedComplexity {
                    upper: *upper,
                    tail_prob: *tail_prob,
                }
            }
        }
    }

    /// Checks the hyperparameters of this prior.
    ///
    /// `context` prefixes every error message. Requirements per variant:
    /// * `Flat` — always valid;
    /// * `NormalLogPrecision` — finite `mean`, finite `sd > 0`;
    /// * `GammaPrecision` — finite `shape > 0`, finite `rate >= 0`;
    /// * `PenalizedComplexity` — delegated to
    ///   `validate_penalized_complexity_prior`.
    ///
    /// The first violated requirement is reported as `Err(String)`.
    fn validate(&self, context: &str) -> Result<(), String> {
        match self {
            Self::Flat => Ok(()),
            Self::NormalLogPrecision { mean, sd } => {
                if !mean.is_finite() {
                    return Err(format!(
                        "{context} Normal log-precision prior requires finite mean, got {mean}"
                    ));
                }
                let sd_ok = sd.is_finite() && *sd > 0.0;
                if !sd_ok {
                    return Err(format!(
                        "{context} Normal log-precision prior requires sd > 0, got {sd}"
                    ));
                }
                Ok(())
            }
            Self::GammaPrecision { shape, rate } => {
                let shape_ok = shape.is_finite() && *shape > 0.0;
                if !shape_ok {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "{context} Gamma precision prior requires shape > 0, got {shape}"
                        ),
                    }
                    .into());
                }
                let rate_ok = rate.is_finite() && *rate >= 0.0;
                if !rate_ok {
                    return Err(format!(
                        "{context} Gamma precision prior requires rate >= 0, got {rate}"
                    ));
                }
                Ok(())
            }
            Self::PenalizedComplexity { upper, tail_prob } => {
                validate_penalized_complexity_prior(context, *upper, *tail_prob)
            }
        }
    }
}

/// User-facing declaration of a coefficient group for a custom family.
///
/// Consumed by `realize_coefficient_groups_for_custom_family`, which resolves
/// the labels against the parameter blocks and emits the group's penalties.
#[derive(Debug, Clone, PartialEq)]
pub struct CoefficientGroupSpec {
    /// Label of the group; also used as the precision label on the penalties
    /// realized for it.
    pub label: String,
    /// Coefficients that belong to the group.
    pub coefficients: Vec<CoefficientLabel>,
    /// Label of the parent group, if this group is nested under another one.
    pub parent: Option<String>,
    /// Prior on the group's precision. When `None`, realization falls back to
    /// the base prior, and fails if that base prior is `Independent`.
    pub prior: Option<CoefficientGroupPrior>,
    /// Starting value for the group's log precision. Must be finite when
    /// given; realization uses `0.0` when `None`.
    pub initial_log_precision: Option<f64>,
}

impl CoefficientGroupSpec {
    /// Creates a top-level group with the given label and coefficients and
    /// with no parent, no prior, and no initial log precision.
    pub fn new(label: impl Into<String>, coefficients: Vec<CoefficientLabel>) -> Self {
        let label = label.into();
        Self {
            label,
            coefficients,
            parent: None,
            prior: None,
            initial_log_precision: None,
        }
    }

    /// Builder step: returns the spec with `parent` set to the given label.
    pub fn with_parent(self, parent: impl Into<String>) -> Self {
        Self {
            parent: Some(parent.into()),
            ..self
        }
    }

    /// Builder step: returns the spec with `prior` set to the given prior.
    pub fn with_prior(self, prior: CoefficientGroupPrior) -> Self {
        Self {
            prior: Some(prior),
            ..self
        }
    }
}

/// A coefficient group after its labels have been resolved against the
/// parameter blocks.
#[derive(Debug, Clone, PartialEq)]
pub struct RealizedCoefficientGroup {
    /// Label copied from the declaring `CoefficientGroupSpec`.
    pub label: String,
    /// Parent label copied from the declaring spec, if any.
    pub parent: Option<String>,
    /// Resolved `(block_idx, column)` coordinates of the group's coefficients.
    pub coefficients: Vec<(usize, usize)>,
    /// Prior copied from the declaring spec, if any.
    pub prior: Option<CoefficientGroupPrior>,
    /// Initial log precision; `0.0` when the declaring spec left it unset.
    pub initial_log_precision: f64,
}

/// Output of `realize_coefficient_groups_for_custom_family`: the block specs
/// with group penalties appended, plus the labels and priors describing the
/// resulting penalty coordinates.
#[derive(Debug, Clone)]
pub struct RealizedCoefficientGroupSpecs {
    /// Copies of the input block specs with one extra penalty, nullspace
    /// dimension, and initial log-lambda appended per realized group piece.
    pub specs: Vec<ParameterBlockSpec>,
    /// The resolved groups, in the order they were declared.
    pub groups: Vec<RealizedCoefficientGroup>,
    /// One entry per realized penalty in flattened block order. Built-in
    /// penalties receive unique internal labels; user groups carry their
    /// declared labels. Consumers that optimize one coordinate per label can
    /// use this to tie cross-block penalty pieces to a shared precision.
    pub penalty_labels: Vec<String>,
    /// Per-coordinate priors in `outer_labels` order. Always built as
    /// `RhoPrior::Independent`: base priors for the built-in penalties first,
    /// followed by one prior per group.
    pub rho_prior: crate::types::RhoPrior,
    /// One label per outer coordinate: the internal labels of the built-in
    /// penalties first, followed by one label per group.
    pub outer_labels: Vec<String>,
}

/// Resolves a block selector to a position in `specs`.
///
/// * `Index(i)` succeeds when `i < specs.len()`.
/// * `Name(n)` succeeds with the position of the first spec whose `name`
///   equals `n`.
///
/// Otherwise returns an `Err` describing the out-of-range index or the
/// unknown block name.
fn coefficient_group_block_index(
    specs: &[ParameterBlockSpec],
    selector: &CoefficientBlockSelector,
) -> Result<usize, String> {
    match selector {
        CoefficientBlockSelector::Name(name) => {
            let found = specs.iter().position(|spec| spec.name == *name);
            found.ok_or_else(|| format!("coefficient group references unknown block '{name}'"))
        }
        CoefficientBlockSelector::Index(index) if *index < specs.len() => Ok(*index),
        CoefficientBlockSelector::Index(index) => Err(format!(
            "coefficient group references block index {index}, but only {} blocks exist",
            specs.len()
        )),
    }
}

/// Validates one scalar coordinate of a rho prior.
///
/// `context` prefixes every error message. A nested `Independent` prior is
/// rejected, because a coordinate must be a scalar prior. The other variants
/// are checked for well-formed hyperparameters: finite `mean` and `sd > 0`
/// for `Normal`, `shape > 0` and `rate >= 0` for `GammaPrecision`, and the
/// shared penalized-complexity check for `PenalizedComplexity`.
fn validate_group_rho_prior_coordinate(
    prior: &crate::types::RhoPrior,
    context: &str,
) -> Result<(), String> {
    match prior {
        crate::types::RhoPrior::Independent(_) => Err(CustomFamilyError::ConstraintViolation {
            reason: format!("{context} must be a scalar rho prior, not a nested Independent prior"),
        }
        .into()),
        crate::types::RhoPrior::Flat => Ok(()),
        crate::types::RhoPrior::PenalizedComplexity { upper, tail_prob } => {
            validate_penalized_complexity_prior(context, *upper, *tail_prob)
        }
        crate::types::RhoPrior::Normal { mean, sd } => {
            if !mean.is_finite() {
                return Err(format!(
                    "{context} Normal log-precision prior requires finite mean, got {mean}"
                ));
            }
            let sd_ok = sd.is_finite() && *sd > 0.0;
            if !sd_ok {
                return Err(format!(
                    "{context} Normal log-precision prior requires sd > 0, got {sd}"
                ));
            }
            Ok(())
        }
        crate::types::RhoPrior::GammaPrecision { shape, rate } => {
            let shape_ok = shape.is_finite() && *shape > 0.0;
            if !shape_ok {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "{context} Gamma precision prior requires shape > 0, got {shape}"
                    ),
                }
                .into());
            }
            let rate_ok = rate.is_finite() && *rate >= 0.0;
            if !rate_ok {
                return Err(format!(
                    "{context} Gamma precision prior requires rate >= 0, got {rate}"
                ));
            }
            Ok(())
        }
    }
}

/// Validates the hyperparameters of a penalized-complexity prior.
///
/// `upper` must be finite and strictly positive; `tail_prob` must lie in the
/// open interval `(0, 1)`. `upper` is checked first, so when both are invalid
/// the error names `upper`. `context` prefixes the error message.
fn validate_penalized_complexity_prior(
    context: &str,
    upper: f64,
    tail_prob: f64,
) -> Result<(), String> {
    let upper_ok = upper.is_finite() && upper > 0.0;
    if !upper_ok {
        return Err(format!(
            "{context} penalized-complexity prior requires upper > 0, got {upper}"
        ));
    }

    let tail_prob_ok = tail_prob.is_finite() && tail_prob > 0.0 && tail_prob < 1.0;
    if tail_prob_ok {
        Ok(())
    } else {
        Err(format!(
            "{context} penalized-complexity prior requires tail probability in (0, 1), got {tail_prob}"
        ))
    }
}

/// Expands the base rho prior into one scalar prior per built-in penalty.
///
/// * An `Independent` base prior must hold exactly `base_count` entries; each
///   entry is validated as a scalar coordinate and the list is returned as a
///   copy.
/// * Any other base prior is validated once and repeated `base_count` times.
///
/// `context` prefixes every error message.
fn expand_custom_group_base_prior(
    base_prior: &crate::types::RhoPrior,
    base_count: usize,
    context: &str,
) -> Result<Vec<crate::types::RhoPrior>, String> {
    let crate::types::RhoPrior::Independent(priors) = base_prior else {
        // Scalar base prior: one shared prior for every built-in penalty.
        validate_group_rho_prior_coordinate(base_prior, context)?;
        return Ok(vec![base_prior.clone(); base_count]);
    };

    if priors.len() != base_count {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "{context} base Independent rho prior length mismatch: got {}, expected {base_count}",
            priors.len()
        ) }.into());
    }
    priors.iter().enumerate().try_for_each(|(idx, prior)| {
        validate_group_rho_prior_coordinate(prior, &format!("{context} base prior {idx}"))
    })?;
    Ok(priors.clone())
}

/// Realizes user-declared coefficient groups as additional penalties on the
/// parameter blocks of a custom family.
///
/// Steps, in order:
/// 1. `specs` are validated with `validate_blockspecs`, and each group's
///    prior and `initial_log_precision` are validated.
/// 2. Every coefficient label is resolved to a `(block_idx, column)`
///    coordinate; the column must be smaller than the block's
///    `design.ncols()`. The resolved groups are handed to
///    `ResolvedGroupHierarchy::build`.
/// 3. `base_prior` is expanded to one prior per built-in penalty, and each
///    built-in penalty receives the label `__block_{i}_penalty_{j}`.
/// 4. For every group, one prior is appended (its own, or the scalar base
///    prior), and for every penalty component of the group and every block it
///    touches, a `p × p` matrix with ones on the diagonal at the component's
///    columns is appended to that block's penalties, tagged with the group
///    label. The matching nullspace dimension (`p` minus the number of
///    columns, saturating) and an initial log-lambda equal to the group's
///    `initial_log_precision` are appended alongside.
///
/// # Errors
///
/// Returns `Err(String)` when any validation above fails, when a label
/// references an unknown block or an out-of-range column, when the hierarchy
/// cannot be built, or when a group declares no prior while `base_prior` is
/// `Independent`.
///
/// # Panics
///
/// Contains an internal `assert_eq!` that the number of built-in labels
/// equals the number of built-in penalties; both are counted from the same
/// `specs`.
pub fn realize_coefficient_groups_for_custom_family(
    specs: &[ParameterBlockSpec],
    groups: &[CoefficientGroupSpec],
    base_prior: crate::types::RhoPrior,
) -> Result<RealizedCoefficientGroupSpecs, String> {
    use crate::terms::coefficient_group_resolver::{ResolvedGroup, ResolvedGroupHierarchy};

    validate_blockspecs(specs)?;
    // Carrier-specific validation. The prior and the custom-only
    // `initial_log_precision` field are validated here because they have no
    // analogue on the standard-term carrier; label, duplicate, empty-set, and
    // hierarchy checks are delegated to the shared resolver below.
    for group in groups {
        if let Some(prior) = group.prior.as_ref() {
            prior.validate(&format!("coefficient group '{}'", group.label))?;
        }
        if let Some(initial) = group.initial_log_precision
            && !initial.is_finite()
        {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "coefficient group '{}' initial log precision must be finite, got {initial}",
                    group.label
                ),
            }
            .into());
        }
    }

    // Carrier = `(block_idx, column)` coordinates of the parameter blocks.
    // Resolve every declared label into its coordinate set, then hand the
    // carrier-agnostic policy (labels, hierarchy, subsets, child unions) to the
    // shared resolver.
    let resolved_groups = groups
        .iter()
        .map(|group| {
            // A set, so a coordinate listed twice in one group is kept once.
            let mut coordinates = BTreeSet::<(usize, usize)>::new();
            for label in &group.coefficients {
                let block_idx = coefficient_group_block_index(specs, &label.block)?;
                let p = specs[block_idx].design.ncols();
                if label.column >= p {
                    return Err(format!(
                        "coefficient group '{}' references column {} in block '{}' (index {block_idx}), but the block has {p} columns",
                        group.label, label.column, specs[block_idx].name
                    ));
                }
                coordinates.insert((block_idx, label.column));
            }
            Ok(ResolvedGroup {
                label: group.label.clone(),
                parent: group.parent.clone(),
                coordinates,
            })
        })
        .collect::<Result<Vec<_>, String>>()?;
    let hierarchy = ResolvedGroupHierarchy::build(resolved_groups)?;

    // Pair each declared group with its resolved counterpart; an unset
    // initial log precision becomes 0.0.
    let realized_groups = groups
        .iter()
        .zip(hierarchy.groups())
        .map(|(group, resolved)| RealizedCoefficientGroup {
            label: group.label.clone(),
            parent: group.parent.clone(),
            coefficients: resolved.coordinates.iter().copied().collect(),
            prior: group.prior.clone(),
            initial_log_precision: group.initial_log_precision.unwrap_or(0.0),
        })
        .collect::<Vec<_>>();

    let mut realized_specs = specs.to_vec();
    let mut penalty_labels = Vec::<String>::new();
    let mut outer_labels = Vec::<String>::new();
    // Number of built-in penalties across all blocks, before group penalties
    // are appended.
    let base_count = specs.iter().map(|spec| spec.penalties.len()).sum::<usize>();
    let mut priors = expand_custom_group_base_prior(&base_prior, base_count, "coefficient groups")?;
    let mut base_prior_idx = 0usize;

    // Built-in penalties come first in both label lists, each with a unique
    // internal label.
    for (block_idx, spec) in specs.iter().enumerate() {
        for penalty_idx in 0..spec.penalties.len() {
            let label = format!("__block_{block_idx}_penalty_{penalty_idx}");
            penalty_labels.push(label.clone());
            outer_labels.push(label);
            base_prior_idx += 1;
        }
    }
    assert_eq!(base_prior_idx, base_count);

    for group in &realized_groups {
        outer_labels.push(group.label.clone());
        // A group's own prior wins; otherwise the scalar base prior is
        // reused. An `Independent` base prior has no entry for groups.
        let group_prior = match group.prior.as_ref() {
            Some(prior) => prior.to_rho_prior(),
            None => match &base_prior {
                crate::types::RhoPrior::Independent(_) => {
                    return Err(CustomFamilyError::ConstraintViolation { reason: format!(
                        "coefficient group '{}' must declare a prior when base_prior is Independent",
                        group.label
                    ) }.into());
                }
                prior => prior.clone(),
            },
        };
        priors.push(group_prior);

        // Hierarchical Gamma precision update.
        //
        // For one Gaussian coefficient group with fixed beta and precision
        // lambda,
        //
        //   p(beta_g | lambda) p(lambda)
        //     ∝ lambda^{|g|/2} exp[-lambda q_g/2]
        //       lambda^{a_g-1} exp[-b_g lambda],
        //   q_g = (beta_g - mu_g)' S_g (beta_g - mu_g).
        //
        // Maximizing the log posterior in lambda gives
        //
        //   lambda* = (a_g + |g|/2 - 1) / (b_g + q_g/2).
        //
        // If a node has children, beta_g is the concatenation of the child
        // coefficient vectors.  The parent density is therefore the product
        // of those child Gaussian factors under one lambda_g: replace |g| and
        // q_g by sums over the child components, expanding recursively when a
        // child is itself an interior node.  We preserve that identity by
        // emitting one physical penalty piece per concatenated child component
        // and tying those pieces with the parent's precision label.  This is
        // not a block-sum shortcut: overlapping children remain separate
        // factors, so their log normalizers and quadratic contributions both
        // add.
        let penalty_components = hierarchy.concatenated_penalty_components(&group.label);
        for component in penalty_components {
            // Split the component's coordinates by block; each block gets its
            // own penalty piece.
            let mut by_block = BTreeMap::<usize, Vec<usize>>::new();
            for &(block_idx, column) in &component {
                by_block.entry(block_idx).or_default().push(column);
            }
            for (block_idx, columns) in by_block {
                let p = realized_specs[block_idx].design.ncols();
                // Diagonal 0/1 penalty selecting this piece's columns.
                let mut matrix = Array2::<f64>::zeros((p, p));
                for column in &columns {
                    matrix[[*column, *column]] = 1.0;
                }
                realized_specs[block_idx]
                    .penalties
                    .push(PenaltyMatrix::Dense(matrix).with_precision_label(group.label.clone()));
                realized_specs[block_idx]
                    .nullspace_dims
                    .push(p.saturating_sub(columns.len()));
                // Grow the block's initial log-lambdas by one slot, keeping
                // the existing values and appending the group's start value.
                let mut rho =
                    Array1::<f64>::zeros(realized_specs[block_idx].initial_log_lambdas.len() + 1);
                if !realized_specs[block_idx].initial_log_lambdas.is_empty() {
                    let old_len = realized_specs[block_idx].initial_log_lambdas.len();
                    rho.slice_mut(s![..old_len])
                        .assign(&realized_specs[block_idx].initial_log_lambdas);
                }
                let last = rho.len() - 1;
                rho[last] = group.initial_log_precision;
                realized_specs[block_idx].initial_log_lambdas = rho;
                penalty_labels.push(group.label.clone());
            }
        }
    }

    Ok(RealizedCoefficientGroupSpecs {
        specs: realized_specs,
        groups: realized_groups,
        penalty_labels,
        rho_prior: crate::types::RhoPrior::Independent(priors),
        outer_labels,
    })
}

/// Infers the role of a custom-family block from its name and position.
///
/// A model with a single block always yields `Mean`. Otherwise the name is
/// trimmed and lower-cased (ASCII) and matched against the known role names;
/// any name starting with `time_cause_` counts as `Time`. Unrecognized names
/// fall back to `Location` for the first block (`index == 0`) and `Scale`
/// for every later block.
fn custom_family_block_role(
    name: &str,
    index: usize,
    n_blocks: usize,
) -> crate::solver::estimate::BlockRole {
    use crate::solver::estimate::BlockRole;

    // A lone block is the mean regardless of what it is called.
    if n_blocks == 1 {
        return BlockRole::Mean;
    }

    let normalized = name.trim().to_ascii_lowercase();
    match normalized.as_str() {
        "eta" | "mean" | "beta" => BlockRole::Mean,
        "mu" | "location" | "marginal_surface" => BlockRole::Location,
        "threshold" => BlockRole::Threshold,
        "log_sigma" | "scale" | "logslope_surface" => BlockRole::Scale,
        "time" | "time_transform" | "time_surface" => BlockRole::Time,
        "wiggle" | "linkwiggle" => BlockRole::LinkWiggle,
        other if other.starts_with("time_cause_") => BlockRole::Time,
        // Positional fallback for names that are not recognized.
        _ => {
            if index == 0 {
                BlockRole::Location
            } else {
                BlockRole::Scale
            }
        }
    }
}

/// Current state for a parameter block.
///
/// `assert_states_match_specs` checks both lengths against the matching
/// `ParameterBlockSpec`.
#[derive(Clone, Debug)]
pub struct ParameterBlockState {
    /// Block coefficients; expected length is the spec's `design.ncols()`.
    pub beta: Array1<f64>,
    /// Block linear predictor; expected length is the spec's
    /// `solver_design().nrows()`.
    pub eta: Array1<f64>,
}

/// Directional derivative of one block's geometry (design and offset) along
/// a coefficient-space direction.
#[derive(Clone)]
pub struct BlockGeometryDirectionalDerivative {
    /// Directional derivative of the block design matrix along a coefficient-space direction.
    /// NOTE(review): `None` presumably means the design does not vary along
    /// the direction — confirm against the producers of this struct.
    pub d_design: Option<Array2<f64>>,
    /// Directional derivative of the block offset along the same direction.
    pub d_offset: Array1<f64>,
}

/// Working quantities supplied by a custom family for one block.
///
/// # Observed vs expected information (see response.md Section 3)
///
/// For the outer REML/LAML criterion, the Hessian used in log|H| and trace terms
/// must be the **observed** (actual) Hessian at the mode, not the expected Fisher.
///
/// - `ExactNewton`: provides -nabla^2 log L directly, which is the observed Hessian
///   by construction. This is always correct.
///
/// - `Diagonal`: provides IRLS working weights W such that the per-block Hessian
///   is X'WX. For canonical links (logit-Binomial, log-Poisson), W_obs = W_Fisher.
///   For supported non-canonical diagonal links, W must be the observed weight
///   W_obs = W_Fisher - (y-mu)*B so the outer REML uses the exact Laplace
///   Hessian. The matching [`CustomFamily::diagonalworking_weights_directional_derivative`]
///   callback must differentiate the same observed W surface; silently using Fisher
///   weights or zero `dW` would change the criterion into a PQL-type surrogate.
#[derive(Clone, Debug)]
pub enum BlockWorkingSet {
    /// Standard IRLS/GLM-style diagonal working set for eta-space updates.
    ///
    /// `working_response` and `working_weights` are expected to have the same
    /// length; [`BlockWorkingSet::diagonal_checked`] enforces this, while the
    /// struct-literal form does not.
    Diagonal {
        /// IRLS pseudo-response for this block's linear predictor.
        working_response: Array1<f64>,
        /// IRLS working weights for this block (non-negative, length n).
        ///
        /// For the inner solver, Fisher or observed weights both find the same mode.
        /// For the outer REML/LAML log|H| term, observed weights are the correct
        /// Laplace choice (see response.md Section 3). Canonical-link families need
        /// no correction since observed = Fisher.
        working_weights: Array1<f64>,
    },
    /// Exact Newton block update in coefficient space.
    ///
    /// `gradient` is nabla log L wrt block coefficients.
    /// `hessian` is -nabla^2 log L wrt block coefficients (positive semidefinite near optimum).
    ///
    /// This is the observed Hessian by construction (actual second derivative of the
    /// log-likelihood), which is the correct quantity for the outer REML Laplace
    /// approximation.
    ExactNewton {
        /// Gradient of the log-likelihood with respect to the block coefficients.
        gradient: Array1<f64>,
        /// Negative Hessian of the log-likelihood with respect to the block
        /// coefficients.
        hessian: SymmetricMatrix,
    },
}

impl BlockWorkingSet {
    /// Builds a `Diagonal` working set after checking that
    /// `working_response` and `working_weights` have the same length.
    ///
    /// New code paths that produce a diagonal IRLS block should use this
    /// constructor; the struct-literal form remains available for existing
    /// call sites pending a full migration.
    ///
    /// # Errors
    ///
    /// Returns an `Err` reporting both lengths when they differ.
    #[inline]
    pub fn diagonal_checked(
        working_response: Array1<f64>,
        working_weights: Array1<f64>,
    ) -> Result<Self, String> {
        let n_response = working_response.len();
        let n_weights = working_weights.len();
        if n_response == n_weights {
            return Ok(Self::Diagonal {
                working_response,
                working_weights,
            });
        }
        Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "BlockWorkingSet::Diagonal length mismatch: working_response={}, working_weights={}",
            n_response,
            n_weights,
        ) }.into())
    }
}

/// Selects which outer objective is used together with exact-Newton blocks.
///
/// NOTE(review): the precise meaning of each variant is defined by the
/// consumers of this enum, which are outside this part of the file — confirm
/// there before relying on the names alone.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExactNewtonOuterObjective {
    RidgedQuadraticReml,
    StrictPseudoLaplace,
}

/// Highest exact outer derivative order a family wants to expose at the
/// current realized problem scale.
///
/// The derived ordering follows declaration order:
/// `Zeroth < First < Second`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum ExactOuterDerivativeOrder {
    /// Neither gradient nor Hessian (`has_gradient()` and `has_hessian()`
    /// are both `false`).
    Zeroth,
    /// Gradient but no Hessian.
    First,
    /// Both gradient and Hessian.
    Second,
}

impl ExactOuterDerivativeOrder {
    /// `true` for every order that includes an exact gradient, i.e. `First`
    /// and `Second`.
    pub const fn has_gradient(self) -> bool {
        match self {
            Self::Zeroth => false,
            Self::First | Self::Second => true,
        }
    }

    /// `true` only for `Second`, the one order that includes an exact
    /// Hessian.
    pub const fn has_hessian(self) -> bool {
        match self {
            Self::Second => true,
            Self::Zeroth | Self::First => false,
        }
    }
}

/// Precondition check for the family capability / operator hooks (e.g.
/// `batched_outer_hessian_terms`, `outer_hyper_hessian_operator`).
///
/// These hooks operate on whatever block geometry the caller has assembled and
/// must validate the *consistency* of the specs they are handed — never the
/// fit-level "at least one block" precondition, which belongs to the fit entry
/// points (`validate_blockspecs`). An empty, self-consistent argument set is a
/// valid no-op probe of the operator path (the operator may ignore the specs
/// entirely), so it must not panic here.
///
/// # Panics
///
/// Panics with `"{context}: inconsistent parameter block specs"` when
/// `validate_blockspec_consistency` rejects `specs`.
///
/// # Background: exact outer derivative order
///
/// This check guards the helpers that declare the exact outer derivative
/// order for families that expose second-order coefficient geometry. That
/// order used to be a cost gate that demoted large-scale problems to
/// first-order BFGS. That was a policy leak into the math layer: if the family
/// supplies analytic dense Hessian blocks or an analytic profiled-Hessian HVP,
/// the outer optimizer should see the exact second-order objective. Runtime
/// representation choices (dense vs operator) belong below this declaration,
/// not in a first-order downgrade.
fn assert_valid_blockspecs(specs: &[ParameterBlockSpec], context: &str) {
    let consistent = validate_blockspec_consistency(specs).is_ok();
    assert!(consistent, "{context}: inconsistent parameter block specs");
}

/// Asserts that the numeric fit options are usable.
///
/// `inner_tol`, `outer_tol`, `minweight`, and `ridge_floor` must each be
/// finite and non-negative; they are checked in that order.
/// `early_exit_threshold`, when set, must be finite.
///
/// # Panics
///
/// Panics on the first violated requirement, with a message prefixed by
/// `context` and naming the offending field.
fn assert_valid_options(options: &BlockwiseFitOptions, context: &str) {
    let non_negative_fields = [
        (options.inner_tol, "inner_tol"),
        (options.outer_tol, "outer_tol"),
        (options.minweight, "minweight"),
        (options.ridge_floor, "ridge_floor"),
    ];
    for (value, field) in non_negative_fields {
        assert!(
            value.is_finite() && value >= 0.0,
            "{context}: {field} must be finite and non-negative"
        );
    }

    if let Some(threshold) = options.early_exit_threshold {
        assert!(
            threshold.is_finite(),
            "{context}: early_exit_threshold must be finite"
        );
    }
}

/// Asserts that the block states line up with the block specs.
///
/// There must be one state per spec, and for every block `beta` must have
/// one entry per column of `design` while `eta` must have one entry per row
/// of `solver_design()`.
///
/// # Panics
///
/// Panics on the first mismatch, with a message prefixed by `context`.
fn assert_states_match_specs(
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    context: &str,
) {
    let (n_states, n_specs) = (states.len(), specs.len());
    assert_eq!(
        n_states, n_specs,
        "{context}: state/spec block count mismatch"
    );

    let paired = states.iter().zip(specs.iter());
    for (block, (state, spec)) in paired.enumerate() {
        let n_coefficients = spec.design.ncols();
        assert_eq!(
            state.beta.len(),
            n_coefficients,
            "{context}: beta length mismatch in block {block}"
        );

        // `state.eta` is produced from `solver_design()` (see
        // `refresh_all_block_etas`): that is `stacked_design` when set
        // (3·n_obs rows for survival LS time-varying blocks) and `design`
        // (n_obs rows) otherwise, so the same accessor is used here.
        let n_eta_rows = spec.solver_design().nrows();
        assert_eq!(
            state.eta.len(),
            n_eta_rows,
            "{context}: eta length mismatch in block {block}"
        );
    }
}

/// Asserts that there is exactly one list of psi derivatives per block spec.
///
/// Only the outer lengths are compared; the inner lists are not inspected.
///
/// # Panics
///
/// Panics with a message prefixed by `context` when the counts differ.
fn assert_derivative_blocks_match_specs(
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    specs: &[ParameterBlockSpec],
    context: &str,
) {
    let n_derivative_blocks = derivative_blocks.len();
    let n_spec_blocks = specs.len();
    assert_eq!(
        n_derivative_blocks, n_spec_blocks,
        "{context}: derivative/spec block count mismatch"
    );
}

/// Asserts that `rho` has one entry per penalty, counted across all blocks.
///
/// # Panics
///
/// Panics with a message prefixed by `context` when `rho.len()` differs from
/// the total number of penalties in `specs`.
fn assert_rho_matches_specs(rho: &Array1<f64>, specs: &[ParameterBlockSpec], context: &str) {
    let penalty_count = specs
        .iter()
        .fold(0usize, |total, spec| total + spec.penalties.len());
    assert_eq!(
        rho.len(),
        penalty_count,
        "{context}: rho length does not match penalty count"
    );
}

/// Warms the outer caches of the Hessian workspace, if there is one.
///
/// Without a workspace this is a no-op that returns `Ok(())`.
///
/// # Errors
///
/// When `warm_up_outer_caches` fails, its error is wrapped in a message
/// prefixed by `context`.
fn validate_hessian_workspace_ready(
    hessian_workspace: &Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    context: &str,
) -> Result<(), String> {
    let Some(workspace) = hessian_workspace.as_ref() else {
        return Ok(());
    };
    match workspace.warm_up_outer_caches() {
        Ok(_) => Ok(()),
        Err(err) => Err(format!(
            "{context}: failed to warm Hessian workspace caches: {err}"
        )),
    }
}

/// Exact outer derivative order for families that expose second-order
/// coefficient geometry.
///
/// The result is always [`ExactOuterDerivativeOrder::Second`].
/// `coefficient_cost` is accepted for API compatibility but does not
/// influence the result: every cost value maps to `Second`.
///
/// # Panics
///
/// Panics when `specs` fail the block-spec consistency check.
pub fn exact_outer_order_from_capability(
    specs: &[ParameterBlockSpec],
    coefficient_cost: u64,
) -> ExactOuterDerivativeOrder {
    assert_valid_blockspecs(specs, "exact outer derivative order");
    // The order is the same for every cost.
    let _ = coefficient_cost;
    ExactOuterDerivativeOrder::Second
}

/// Capability-aware variant of [`exact_outer_order_from_capability`].
///
/// Kept as the public declaration helper for existing family impls, but it no
/// longer gates by cost. Once a caller has established dense or HVP analytic
/// second-order support, the correct derivative order is `Second`.
///
/// Without an outer-Hessian HVP the call is forwarded to
/// `exact_outer_order_from_capability`; with one, the specs are checked here
/// and `Second` is returned directly.
///
/// # Panics
///
/// Panics when `specs` fail the block-spec consistency check.
pub fn exact_outer_order_with_outer_hvp(
    specs: &[ParameterBlockSpec],
    coefficient_cost: u64,
    outer_hyper_hessian_hvp_available: bool,
) -> ExactOuterDerivativeOrder {
    if !outer_hyper_hessian_hvp_available {
        return exact_outer_order_from_capability(specs, coefficient_cost);
    }
    assert_valid_blockspecs(specs, "exact outer derivative order with HVP");
    ExactOuterDerivativeOrder::Second
}

/// Realized outer-derivative policy at the current problem size.
///
/// Capability (the family can produce exact second-order calculus) controls
/// whether the Hessian is declared. Runtime cost controls only representation
/// and staging choices below this layer. Large problems must stay on the exact
/// analytic Hessian path and use an operator representation when dense assembly
/// is too expensive; they are not demoted to first-order BFGS here.
///
/// `OuterDerivativePolicy` records the family's *capability*, the *predicted
/// per-eval cost* for both gradient-only and Hessian paths, and exposes the
/// three policy queries the outer optimizer actually needs:
///
/// * [`order_for_evaluation`](Self::order_for_evaluation) — clamp a requested
///   evaluation order against the policy gate.
/// * [`declared_hessian_form`](Self::declared_hessian_form) — what shape the
///   outer-strategy planner should declare to its plan ladder.
/// * [`should_use_staged_kappa`](Self::should_use_staged_kappa) — auto-route
///   the κ optimizer through the pilot/polish schedule at large `n`.
///
/// All thresholds are *const* — no env vars, no CLI flags. The cost model is
/// the family's own `coefficient_gradient_cost` / `coefficient_hessian_cost`
/// scaled by the joint outer-coordinate dimension, with `saturating_mul` so
/// overflow rounds up to the budget ceiling rather than wrapping silently.
#[derive(Clone, Copy, Debug)]
pub struct OuterDerivativePolicy {
    /// What exact calculus the family advertises in principle.
    pub capability: ExactOuterDerivativeOrder,
    /// Predicted per-eval work for one `ValueGradientHessian` evaluation.
    /// Rounded conservatively *up* via `saturating_mul`. Informational for
    /// representation and diagnostics; it does not disable Hessian capability.
    pub predicted_hessian_work: u128,
    /// Predicted per-eval work for one `ValueAndGradient` evaluation.
    /// Rounded conservatively *up* via `saturating_mul`.
    pub predicted_gradient_work: u128,
    /// True when the family's outer-only paths consume
    /// [`BlockwiseFitOptions::outer_score_subsample`] and produce
    /// Horvitz-Thompson-weighted partial sums (i.e. the family overrides
    /// `log_likelihood_only_with_options`,
    /// `exact_newton_joint_psi_workspace_with_options`, and any other
    /// outer-only hooks reached by `evaluate_custom_family_joint_hyper`).
    ///
    /// Determines whether the κ optimizer's pilot/polish staging schedule
    /// engages: when this is `false`, [`Self::should_use_staged_kappa`]
    /// returns `false` regardless of `n`. Engaging the schedule on a
    /// family that ignores the subsample is strictly worse than not
    /// engaging it — the schedule builds a `RowSet::Subsample` and the
    /// boundary plumbing installs an `OuterScoreSubsample` on options,
    /// but the family's default outer-only paths fall back to full-data
    /// sums, so the pilot evaluation costs the same as the polish but
    /// adds a Vec allocation per eval.
    ///
    /// Families that do **not** consume the subsample (default for new
    /// implementations, including the GAMLSS location-scale families
    /// today) leave this `false`. Families that do consume (today:
    /// `BernoulliMarginalSlopeFamily`) override `outer_derivative_policy`
    /// to set this `true`.
    pub subsample_capable: bool,
}

impl OuterDerivativePolicy {
    /// Ceiling on predicted per-eval gradient work. A policy whose
    /// `predicted_gradient_work` is strictly above this value qualifies for
    /// the staged pilot/polish κ schedule. At large scale (n ≳ 100 k) even
    /// the gradient sweep takes minutes per outer iter; subsampling the
    /// pilot stage cuts that to seconds and leaves the final polish on
    /// full data to recover the MLE.
    pub const OUTER_GRADIENT_WORK_BUDGET: u128 = 50_000_000_000;

    /// Row count at which the pilot subsample auto-engages: the staged
    /// schedule triggers once full-data `n` reaches this value. Below it
    /// (and with gradient work within budget) the κ schedule collapses to a
    /// single full-data stage — behaviour identical to the pre-P7 path.
    pub const STAGED_KAPPA_TRIGGER_N: usize = 30_000;

    /// Clamp a requested evaluation order against the policy gate.
    ///
    /// * A `ValueGradientHessian` request survives only when
    ///   [`Self::declared_hessian_form`] is not `Unavailable`; otherwise it
    ///   is lowered to `ValueAndGradient`.
    /// * A `ValueAndGradient` request is returned as-is (gradient-only is
    ///   universal).
    pub fn order_for_evaluation(
        &self,
        requested: crate::solver::outer_strategy::OuterEvalOrder,
    ) -> crate::solver::outer_strategy::OuterEvalOrder {
        use crate::solver::outer_strategy::{DeclaredHessianForm, OuterEvalOrder};
        match requested {
            // The guard runs only for Hessian requests, so the declared form
            // is queried exactly when it matters.
            OuterEvalOrder::ValueGradientHessian
                if !matches!(
                    self.declared_hessian_form(),
                    DeclaredHessianForm::Unavailable
                ) =>
            {
                OuterEvalOrder::ValueGradientHessian
            }
            OuterEvalOrder::ValueGradientHessian | OuterEvalOrder::ValueAndGradient => {
                OuterEvalOrder::ValueAndGradient
            }
        }
    }

    /// Outer Hessian declaration handed to the outer-strategy planner.
    ///
    /// Returns `Either` exactly when `capability` has a Hessian and
    /// `Unavailable` otherwise. Work estimates pick dense vs operator
    /// assembly later; they must not erase analytic second-order capability
    /// from the planner.
    pub fn declared_hessian_form(&self) -> crate::solver::outer_strategy::DeclaredHessianForm {
        use crate::solver::outer_strategy::DeclaredHessianForm;
        if self.capability.has_hessian() {
            DeclaredHessianForm::Either
        } else {
            DeclaredHessianForm::Unavailable
        }
    }

    /// True when the κ optimizer should auto-route through the staged
    /// pilot/polish schedule.
    ///
    /// Requires `subsample_capable`; a family that does not consume
    /// `outer_score_subsample` would only pay per-eval bookkeeping for a
    /// pilot `RowSet::Subsample` while still summing every row, so the
    /// schedule stays off until the family declares consumption. Given that,
    /// the schedule triggers when **either** the data is big
    /// (`n ≥ STAGED_KAPPA_TRIGGER_N`) **or** the per-eval gradient work
    /// exceeds `OUTER_GRADIENT_WORK_BUDGET`. The second clause catches
    /// problems with moderate `n` but a very wide design (large `p_total`
    /// or `psi_dim`) where one full-data gradient sweep still dominates.
    pub fn should_use_staged_kappa(&self, n: usize) -> bool {
        let data_is_large = n >= Self::STAGED_KAPPA_TRIGGER_N;
        let gradient_over_budget =
            self.predicted_gradient_work > Self::OUTER_GRADIENT_WORK_BUDGET;
        self.subsample_capable && (data_is_large || gradient_over_budget)
    }
}

/// Outer-coordinate count used by the default policy work model:
/// `rho_dim + psi_dim`, where `rho_dim` is the number of penalties summed
/// over all blocks. Each outer evaluation propagates one directional
/// derivative per outer coordinate through the inner solve. Every addition
/// saturates at `u128::MAX`.
#[inline]
fn outer_coord_dim_for_policy(specs: &[ParameterBlockSpec], psi_dim: usize) -> u128 {
    let mut rho_dim = 0u128;
    for spec in specs {
        rho_dim = rho_dim.saturating_add(spec.penalties.len() as u128);
    }
    rho_dim.saturating_add(psi_dim as u128)
}

/// Default predicted-cost model for [`OuterDerivativePolicy`]:
///
/// * gradient work ≈ `coefficient_gradient_cost · (rho_dim + psi_dim)`
/// * Hessian work  ≈ `coefficient_hessian_cost  · (rho_dim + psi_dim)`
///
/// Each outer coordinate triggers one analytic directional derivative
/// through the inner solve; the dense Hessian assembly carries the extra
/// `O(p_total)` factor already captured by `coefficient_hessian_cost`.
///
/// All multiplications saturate so an overflow rounds *up* to the gate
/// ceiling: we'd rather drop one Hessian evaluation that we could have
/// afforded than crash on a 600 s eval.
pub fn default_outer_derivative_policy_costs(
    specs: &[ParameterBlockSpec],
    psi_dim: usize,
    grad_cost: u64,
    hess_cost: u64,
) -> (u128, u128) {
    // Clamp the coordinate count to at least one so a problem with no outer
    // coordinates still reports the bare per-evaluation cost.
    let coord_factor = outer_coord_dim_for_policy(specs, psi_dim).max(1);
    // Widen to u128 first, then scale with saturation; returned as
    // (gradient work, Hessian work).
    let scaled = |unit_cost: u64| (unit_cost as u128).saturating_mul(coord_factor);
    (scaled(grad_cost), scaled(hess_cost))
}

/// Default coefficient-space Hessian cost: `Σ_b n_b · p_b²`, summed across
/// blocks. Represents the work to assemble or apply the dense block-diagonal
/// inner Hessian once.
pub fn default_coefficient_hessian_cost(specs: &[ParameterBlockSpec]) -> u64 {
    let mut total_cost = 0u64;
    for spec in specs {
        let rows = spec.design.nrows() as u64;
        let cols = spec.design.ncols() as u64;
        // Per-block `n_b · p_b²`; both the product and the running sum
        // saturate instead of wrapping.
        let block_cost = rows.saturating_mul(cols.saturating_mul(cols));
        total_cost = total_cost.saturating_add(block_cost);
    }
    total_cost
}

/// Joint-coupled coefficient-space Hessian cost: `n · (Σ_b p_b)²`. The honest
/// per-evaluation work for any family whose row likelihood couples every block
/// (every observation contributes a rank-`m` outer-product update to the full
/// joint Hessian over `Σ p_b` coefficients), as opposed to the block-diagonal
/// `default_coefficient_hessian_cost` which assumes each `X_b' W_b X_b` is
/// assembled independently.
///
/// Used by all GAMLSS, marginal-slope, and joint-latent families. CTN does
/// not delegate here — it uses its Khatri–Rao factor dimensions internally.
pub fn joint_coupled_coefficient_hessian_cost(n: u64, specs: &[ParameterBlockSpec]) -> u64 {
    // Total coefficient count across all blocks, saturating on overflow.
    let mut joint_width = 0u64;
    for spec in specs {
        joint_width = joint_width.saturating_add(spec.design.ncols() as u64);
    }
    // `n · (Σ p_b)²`, again saturating rather than wrapping.
    let joint_width_squared = joint_width.saturating_mul(joint_width);
    n.saturating_mul(joint_width_squared)
}

/// Default coefficient-space gradient cost: half the Hessian cost.
///
/// The first-order analytic gradient in the unified evaluator runs the same
/// inner Newton solve as the second-order path but skips the `K`-fold
/// pairwise Hessian assembly (`B_{j,k}` blocks) and the `K`-fold inner
/// derivative solves; what remains is the inner solve plus a single
/// gradient-only sweep through the data. Empirically this is roughly half
/// the per-evaluation arithmetic of forming the dense Hessian, hence the
/// `/2` default. Families whose gradient assembly differs structurally
/// (e.g. matrix-free Hv operators with no dense Hessian assembly to halve)
/// should override [`CustomFamily::coefficient_gradient_cost`] explicitly.
pub fn default_coefficient_gradient_cost(specs: &[ParameterBlockSpec]) -> u64 {
    // Half the default Hessian cost; integer division rounds down.
    let hessian_cost = default_coefficient_hessian_cost(specs);
    hessian_cost / 2
}

/// Compute β-block column ranges from a slice of `ParameterBlockSpec`s.
///
/// Returns one `Range<usize>` per spec, covering the spec's columns in the
/// concatenated β vector (i.e. `offset .. offset + p_block` where `p_block =
/// spec.design.ncols()`). The ranges are non-overlapping, sorted, and their
/// union covers `0..Σ p_block`.
///
/// This is the canonical source of `block_offsets` for every
/// [`crate::solver::arrow_schur::ArrowSchurSystem`] built for a custom family
/// (survival, GAMLSS, transformation-normal, latent-survival, marginal-slope,
/// …). Pass the result to
/// [`crate::solver::arrow_schur::ArrowSchurSystem::set_block_offsets`] before
/// calling `solve` or `solve_with_options` whenever the system will use
/// [`crate::solver::arrow_schur::ArrowSolverMode::InexactPCG`].
///
/// Specs with zero columns produce a zero-width range; callers that want to
/// skip trivial blocks may filter on `r.start < r.end` after calling this
/// function.
pub fn block_offsets_from_specs(specs: &[ParameterBlockSpec]) -> Arc<[Range<usize>]> {
    // Running start offset of the next block within the concatenated β.
    let mut next_start = 0usize;
    let ranges: Vec<Range<usize>> = specs
        .iter()
        .map(|spec| {
            let start = next_start;
            next_start += spec.design.ncols();
            // A zero-column spec yields the empty range `start..start`.
            start..next_start
        })
        .collect();
    Arc::from(ranges)
}

/// Bound first-order outer iterations when each analytic-gradient evaluation is
/// already large-scale work. This is only applied after the planner has
/// selected a gradient-only route; second-order/ARC plans keep their requested
/// iteration budget.
///
/// Returns `requested` unchanged when `has_outer_hessian` is set, when
/// `requested <= 1`, or when `coefficient_gradient_cost` is zero. Otherwise
/// the result is `requested` capped at
/// `FIRST_ORDER_OUTER_WORK_BUDGET / coefficient_gradient_cost`, with that cap
/// never dropping below `MIN_FIRST_ORDER_ITERS`.
pub fn cost_gated_first_order_max_iter(
    requested: usize,
    coefficient_gradient_cost: u64,
    has_outer_hessian: bool,
) -> usize {
    const FIRST_ORDER_OUTER_WORK_BUDGET: u64 = 80_000_000_000;
    const MIN_FIRST_ORDER_ITERS: usize = 4;

    if has_outer_hessian || requested <= 1 || coefficient_gradient_cost == 0 {
        return requested;
    }

    // The quotient can exceed `usize::MAX` on 32-bit targets (the budget alone
    // does not fit in 32 bits). Saturate instead of truncating with `as`, so a
    // cheap gradient can never wrap into a tiny iteration cap.
    let affordable: usize =
        std::convert::TryFrom::try_from(FIRST_ORDER_OUTER_WORK_BUDGET / coefficient_gradient_cost)
            .unwrap_or(usize::MAX);
    requested.min(affordable.max(MIN_FIRST_ORDER_ITERS))
}

/// Local trust budget for first-order outer BFGS on log-smoothing parameters.
///
/// One unit in `rho = log(lambda)` is an `e`-fold smoothing-parameter change.
/// Previously this cap was `1.0`, which throttled BFGS to ~1/5 of its
/// quasi-Newton step on flat REML surfaces (the natural BFGS direction has
/// `|d|_inf` of ~5 in log-λ for large-scale survival fits). Probes whose
/// `step_inf > cap` are rejected for free in `OuterFirstOrderBridge::eval_cost`
/// (returning `BFGS_LINE_SEARCH_REJECT_COST` without running an inner solve),
/// so a larger cap costs nothing on rejection — it only lets Strong-Wolfe
/// accept bigger steps that the inner-PIRLS divergence guard can already
/// validate. `5.0` allows up to `e^5 ≈ 148`-fold smoothing-parameter change
/// per accepted outer iter, which matches the typical quasi-Newton direction
/// magnitude while still bounding pathological probes.
pub const fn first_order_bfgs_loglambda_step_cap(has_outer_hessian: bool) -> Option<f64> {
    match has_outer_hessian {
        // With an outer Hessian no cap is imposed.
        true => None,
        // First-order route: cap the step at 5.0 in log-λ units.
        false => Some(5.0),
    }
}

/// Returns `true` exactly when the family's exact-Newton outer objective is
/// [`ExactNewtonOuterObjective::StrictPseudoLaplace`].
pub(crate) fn exact_newton_outer_geometry_supports_second_order_solver<F>(family: &F) -> bool
where
    F: CustomFamily + ?Sized,
{
    matches!(
        family.exact_newton_outerobjective(),
        ExactNewtonOuterObjective::StrictPseudoLaplace
    )
}

/// Family evaluation over all parameter blocks.
///
/// This is the success value of [`CustomFamily::evaluate`].
#[derive(Clone, Debug)]
pub struct FamilyEvaluation {
    /// Log-likelihood value reported by the family for the evaluated block
    /// states.
    pub log_likelihood: f64,
    /// Working sets produced alongside the log-likelihood — presumably one
    /// entry per parameter block, in block order (TODO confirm against the
    /// family implementations).
    pub blockworking_sets: Vec<BlockWorkingSet>,
}

/// A log-likelihood value paired with a gradient vector.
///
/// NOTE(review): the name suggests this is the result of an exact-Newton
/// joint gradient evaluation; the producing code is not visible here, so the
/// coordinate order and sign convention of `gradient` should be confirmed
/// against it.
pub struct ExactNewtonJointGradientEvaluation {
    /// Log-likelihood value.
    pub log_likelihood: f64,
    /// Gradient vector — presumably taken with respect to the joint
    /// (concatenated) coefficient vector; TODO confirm.
    pub gradient: Array1<f64>,
}

/// Batched outer-Hessian result: the exact profiled outer Hessian a family
/// produces in one amortized evaluation.
pub struct BatchedOuterHessianTerms {
    /// Exact profiled outer Hessian over θ = (ρ, ψ), assembled or exposed in
    /// operator form by the family in one amortized evaluation.
    pub outer_hessian: crate::solver::outer_strategy::HessianResult,
}

/// Batched per-θ_j contributions to the analytic outer gradient.
///
/// Used by [`CustomFamily::batched_outer_gradient_terms`] to amortize the
/// joint-Hessian factorization across all K hyperparameters: instead of
/// computing each `tr(H⁻¹ · Ḣ_j)` independently (K independent solves), the
/// family factors `H` once, computes per-row leverages `L_i = Z_i H⁻¹ Z_iᵀ`,
/// and accumulates all K traces in a single streaming pass.
///
/// All three vectors have length equal to the total number of outer
/// hyperparameters (K = `rho.len() + Σ derivative_blocks[b].len()`), in the
/// same coordinate order as the unified evaluator's gradient: ρ-coords first,
/// ψ-coords appended.
///
/// # Assembly formula
///
/// The caller assembles the outer gradient as
///
/// ```text
///   grad[j] = objective_theta[j]
///           + 0.5 * trace_h_inv_hdot[j]
///           - 0.5 * trace_s_pinv_sdot[j]
/// ```
///
/// matching the three-term convention in [`outer_gradient_entry`] (penalty +
/// trace − det).
pub struct BatchedOuterGradientTerms {
    /// Explicit ∂J/∂θ_j contributions evaluated at the converged β̂ holding
    /// β fixed (i.e. the part that does NOT flow through H or S):
    ///
    /// * For ρ-coords: `½ β̂ᵀ A_k β̂` (penalty quadratic).
    /// * For ψ-coords: `V_i^explicit + g_i^explicit · β̂` style contributions.
    pub objective_theta: Array1<f64>,
    /// `tr(H⁻¹ · ∂H/∂θ_j)` for each j, with H = -∇²log L + S the full
    /// penalized Hessian at the mode.
    pub trace_h_inv_hdot: Array1<f64>,
    /// `tr(S⁺ · ∂S/∂θ_j)` for each j (penalty pseudo-logdet first derivative).
    pub trace_s_pinv_sdot: Array1<f64>,
}

/// User-defined family contract for multi-block generalized models.
pub trait CustomFamily {
    /// Family-owned fingerprint for persistent coefficient warm-starts.
    ///
    /// The generic block specs contain design matrices, offsets, penalties,
    /// and dimensions, but they deliberately do not know the family response
    /// vector or likelihood-side data stored on `Self`. Reusing β across
    /// different responses is mathematically unsafe, so persistent block-level
    /// warm-starts are enabled only for families that provide a fingerprint of
    /// the data that defines their likelihood. Outer ρ cache remains available
    /// independently through `BlockwiseFitOptions::cache_session`.
    fn persistent_warm_start_fingerprint(
        &self,
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> Option<String> {
        // The default still passes both inputs through the shared assertion
        // helpers before answering.
        assert_valid_blockspecs(specs, "persistent warm-start fingerprint");
        assert_valid_options(options, "persistent warm-start fingerprint");
        // `None` = no fingerprint: persistent block-level warm-starts stay
        // disabled unless a family overrides this method.
        None
    }

    /// Evaluate log-likelihood and per-block working quantities at current block predictors.
    ///
    /// Required method: it has no default body, so every family must
    /// implement it. On success the returned [`FamilyEvaluation`] carries the
    /// log-likelihood and the block working sets; failures are reported as a
    /// `String` message.
    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String>;

    /// Compute only the log-likelihood without building working sets.
    ///
    /// This is used in backtracking line searches where only the objective value
    /// is needed, avoiding the O(n × blocks) cost of assembling IRLS working
    /// weights and responses that will be immediately discarded.
    ///
    /// The default implementation falls back to `evaluate()` and discards the
    /// working sets.  Families with expensive working-set assembly should
    /// override this for a significant speedup.
    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
        // Fallback path: run the full evaluation, keep only the scalar
        // log-likelihood, and let the working sets drop.
        let evaluation = self.evaluate(block_states)?;
        Ok(evaluation.log_likelihood)
    }

    /// Options-aware log-likelihood evaluation for line search.
    ///
    /// Default passes `options` through `assert_valid_options` and then
    /// forwards to [`Self::log_likelihood_only`]; the options do not
    /// otherwise influence the returned value.
    /// Families that consult `options.outer_score_subsample` (or other
    /// per-call options that affect the LL value) must override this so the
    /// joint-Newton line search and the post-accept gradient reload agree
    /// on which row subset is being evaluated. Large-scale outer-only
    /// callers (including the joint-Newton line-search screening path) can
    /// override this to evaluate a deterministic paired Horvitz-Thompson
    /// estimate without constructing a full exact-Newton workspace.
    fn log_likelihood_only_with_options(
        &self,
        block_states: &[ParameterBlockState],
        options: &BlockwiseFitOptions,
    ) -> Result<f64, String> {
        assert_valid_options(options, "log_likelihood_only_with_options");
        self.log_likelihood_only(block_states)
    }

    /// Whether `log_likelihood_only_with_options` can use
    /// `BlockwiseFitOptions::early_exit_threshold` to reject line-search trials
    /// without computing the full log-likelihood.
    ///
    /// Default: `false`. Families opt in by overriding this hook.
    fn supports_log_likelihood_early_exit(&self) -> bool {
        false
    }

    /// Selects the outer objective semantics for exact-Newton families.
    ///
    /// `RidgedQuadraticReml` is the explicit ridged surrogate REML surface:
    ///
    ///   -loglik + penalty + 0.5 (log|H| - log|S|_+)
    ///
    /// The determinant terms in this mode are evaluated on the stabilized
    /// curvature surface declared by `ridge_policy`, so this objective is an
    /// explicitly modified surrogate rather than an exact Laplace expansion
    /// at an indefinite Hessian.
    ///
    /// `StrictPseudoLaplace` is the exact-mode pseudo-Laplace surface used by the
    /// Charbonnier spatial family:
    ///
    ///   -loglik + penalty + 0.5 log|H|
    ///
    /// The latter deliberately omits the quadratic-only `-0.5 log|S|_+`
    /// normalization term because there is no tractable exact analogue for the
    /// nonquadratic prior without introducing the intractable prior normalizer.
    fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
        // Default objective is the ridged surrogate REML surface; families
        // needing the strict pseudo-Laplace surface override this.
        ExactNewtonOuterObjective::RidgedQuadraticReml
    }

    /// Whether the joint likelihood Hessian H_L depends on β.
    ///
    /// When `true`, the unified evaluator includes M_j[u] = D_β B_j[u]
    /// moving-design drift correction for ψ coordinates and marks
    /// `HyperCoord::b_depends_on_beta = true`.
    ///
    /// Default: `true` for StrictPseudoLaplace, `false` for RidgedQuadraticReml.
    /// Gaussian location-scale must override to `true` because their
    /// joint Hessian depends on β even though outer objective is RidgedQuadraticReml.
    fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
        // β-dependence is assumed for every outer objective except the
        // ridged quadratic REML surrogate.
        !matches!(
            self.exact_newton_outerobjective(),
            ExactNewtonOuterObjective::RidgedQuadraticReml
        )
    }

    /// Whether the outer REML/LAML logdet term `½ log|H + Sλ|` and its analytic
    /// trace gradient `½ tr((H+Sλ)⁺ ∂Sλ)` are evaluated over the FULL
    /// identifiable subspace `range(H + Sλ)` (mgcv's generalized determinant,
    /// gam#752) rather than the penalty-range subspace `range(Sλ)`.
    ///
    /// This is a value/gradient SUBSPACE-CONSISTENCY concern, orthogonal to
    /// whether the Hessian depends on β (`exact_newton_joint_hessian_beta_dependent`,
    /// which gates the *drift* corrections). The previous code conflated the two
    /// by gating the projected logdet on β-dependence, so `RidgedQuadraticReml`
    /// families (survival/bernoulli marginal-slope) silently used the
    /// `range(Sλ)`-only determinant: on a near-collinear penalty-null trend (the
    /// clustered-PC matern marginal-slope geometry) that DROPS the penalty-null
    /// likelihood determinant `log|U_kᵀ H U_k|` from the value while
    /// `½ log|Sλ|₊` is correctly over `range(Sλ)`, making the ρ-derivative of the
    /// REML criterion inconsistent. The outer optimizer then drives that block's
    /// λ → ∞ and the envelope gradient (valid only at a stationary β̂) freezes —
    /// the constant-‖g‖ outer stall in gam#808/#787.
    ///
    /// The generalized determinant is the correct objective in ALL cases: when
    /// `H + Sλ` is full rank it equals the ordinary logdet (the projection is a
    /// no-op, so the correction is ≈0), and when it is rank-deficient it drops
    /// only the truly unidentified `ker(H) ∩ ker(Sλ)` directions — exactly the
    /// directions `½ log|Sλ|₊` also omits, keeping value and gradient over one
    /// subspace. Always enabled by default.
    fn use_projected_penalty_logdet(&self) -> bool {
        // On by default for every family; an override returning `false` is
        // the only way to turn it off.
        true
    }

    /// Per-evaluation arithmetic cost of forming or applying the inner
    /// coefficient-space Hessian once, in flop-equivalent units. This is used
    /// for diagnostics, seed-budget policy, and first-order iteration caps
    /// when a family genuinely lacks analytic second-order support. It is not
    /// allowed to hide an analytic Hessian from the outer optimizer.
    ///
    /// The default returns `Σ_b n_b · p_b²` via [`default_coefficient_hessian_cost`],
    /// which is the honest assembly cost only when the joint Hessian is
    /// **block-diagonal** — i.e. the inner solver assembles each block's
    /// `X_b' W_b X_b` independently, with no cross-block coupling per row.
    /// Families whose row likelihood couples all blocks (every row contributes
    /// a rank-`m` outer-product update to the full joint Hessian over
    /// `Σ p_b` coefficients) **must** override and delegate to
    /// [`joint_coupled_coefficient_hessian_cost`] (or the equivalent factored
    /// form for tensor designs), otherwise the default undercounts the
    /// cross-block outer-product terms `2·Σ_{a<b} n·p_a·p_b`.
    ///
    /// Concretely:
    ///
    /// * **Block-diagonal** (default OK): `LatentBinaryFamily` collects
    ///   separate `hess_time` and `hess_mean` per row, never forming an
    ///   off-diagonal contribution.
    /// * **Joint-coupled** (override via [`joint_coupled_coefficient_hessian_cost`]):
    ///   GAMLSS location-scale, GAMLSS wiggle variants, marginal-slope families
    ///   (Bernoulli, Survival), `LatentSurvivalFamily`,
    ///   `SurvivalLocationScaleFamily` — every row contributes to the full
    ///   `(Σ p_b)²` joint Hessian via Jacobian pullback of a multi-dimensional
    ///   primary kernel.
    /// * **Single-block** (default OK): tensor designs whose `design.ncols()`
    ///   already equals `p_total` (e.g. CTN's Khatri–Rao `n × (p_resp·p_cov)`);
    ///   `n · p²` reduces correctly to `n · p_resp² · p_cov²`.
    /// * **Matrix-free Hessian operator**: families that expose
    ///   [`Self::exact_newton_joint_hessian_workspace`] with operator-form
    ///   directional derivatives (CTN at large scale) may instead return
    ///   the per-`Hv` matvec cost (e.g. `n·(p_resp + p_cov)` for Khatri–Rao)
    ///   so the gate reflects the operator path rather than the dense
    ///   build that the unified evaluator skips.
    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        // Block-diagonal estimate `Σ_b n_b · p_b²`; joint-coupled families
        // are expected to override (see the doc comment on this method).
        default_coefficient_hessian_cost(specs)
    }

    /// Per-evaluation arithmetic cost of one analytic-gradient outer
    /// evaluation, in flop-equivalent units. Used only when the family
    /// genuinely has no analytic outer Hessian and the planner must use a
    /// first-order optimizer.
    ///
    /// The default returns `coefficient_hessian_cost / 2` (see
    /// [`default_coefficient_gradient_cost`]). Families whose gradient
    /// assembly differs structurally should override; in particular,
    /// joint-coupled families that override `coefficient_hessian_cost` to
    /// `joint_coupled_coefficient_hessian_cost(n, specs)` automatically
    /// inherit the corresponding gradient cost via this default — no
    /// per-family override is required for the GAMLSS / marginal-slope /
    /// joint-latent path.
    fn coefficient_gradient_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
        // Halve whatever Hessian cost this family reports (including any
        // override); integer division rounds down.
        let hessian_cost = self.coefficient_hessian_cost(specs);
        hessian_cost / 2
    }

    /// Declares how much exact outer calculus this family wants to expose for
    /// the current realized problem size.
    ///
    /// The default exposes exact second-order calculus whenever the family
    /// advertises either dense outer Hessian blocks or profiled outer-Hessian
    /// HVPs. Large problems must stay exact and select an operator
    /// representation; they are not demoted to first-order optimizers.
    ///
    /// **Capability vs representation.** This method reports the highest
    /// analytic order this family implements. The realized policy carries
    /// work estimates for dense/operator routing and staged κ schedules, but
    /// those estimates do not downgrade a second-order family to a first-order
    /// optimizer.
    fn exact_outer_derivative_order(
        &self,
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> ExactOuterDerivativeOrder {
        assert!(std::mem::size_of_val(options) > 0);

        // The work figure handed to the order helper is the larger of the
        // two per-evaluation cost estimates.
        let hessian_cost = self.coefficient_hessian_cost(specs);
        let gradient_cost = self.coefficient_gradient_cost(specs);
        let coefficient_work = hessian_cost.max(gradient_cost);

        // Neither dense outer-Hessian blocks nor an outer-Hessian HVP: the
        // family is first-order only.
        let has_second_order_support = self.outer_hyper_hessian_dense_available(specs)
            || self.outer_hyper_hessian_hvp_available(specs);
        if !has_second_order_support {
            return ExactOuterDerivativeOrder::First;
        }

        let hvp_available = self.outer_hyper_hessian_hvp_available(specs);
        exact_outer_order_with_outer_hvp(specs, coefficient_work, hvp_available)
    }

    /// Realized outer-derivative policy at the current problem size.
    ///
    /// Built from three ingredients:
    ///
    /// * the capability reported by [`Self::exact_outer_derivative_order`];
    /// * predicted per-eval work, obtained by passing
    ///   [`Self::coefficient_gradient_cost`] and
    ///   [`Self::coefficient_hessian_cost`] to
    ///   [`default_outer_derivative_policy_costs`], which scales each cost by
    ///   the outer-coordinate count `rho_dim + psi_dim` (treated as at least
    ///   one);
    /// * the flag from [`Self::outer_derivative_subsample_capable`].
    ///
    /// Capability decides derivative order; predicted costs inform
    /// dense/operator routing and staged κ schedules.
    ///
    /// Families with non-generic cost models (Khatri–Rao CTN, matrix-free
    /// HVP families, marginal-slope row-third workloads) should override
    /// this directly and set the `predicted_*_work` fields from their own
    /// cost model.
    fn outer_derivative_policy(
        &self,
        specs: &[ParameterBlockSpec],
        psi_dim: usize,
        options: &BlockwiseFitOptions,
    ) -> OuterDerivativePolicy {
        let capability = self.exact_outer_derivative_order(specs, options);
        // The helper returns (gradient work, Hessian work), in that order.
        let (gradient_work, hessian_work) = default_outer_derivative_policy_costs(
            specs,
            psi_dim,
            self.coefficient_gradient_cost(specs),
            self.coefficient_hessian_cost(specs),
        );
        let subsample_capable = self.outer_derivative_subsample_capable();
        OuterDerivativePolicy {
            capability,
            predicted_hessian_work: hessian_work,
            predicted_gradient_work: gradient_work,
            subsample_capable,
        }
    }

    /// Whether this family's outer-only paths honour HT-weighted partial sums
    /// over `options.outer_score_subsample`.
    ///
    /// Default `false`: the trait's default outer-only paths
    /// (`log_likelihood_only_with_options`,
    /// `exact_newton_joint_psi_workspace_with_options`, ...) forward to the
    /// no-options variants and ignore `outer_score_subsample`. Families that
    /// override those hooks to honour HT-weighted partial sums should override
    /// this hook to return `true`; the default [`Self::outer_derivative_policy`]
    /// then threads the flag into the emitted [`OuterDerivativePolicy`].
    fn outer_derivative_subsample_capable(&self) -> bool {
        // Conservative default: declare no subsample support, which keeps the
        // default policy's `subsample_capable` flag off.
        false
    }

    /// Family-specific outer seeding policy.
    ///
    /// The default preserves the generic custom-family behavior. Families with
    /// a strong warm start can override this to keep seed screening from
    /// dominating the fit.
    fn outer_seed_config(&self, n_params: usize) -> crate::seeding::SeedConfig {
        let mut seed_config = crate::seeding::SeedConfig::default();
        // With no outer parameters the stock configuration is returned as-is.
        if n_params > 0 {
            // Up to four parameters get six seeds; larger problems get four.
            seed_config.max_seeds = match n_params {
                1..=4 => 6,
                _ => 4,
            };
            seed_config.seed_budget = 1;
            seed_config.screen_max_inner_iterations = 2;
        }
        seed_config
    }

    /// Whether outer hyper-derivative evaluation must use a joint exact path.
    ///
    /// Default `false` allows the generic blockwise diagonal fallback when a
    /// family does not provide joint exact curvature.
    ///
    /// Families with coupled multi-block likelihoods can override this to
    /// prevent the outer code from silently evaluating a mathematically
    /// invalid block-local surrogate. The failure mode is:
    ///
    /// 1. the outer derivative still has block-local forcing
    ///      g_k = A_k beta
    ///    because `rho_k` enters only through the penalty;
    /// 2. but the fitted mode response is not block-local,
    ///      H u_k = -g_k,
    ///    because the likelihood Hessian has off-diagonal block coupling;
    /// 3. therefore a blockwise solve
    ///      H_b u_{k,b} = -(A_k beta)_b
    ///    is not the derivative of the profiled objective the code claims to
    ///    be optimizing.
    ///
    /// When this flag is `true`, the family is asserting that any outer
    /// hyper-derivative path must first obtain the full joint exact curvature
    /// before it can return a mathematically valid result.
    fn requires_joint_outer_hyper_path(&self) -> bool {
        // Default: the generic blockwise fallback is permitted; families with
        // coupled likelihoods override this to `true`.
        false
    }

    /// Per-block output-channel assignment for the identifiability audit.
    ///
    /// Multi-parameter families (Dirichlet, beta, Gaussian/binomial
    /// location-scale, multinomial, …) drive several *independent* linear
    /// predictors `η_r = X_r β_r`, one per distributional parameter / class.
    /// Each [`ParameterBlockSpec`] feeds exactly one of those output channels.
    /// When two blocks share the same covariate basis (e.g. every Dirichlet
    /// component uses the same `[1 | B]`), their columns are *not* gauge
    /// aliases — they are block-diagonal entries of the true joint Jacobian
    /// `blkdiag(X_0, …, X_{m-1})`, full rank `Σ p_b`.
    ///
    /// The pre-fit identifiability audit can only see this block-diagonal
    /// structure through the **channel-aware** route, which requires each
    /// block to carry a multi-output `jacobian_callback` (n_outputs > 1).
    /// Families built via the canonical helpers (`build_location_scale_block`,
    /// `MultinomialFamily::build_block_specs`) wire that callback themselves;
    /// families fit through the low-level `fit_custom_family` API with
    /// hand-built specs do not, and the flat audit then mistakes the repeated
    /// shared basis for cross-block aliases and refuses a well-posed fit
    /// (issues #319 / #363 / #558).
    ///
    /// Returning `Some(channels)` — a vector of length `specs.len()` giving the
    /// zero-based output channel each block drives — lets `fit_custom_family`
    /// install the appropriate [`AdditiveBlockJacobian`] on any block that
    /// lacks an explicit callback, so the audit routes channel-aware
    /// automatically. The total channel count is `channels.iter().max() + 1`.
    ///
    /// Default: every block drives output channel 0. `wire_output_channels`
    /// recognizes this as the single-output flat route and leaves specs unchanged.
    ///
    /// When `Some`, the returned vector MUST have length equal to the number
    /// of blocks; `fit_custom_family` surfaces a structured error otherwise.
    fn output_channel_assignment(&self, specs: &[ParameterBlockSpec]) -> Option<Vec<usize>> {
        // One entry per block, every block assigned to output channel 0.
        let block_count = specs.len();
        let all_channel_zero: Vec<usize> = vec![0; block_count];
        Some(all_channel_zero)
    }

    /// Optional dynamic geometry hook for blocks whose design/offset depend on
    /// current values of other blocks.
    fn block_geometry(
        &self,
        block_states: &[ParameterBlockState],
        spec: &ParameterBlockSpec,
    ) -> Result<(DesignMatrix, Array1<f64>), String> {
        assert!(block_states.len() <= isize::MAX as usize);
        // Static default: the effective geometry is an owned copy of the
        // design and offset stored on the spec; the block state is not read.
        let static_design: DesignMatrix = spec.design.clone();
        let static_offset: Array1<f64> = spec.offset.clone();
        Ok((static_design, static_offset))
    }

    /// Whether `block_geometry(...)` can change with the current block state.
    ///
    /// The default implementation is static: the effective geometry is just the
    /// stored `spec.design/spec.offset`, so the fit engine can use those
    /// references directly without repeatedly cloning dense matrices.
    ///
    /// Families that override `block_geometry(...)` with state-dependent
    /// behavior must override this to return `true`.
    fn block_geometry_is_dynamic(&self) -> bool {
        // Default: geometry is static, i.e. it does not depend on block state.
        false
    }

    /// Optional directional derivative of the effective block geometry wrt the
    /// current block coefficients.
    ///
    /// For a block with effective predictor
    ///
    ///   eta(beta) = X(beta) beta + o(beta),
    ///
    /// the directional derivative along `d_beta` is
    ///
    ///   D eta[d_beta] = X d_beta + (D X[d_beta]) beta + D o[d_beta].
    ///
    /// For diagonal working-set REML derivatives this contributes to both:
    ///
    ///   D H[d_beta]
    ///   = (D X[d_beta])^T W X
    ///   + X^T W (D X[d_beta])
    ///   + X^T diag(D w[D eta[d_beta]]) X,
    ///
    /// and to the predictor drift fed into the weight directional derivative.
    ///
    /// Default `None` means the family is declaring that the current block's
    /// geometry has no coefficient-dependent drift beyond the base `X d_beta`
    /// term. Families with dynamic `block_geometry` must implement this hook
    /// when that declaration is false.
    fn block_geometry_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        block_spec: &ParameterBlockSpec,
        arr: &Array1<f64>,
    ) -> Result<Option<BlockGeometryDirectionalDerivative>, String> {
        // The default body touches its arguments only through these
        // assertions; each one panics (it does not return `Err`) on violation.
        // NOTE(review): they look like parameter-use guards rather than real
        // validation — confirm before relying on them.
        assert!(block_states.len() <= isize::MAX as usize);
        // Fails only for `idx == usize::MAX`.
        assert!(idx < usize::MAX);
        // Panics when the block spec carries an empty name.
        assert!(!block_spec.name.is_empty());
        // Panics when any entry of `arr` is NaN; infinities are accepted.
        assert!(arr.iter().all(|v| !v.is_nan()));
        // Default: no coefficient-dependent geometry drift is reported.
        Ok(None)
    }

    /// Optional per-block coefficient projection applied after each block update.
    fn post_update_block_beta(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        block_spec: &ParameterBlockSpec,
        beta: Array1<f64>,
    ) -> Result<Array1<f64>, String> {
        // These assertions are the only use of the non-`beta` arguments in the
        // default body; each panics (rather than returning `Err`) on violation.
        // NOTE(review): they look like parameter-use guards — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        // Fails only for `idx == usize::MAX`.
        assert!(idx < usize::MAX);
        // Panics when the block spec carries an empty name.
        assert!(!block_spec.name.is_empty());
        // Default projection is the identity: `beta` is returned unchanged.
        Ok(beta)
    }

    /// Optional barrier-aware maximum feasible step size for a block update.
    ///
    /// Given the current block state and a proposed step direction `delta`,
    /// returns `Some(alpha_max)` where `alpha_max` is the largest step size
    /// in `(0, 1]` such that `beta + alpha_max * delta` remains strictly
    /// feasible with respect to any implicit barrier in the likelihood.
    ///
    /// Families whose log-likelihood contains natural log-barrier terms
    /// (e.g. `log(h')` in transformation-normal) should implement this to
    /// prevent the line search from evaluating the likelihood at infeasible
    /// points.  A fraction-to-boundary safety factor (e.g. 0.995) should be
    /// applied internally.
    ///
    /// Returns `None` if no barrier constraint applies (the default).
    fn max_feasible_step_size(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<f64>, String> {
        // The arguments are only inspected by these assertions, which panic
        // (they do not return `Err`) on violation.
        // NOTE(review): they look like parameter-use guards — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        // Fails only for `idx == usize::MAX`.
        assert!(idx < usize::MAX);
        // Panics when any entry of `arr` is NaN; infinities are accepted.
        assert!(arr.iter().all(|v| !v.is_nan()));
        // Default: no barrier constraint, so no step-size cap is reported.
        Ok(None)
    }

    /// Optional linear inequality constraints for a block update:
    /// `A * beta_block >= b`.
    fn block_linear_constraints(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        block_spec: &ParameterBlockSpec,
    ) -> Result<Option<LinearInequalityConstraints>, String> {
        // The arguments are only inspected by these assertions, which panic
        // (they do not return `Err`) on violation.
        // NOTE(review): they look like parameter-use guards — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        // Fails only for `idx == usize::MAX`.
        assert!(idx < usize::MAX);
        // Panics when the block spec carries an empty name.
        assert!(!block_spec.name.is_empty());
        // Default: the block is unconstrained.
        Ok(None)
    }

    /// Optional exact directional derivative of a block's ExactNewton Hessian.
    ///
    /// Returns `Some(dH)` where:
    /// - `dH` is the directional derivative of the block Hessian with respect to
    ///   the provided coefficient-space direction `d_beta` at current state.
    /// - shape is `(p_block, p_block)`.
    ///
    /// Default `None` means no exact directional Hessian drift is available.
    /// Exact REML/LAML derivative paths that require this term should treat
    /// `None` as unavailable rather than silently substituting zero.
    fn exact_newton_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // The arguments are only inspected by these assertions, which panic
        // (they do not return `Err`) on violation.
        // NOTE(review): they look like parameter-use guards — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        // Fails only for `idx == usize::MAX`.
        assert!(idx < usize::MAX);
        // Panics when any entry of the direction `arr` is NaN; infinities pass.
        assert!(arr.iter().all(|v| !v.is_nan()));
        // Default: no exact directional Hessian derivative is available.
        Ok(None)
    }

    /// Optional exact second directional derivative of a block's ExactNewton Hessian.
    ///
    /// Returns `Some(d2H)` where:
    /// - `d2H` is `D²_beta H_L[u, v]` for the provided block-local
    ///   coefficient-space directions.
    /// - shape is `(p_block, p_block)`.
    ///
    /// Generic single-block REML/LAML Hessian evaluation requires this term for
    /// `BlockWorkingSet::ExactNewton` blocks; `None` means the exact second
    /// Hessian drift is unavailable.
    fn exact_newton_hessian_second_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
        arr2: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // The arguments are only inspected by these assertions, which panic
        // (they do not return `Err`) on violation.
        // NOTE(review): they look like parameter-use guards — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        // Fails only for `idx == usize::MAX`.
        assert!(idx < usize::MAX);
        // Both directions must be NaN-free (infinities pass); otherwise panic.
        assert!(arr.iter().all(|v| !v.is_nan()));
        assert!(arr2.iter().all(|v| !v.is_nan()));
        // Default: no exact second directional Hessian derivative is available.
        Ok(None)
    }

    /// Optional exact joint coefficient-space Hessian across all blocks.
    ///
    /// Returns the unpenalized matrix `H_L = -nabla^2 log L` in the flattened block order.
    ///
    /// This is the **observed** (actual) Hessian of the log-likelihood at the mode,
    /// NOT the expected Fisher information. The outer REML/LAML evaluator requires
    /// the observed Hessian for the exact Laplace approximation (see response.md
    /// Section 3). Since this method returns the actual second derivative of log L,
    /// it is correct by construction.
    ///
    /// For families using `BlockWorkingSet::Diagonal` (IRLS-style updates), the
    /// per-block Hessian is X'WX where W is the working weight. For canonical links
    /// W_obs = W_Fisher, but for non-canonical links the working weight should include
    /// the observed-information correction W_obs = W_Fisher - (y-mu)*B.
    fn exact_newton_joint_hessian(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<Array2<f64>>, String> {
        // Default block-diagonal assembly from per-block ExactNewton hessians.
        // This is the inner-fit-side default and is *intentionally* not gated
        // by `likelihood_blocks_uncoupled()`: the inner joint-Newton loop only
        // uses this Hessian as a Newton-direction surrogate that is
        // immediately validated by the line-search + objective decrease, so
        // even if the family is coupled, an under-resolved block-diagonal
        // direction will simply backtrack instead of corrupting the outer
        // REML score.  The strict coupling gate lives one layer up, on
        // `exact_newton_joint_hessian_with_specs`, where outer REML trace
        // algebra would silently produce wrong answers from a missing
        // cross-block term.
        //
        // The helper's result (including any `Err`) is returned unchanged.
        exact_newton_joint_hessian_from_exact_blocks(self, block_states)
    }

    /// Optional exact joint log-likelihood / score evaluation in flattened
    /// coefficient space without building per-block Hessian working sets.
    fn exact_newton_joint_gradient_evaluation(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
    ) -> Result<Option<ExactNewtonJointGradientEvaluation>, String> {
        // These length checks are the only use of the arguments here.
        // NOTE(review): they look like parameter-use guards rather than real
        // validation — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        // Default: no joint gradient evaluation is provided.
        Ok(None)
    }

    /// Optional exact directional derivative of the joint coefficient-space Hessian.
    ///
    /// Returns `Some(dH)` where `dH` is the directional derivative of the
    /// unpenalized joint Hessian `H = -∇² log L` along the flattened
    /// coefficient-space direction `d_beta_flat`.
    fn exact_newton_joint_hessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Default: delegate to the `_from_blocks` free helper and return its
        // result (including any `Err`) unchanged.
        exact_newton_joint_hessian_directional_derivative_from_blocks(
            self,
            block_states,
            d_beta_flat,
        )
    }

    /// Optional exact second directional derivative of the joint Hessian.
    ///
    /// Returns `Some(d2H)` where `d2H` is:
    ///   D²H[u, v] = d/dε d/dδ H(beta + εu + δv) |_{ε=δ=0}
    /// for flattened coefficient-space directions `u = d_beta_u_flat`,
    /// `v = d_betav_flat`.
    fn exact_newton_joint_hessiansecond_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Default: delegate to the `_from_blocks` free helper, passing the two
        // directions in the same (u, v) order, and return its result unchanged.
        exact_newton_joint_hessiansecond_directional_derivative_from_blocks(
            self,
            block_states,
            d_beta_u_flat,
            d_betav_flat,
        )
    }

    /// Optional per-evaluation workspace for exact joint Hessian operators and
    /// directional derivatives.
    ///
    /// Families with expensive cache construction can override this to build
    /// shared state once and reuse it across the repeated `dH[v]` / `d²H[u,v]`
    /// calls made by the unified outer evaluator.
    fn exact_newton_joint_hessian_workspace(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        // These length checks are the only use of the arguments here.
        // NOTE(review): they look like parameter-use guards rather than real
        // validation — confirm intent.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        // Default: no shared workspace is built.
        Ok(None)
    }

    /// Outer-aware variant of `exact_newton_joint_hessian_workspace`.
    ///
    /// Families that consume the optional outer-only stratified row subsample
    /// (`options.outer_score_subsample`) override this method so the joint
    /// Hessian workspace can be constructed with the subsample mask attached.
    /// Generic families can stick with the default implementation, which
    /// simply forwards to the legacy no-options method and ignores the
    /// options. This keeps full backward compatibility with existing
    /// implementors while letting the marginal-slope families thread the
    /// subsample down into the cached per-evaluation joint-Hessian directional
    /// derivative paths.
    fn exact_newton_joint_hessian_workspace_with_options(
        &self,
        states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointHessianWorkspace>>, String> {
        // This check is the only use of `options` in the default body.
        assert_valid_options(options, "exact Newton joint Hessian workspace");
        // Forward to the options-free hook; `options` is not passed along.
        self.exact_newton_joint_hessian_workspace(states, specs)
    }

    /// Optional batched analytic-gradient hook.
    ///
    /// Returns the K per-θ_j gradient contributions ([`BatchedOuterGradientTerms`])
    /// in one amortized pass when the family can factor its joint Hessian
    /// once and stream row-block leverages instead of computing each
    /// `tr(H⁻¹ · ∂H/∂θ_j)` independently.
    ///
    /// # Cost amortization
    ///
    /// Generic per-θ_j path: `O(K · n · p²)` (K independent dense traces).
    /// Batched path: `O(n · p²)` (single factor + leverage stream)
    ///                 + `O(K · n · m²)` (per-row block-diagonal accumulators
    ///                   with `m` = per-row predictor dimension; m = 2 for
    ///                   GAMLSS location-scale, 1 for scalar GLMs).
    ///
    /// At large scale with K ≈ 15, p ≈ 64, m = 2 the batched path is
    /// ≈ K·p²/(p² + K·m²) ≈ 15× cheaper.
    ///
    /// # Default
    ///
    /// Returns `Ok(None)`. The unified outer gradient evaluator falls back
    /// to its generic per-coordinate path. Families with row-coupled
    /// likelihoods (GAMLSS location-scale, marginal-slope) should override.
    ///
    /// Implementations may return `Ok(None)` for ψ-coordinates whose
    /// design-drift is too involved for a batched leverage form, letting
    /// the generic path handle those cases.
    fn batched_outer_gradient_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        rho: &Array1<f64>,
        options: &BlockwiseFitOptions,
        hessian_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    ) -> Result<Option<BatchedOuterGradientTerms>, String> {
        // Label handed to every argument check below.
        const CONTEXT: &str = "batched outer gradient terms";

        // Argument checks, in the same order as the parameters they cover.
        assert_valid_blockspecs(specs, CONTEXT);
        assert_states_match_specs(block_states, specs, CONTEXT);
        assert_derivative_blocks_match_specs(derivative_blocks, specs, CONTEXT);
        assert_rho_matches_specs(rho, specs, CONTEXT);
        assert_valid_options(options, CONTEXT);
        // Unlike the checks above, this one reports failure as `Err`.
        validate_hessian_workspace_ready(&hessian_workspace, CONTEXT)?;

        // Default: no batched terms are produced.
        Ok(None)
    }

    /// Optional batched analytic-Hessian / HVP hook.
    ///
    /// This is the Hessian-side analogue of
    /// [`Self::batched_outer_gradient_terms`]: families that can share a
    /// single factorization, row-leverage stream, or directional θθ kernel
    /// across all explicit outer-Hessian terms return the exact profiled
    /// Hessian here.  The evaluator uses this hook only for Hessian-capable
    /// families and only after the inner mode has been fitted; default
    /// `None` leaves unsupported families on their existing exact path.
    fn batched_outer_hessian_terms(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        rho: &Array1<f64>,
        hessian_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    ) -> Result<Option<BatchedOuterHessianTerms>, String> {
        // Label handed to every argument check below.
        const CONTEXT: &str = "batched outer Hessian terms";

        // Argument checks, in the same order as the parameters they cover.
        assert_valid_blockspecs(specs, CONTEXT);
        assert_states_match_specs(block_states, specs, CONTEXT);
        assert_derivative_blocks_match_specs(derivative_blocks, specs, CONTEXT);
        assert_rho_matches_specs(rho, specs, CONTEXT);
        // Unlike the checks above, this one reports failure as `Err`.
        validate_hessian_workspace_ready(&hessian_workspace, CONTEXT)?;

        // The result is `None` exactly when the family supplies no outer
        // Hessian operator; otherwise that operator is wrapped as the
        // batched Hessian result.
        let Some(operator) = self.outer_hyper_hessian_operator(specs) else {
            return Ok(None);
        };
        let outer_hessian = crate::solver::outer_strategy::HessianResult::Operator(operator);
        Ok(Some(BatchedOuterHessianTerms { outer_hessian }))
    }

    /// Explicit name for the inner coefficient-space Hessian HVP capability.
    ///
    /// Kept separate from outer hyper-Hessian capabilities so CTN/GAMLSS row
    /// operators do not accidentally advertise pairwise θθ calculus as cheap.
    fn inner_coefficient_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // `specs` is used only by this check.
        assert_valid_blockspecs(specs, "inner coefficient Hessian HVP availability");
        // Default: the capability is not advertised.
        false
    }

    /// Capability flag for the inner joint-workspace gradient; default `false`.
    ///
    /// NOTE(review): presumably reports whether the family's joint workspace
    /// can supply the inner gradient directly — confirm against the callers.
    fn inner_joint_workspace_gradient_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // `specs` is used only by this check.
        assert_valid_blockspecs(specs, "inner joint workspace gradient availability");
        false
    }

    /// Opt families in to the matrix-free inner-Newton/PCG path on top of the
    /// generic `use_joint_matrix_free_path` heuristic.
    ///
    /// `use_joint_matrix_free_path` is tuned for families with cheap per-row
    /// work where dense `O(n·p²)` assembly is itself the bottleneck and HVPs
    /// cost the same. Families with very expensive per-row work (e.g. BMS flex
    /// streaming cell partitions + flex-jet evaluations per row) can override
    /// this to force the operator path even at moderate `p`, because each HVP
    /// reuses the row stream once and PCG converges in a handful of iters.
    /// Default `false` keeps the heuristic untouched for everyone else.
    fn prefers_matrix_free_inner_joint(
        &self,
        specs: &[ParameterBlockSpec],
        states: &[ParameterBlockState],
    ) -> bool {
        // Both arguments are used only by the two checks below.
        assert_valid_blockspecs(specs, "matrix-free inner-joint preference");
        // NOTE(review): looks like a parameter-use guard rather than real
        // validation — confirm intent.
        assert!(states.len() <= isize::MAX as usize);
        // Default: no preference expressed; the generic heuristic decides.
        false
    }

    /// Capability flag for the inner joint-workspace log-likelihood; default
    /// `false`.
    ///
    /// NOTE(review): presumably reports whether the family's joint workspace
    /// can evaluate the log-likelihood directly — confirm against the callers.
    fn inner_joint_workspace_log_likelihood_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // `specs` is used only by this check.
        assert_valid_blockspecs(specs, "inner joint workspace log-likelihood availability");
        false
    }

    /// True only when the family has a real profiled outer Hessian-vector
    /// product over θ = (ρ, ψ), without enumerating all θ_i θ_j pairs.
    fn outer_hyper_hessian_hvp_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // `specs` is used only by this check.
        assert_valid_blockspecs(specs, "outer hyper-Hessian HVP availability");
        // Default: no outer Hessian-vector product is advertised.
        false
    }

    /// True when the family can expose the dense profiled outer Hessian.
    /// Generic custom-family pairwise derivative paths default to dense
    /// availability; families with only inner HVP support should override this
    /// if dense θθ assembly is not a valid capability for their path.
    fn outer_hyper_hessian_dense_available(&self, specs: &[ParameterBlockSpec]) -> bool {
        // `specs` is used only by this check.
        assert_valid_blockspecs(specs, "outer hyper-Hessian dense availability");
        // Default: dense availability is advertised (note this is the one
        // capability flag in this group that defaults to `true`).
        true
    }

    /// Family-supplied exact outer Hessian operator over θ = (ρ, ψ).
    ///
    /// When a family can produce the full profiled outer Hessian as a
    /// matrix-free Hv operator — using its own directional θθ kernels and
    /// trace algebra rather than the generic per-pair enumeration — it
    /// overrides this method and returns `Some(op)`.  The unified REML/LAML
    /// evaluator wires the operator into [`HessianResult::Operator`] via
    /// the [`HessianDerivativeProvider::family_outer_hessian_operator`] hook
    /// the family installs on its provider; consumers see a generic
    /// `Arc<dyn OuterHessianOperator>` (matvec / dim / mul_mat /
    /// is_cheap_to_materialize).
    ///
    /// Default returns `None`, leaving the family on the existing pairwise
    /// assembly path.  This is the architectural contract for CTN, survival
    /// (Gompertz-Makeham + timewiggle), GAMLSS location-scale, and
    /// Bernoulli marginal-slope families to plug their directional
    /// outer-HVP operators into the same surface.
    fn outer_hyper_hessian_operator(
        &self,
        specs: &[ParameterBlockSpec],
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        // `specs` is used only by this check.
        assert_valid_blockspecs(specs, "outer hyper-Hessian operator");
        // Default: no family-supplied operator.
        None
    }

    /// Optional spec-aware exact joint Hessian.
    ///
    /// This hook exists because the outer hyper-derivative code works from the
    /// realized block specs, while some family instances may or may not cache
    /// those realized designs internally.
    ///
    /// The profiled/Laplace outer objective used here is
    ///
    ///   J(theta)
    ///   = V(beta(theta), theta)
    ///     + 0.5 log|H(beta(theta), theta)|
    ///     - 0.5 log|S(theta)|_+,
    ///
    /// evaluated at the fitted inner mode defined by
    ///
    ///   F(beta, theta) := D_beta V(beta, theta) = 0,
    ///   H(beta, theta) := F_beta(beta, theta) = H_L(beta, theta) + S(theta).
    ///
    /// For pure rho directions on families whose likelihood has no explicit
    /// rho-dependence, the fixed-beta forcing is
    ///
    ///   g_k := F_{rho_k} = A_k beta,
    ///   A_k := dS/drho_k.
    ///
    /// Differentiating stationarity gives the exact joint mode response
    ///
    ///   H u_k = -g_k,
    ///   u_k = d beta / d rho_k.
    ///
    /// Even if `A_k` is supported in only one penalty block, the solve for
    /// `u_k` must use the full joint Hessian `H`, because the likelihood can
    /// couple blocks through off-diagonal curvature. The first outer
    /// derivative is then
    ///
    ///   dJ/drho_k
    ///   = 0.5 beta^T A_k beta
    ///     + 0.5 tr(H^{-1}(A_k + D_beta H_L[u_k]))
    ///     - 0.5 tr(S^+ A_k),
    ///
    /// and when psi moves realized penalties the same spec-aware hook must be
    /// able to reconstruct H(beta, theta), D_beta H[u], and D_beta^2 H[u, v]
    /// from the current realized specs so the generic joint assembler can form
    ///
    ///   dot H_i  = H_i + D_beta H[beta_i],
    ///   ddot H_ij
    ///   = H_ij + T_i[beta_j] + T_j[beta_i]
    ///     + D_beta H[beta_ij] + D_beta^2 H[beta_i, beta_j].
    ///
    /// Families such as binomial location-scale with
    ///
    ///   q = -eta_t exp(-eta_ls)
    ///
    /// have exactly that coupled structure: the penalty forcing is block-local
    /// but the fitted mode response and the resulting `D_beta H_L[u_k]` drift
    /// are joint objects. If the realized `specs` already contain the designs
    /// needed to build those objects, the outer code should use them directly
    /// rather than falling back to a weaker blockwise surrogate just because
    /// the family instance itself did not cache the same designs.
    ///
    /// The default implementation delegates to `exact_newton_joint_hessian`.
    ///
    /// For multi-block families, the working-set fallback only fires when the
    /// family has explicitly declared its blocks are uncoupled in the
    /// likelihood Hessian via `likelihood_blocks_uncoupled() = true`.  This
    /// is critical: `exact_newton_joint_hessian_from_working_sets` produces a
    /// strictly block-diagonal joint Hessian, which silently drops cross-block
    /// `∂²L/∂β_a∂β_b` terms for coupled likelihoods (GAMLSS μ-σ, marginal
    /// slope, survival location-scale, etc.).  Default `false` ⇒ multi-block
    /// custom families must override `exact_newton_joint_hessian` (or
    /// `exact_newton_outer_curvature`) and the higher layer surfaces a loud
    /// "joint outer path required" error rather than silently using
    /// block-diagonal curvature.
    fn exact_newton_joint_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        // Three-way dispatch over the source of the joint Hessian, written as
        // guard clauses; the first matching case returns.

        // Case 1 — single block, or the family declared
        // `likelihood_blocks_uncoupled`: no cross-block coupling exists, so a
        // block-diagonal matrix IS exact and the working-set assembly is a
        // valid fallback when the family's own joint Hessian is `None`.
        let block_diagonal_is_exact = specs.len() <= 1 || self.likelihood_blocks_uncoupled();
        if block_diagonal_is_exact {
            if let Some(family_hessian) = self.exact_newton_joint_hessian(block_states)? {
                return Ok(Some(family_hessian));
            }
            return exact_newton_joint_hessian_from_working_sets(self, block_states, specs);
        }

        // Case 2 — multi-block coupled with `has_explicit_joint_hessian = true`:
        // the family override is the only trusted joint Hessian. If it returns
        // `None` (e.g. the dense form is too large for memory at large scale),
        // propagate `None`. Substituting the working-set block-diagonal would
        // silently drop the cross-block ∂²L/∂β_a∂β_b curvature that only the
        // family can provide — exactly the corruption this gate prevents.
        if self.has_explicit_joint_hessian() {
            return self.exact_newton_joint_hessian(block_states);
        }

        // Case 3 — multi-block coupled family that did NOT set the explicit
        // marker. The marker exists because the trait cannot reflect on
        // whether `exact_newton_joint_hessian` was overridden: its *default*
        // impl assembles a strictly block-diagonal matrix from per-block exact
        // blocks, which would silently drop cross-block curvature for a
        // coupled likelihood. The marker is not the only signal, though: a
        // genuine coupled override yields *nonzero off-diagonal blocks*, which
        // the block-diagonal default can never produce. Detect that
        // structurally and trust it. A block-diagonal result is
        // indistinguishable from the default for a coupled family, so it stays
        // gated to `None`, which lets the multi-block error surface upstream.
        let candidate = self.exact_newton_joint_hessian(block_states)?;
        Ok(candidate
            .filter(|hessian| joint_hessian_has_cross_block_coupling(hessian, block_states)))
    }

    /// Structural-coupling probe shared by the `_with_specs` joint dispatch
    /// gates: is the family's `exact_newton_joint_hessian` a genuinely coupled
    /// matrix (nonzero off-diagonal blocks), as opposed to the trait's
    /// block-diagonal default? This is the marker-free signal that lets the
    /// engine trust a coupled multi-block family that overrode the joint
    /// Hessian without hand-setting `has_explicit_joint_hessian()`. Returns
    /// `false` when no joint Hessian is available or it is block-diagonal.
    fn joint_hessian_is_structurally_coupled(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<bool, String> {
        // An `Err` from the joint Hessian hook is propagated as-is.
        let joint = self.exact_newton_joint_hessian(block_states)?;
        // No Hessian at all counts as "not coupled"; otherwise defer to the
        // cross-block coupling probe.
        let coupled = joint
            .as_ref()
            .is_some_and(|hessian| joint_hessian_has_cross_block_coupling(hessian, block_states));
        Ok(coupled)
    }

    /// Whether the family's log-likelihood Hessian is block-diagonal in the
    /// joint coefficient vector — i.e. `∂²L/∂β_a∂β_b = 0` for every pair of
    /// distinct blocks `a ≠ b`.  Default `false` (assume coupling, the safe
    /// answer); families whose blocks share no η/W coupling override to
    /// `true` to opt into the default working-set joint-Hessian assembly for
    /// multi-block specs.
    fn likelihood_blocks_uncoupled(&self) -> bool {
        // Conservative default: assume the blocks are coupled.
        false
    }

    /// Whether the family has an explicit override of `exact_newton_joint_hessian`
    /// (or its `_with_specs` variant) that returns the *true* coupled joint
    /// Hessian rather than the trait's block-diagonal default.
    ///
    /// Default `false`.  Production families that override
    /// `exact_newton_joint_hessian` with their analytic coupled curvature must
    /// set this to `true` so the outer-REML path can trust the override
    /// downstream of `exact_newton_joint_hessian_with_specs`.  The trait can't
    /// detect override status by reflection, so this marker is the contract
    /// signal.
    fn has_explicit_joint_hessian(&self) -> bool {
        // Default: no explicit coupled override is claimed.
        false
    }

    /// Whether the family's inner/outer solves need the full-span Jeffreys
    /// curvature `H_Φ` and score `∇Φ`.
    ///
    /// Default `true` to preserve the existing separation/near-singular
    /// robustness on every family the term was historically armed for
    /// (probit/binomial, GAMLSS location-scale, BMS, survival marginal-slope).
    ///
    /// A family overrides this to `false` when it has no
    /// separation/under-identification regime by construction — the
    /// canonical case is a continuous-response monotone-transformation
    /// family like `TransformationNormalFamily`, where the Fisher information
    /// is `O(n)` on every identified direction at every working point and
    /// the Jeffreys gate would always smooth-step to zero anyway. There the
    /// term is pure overhead: each evaluation runs `p` directional
    /// derivatives of the joint Hessian (`O(n·p²)` per call for the SCOP
    /// directional derivative), called multiple times per inner cycle and
    /// once per outer evaluation. At large scale (`p=144`, `n=20000`) the
    /// overhead is the dominant per-cycle cost and exhausts the CI budget
    /// long before the inner Newton converges, while contributing
    /// essentially zero to the converged gradient and curvature.
    fn joint_jeffreys_term_required(&self) -> bool {
        // Default: the Jeffreys term stays armed; families opt out explicitly.
        true
    }

    /// Whether the coupled-joint inner Newton should engage its self-vanishing
    /// Levenberg–Marquardt damping `μ` on a FULL-RANK-but-ILL-CONDITIONED
    /// penalized Hessian (cond > `COND_NEWTON_SAFETY`), not only on a
    /// rank-deficient one (`nullity > 0`). Default `false` (binary / AFT /
    /// others byte-identical). Survival marginal-slope overrides to `true`
    /// (#808: full-rank but cond ≈ 5.8e6; the self-vanishing μ shapes only the
    /// trajectory, so the converged β is unbiased and the log-slope target is
    /// preserved). Survival-local by trait override so the shared spectral-range
    /// solver stays byte-identical for every other family — in particular AFT
    /// (`survival_location_scale`), whose intercept-only-scale fits can be
    /// high-cond and which a shared (unconditional) gate would regress (#735/#736).
    fn levenberg_on_ill_conditioning(&self) -> bool {
        // Default: ill-conditioning alone does not engage the damping.
        false
    }

    /// Internal helper: do the outer-REML `_with_specs` defaults trust the
    /// inner-fit's block-diagonal-from-blocks output for this family?
    ///
    /// Trustworthy iff:
    /// - single-block (no cross-block coupling possible), or
    /// - the family has declared its blocks uncoupled in the likelihood
    ///   Hessian (`likelihood_blocks_uncoupled` ⇒ block-diagonal IS exact),
    ///   or
    /// - the family has an explicit joint-Hessian override
    ///   (`has_explicit_joint_hessian` ⇒ what we receive from
    ///   `exact_newton_joint_hessian` is the true coupled Hessian, not the
    ///   block-diagonal default).
    fn outer_default_trustworthy_for_joint_hessian(&self, specs: &[ParameterBlockSpec]) -> bool {
        // Zero or one block: no cross-block coupling is possible.
        if specs.len() <= 1 {
            return true;
        }
        // Otherwise trust requires an explicit family declaration; the
        // uncoupled flag is consulted first and short-circuits the second.
        self.likelihood_blocks_uncoupled() || self.has_explicit_joint_hessian()
    }

    /// Optional scale-aware exact joint curvature for the outer REML calculus.
    ///
    /// Families whose exact derivatives can overflow may return a uniformly
    /// rescaled Hessian together with the metadata needed to keep every outer
    /// path consistent:
    ///
    /// - `hessian`: the scale-stabilized unpenalized joint Hessian
    /// - `rho_curvature_scale`: the uniform factor applied to every ρ-driven
    ///   penalty Hessian derivative in H-dependent trace / solve terms
    /// - `hessian_logdet_correction`: the additive correction needed to recover
    ///   `log|H_exact|` from `log|H_scaled|`
    ///
    /// The scale is evaluation-local metadata: callers must use the same
    /// factor for `H`, `dH`, `d²H`, and penalized trace operators within that
    /// evaluation, but they do not differentiate the scale itself.
    ///
    /// Families overriding this must also make
    /// `exact_newton_outer_curvature_directional_derivative[_with_specs]` and
    /// `exact_newton_outer_curvature_second_directional_derivative[_with_specs]`
    /// return derivatives in that same scaled curvature space.
    fn exact_newton_outer_curvature(
        &self,
        block_states: &[ParameterBlockState],
    ) -> Result<Option<ExactNewtonOuterCurvature>, String> {
        // NOTE(review): this bound appears to exist only to reference
        // `block_states` in the default body. A slice of non-zero-sized
        // elements can never hold more than `isize::MAX` entries, so the
        // check should be unreachable (assuming `ParameterBlockState` is not
        // zero-sized) — confirm before simplifying.
        assert!(block_states.len() <= isize::MAX as usize);
        // Default: the family supplies no scale-aware outer curvature.
        Ok(None)
    }

    /// Optional first directional derivative matching
    /// `exact_newton_outer_curvature`.
    fn exact_newton_outer_curvature_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Default: forward to the joint-Hessian directional derivative and
        // hand its result (or error) back unchanged.
        let joint_drift =
            self.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)?;
        Ok(joint_drift)
    }

    /// Spec-aware variant of `exact_newton_outer_curvature_directional_derivative`.
    fn exact_newton_outer_curvature_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // The default ignores the specs beyond this length bound and simply
        // reuses the spec-free hook.
        assert!(block_specs.len() <= isize::MAX as usize);
        let spec_free =
            self.exact_newton_outer_curvature_directional_derivative(block_states, d_beta_flat)?;
        Ok(spec_free)
    }

    /// Optional second directional derivative matching
    /// `exact_newton_outer_curvature`.
    fn exact_newton_outer_curvature_second_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Default: forward to the joint-Hessian second directional derivative
        // with the two directions in the order received.
        let joint_second_drift = self.exact_newton_joint_hessiansecond_directional_derivative(
            block_states,
            d_beta_u_flat,
            d_beta_v_flat,
        )?;
        Ok(joint_second_drift)
    }

    /// Spec-aware variant of `exact_newton_outer_curvature_second_directional_derivative`.
    fn exact_newton_outer_curvature_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_beta_v_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // The default ignores the specs beyond this length bound and simply
        // reuses the spec-free hook.
        assert!(block_specs.len() <= isize::MAX as usize);
        let spec_free = self.exact_newton_outer_curvature_second_directional_derivative(
            block_states,
            d_beta_u_flat,
            d_beta_v_flat,
        )?;
        Ok(spec_free)
    }

    /// Optional spec-aware exact first directional derivative of the joint Hessian.
    ///
    /// This is the spec-aware analogue of
    /// `exact_newton_joint_hessian_directional_derivative`. It returns the
    /// exact joint likelihood-curvature drift
    ///
    ///   D_beta H_L[u],
    ///
    /// for a flattened coefficient-space direction `u`. In the profiled
    /// Laplace gradient this appears after solving the exact joint mode
    /// response
    ///
    ///   H u_k = -A_k beta,
    ///   dot H_k = A_k + D_beta H_L[u_k].
    ///
    /// Families that can reconstruct the exact joint geometry from `specs`
    /// should override this alongside
    /// `exact_newton_joint_hessian_with_specs`.
    fn exact_newton_joint_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Trust dispatch, in three tiers. The blockwise defaults assemble a
        // block-diagonal `D_β H[u]`, which omits the cross-block rows that
        // coupled families need, so they are only used when that is exact.

        // Tier 1: single block or declared-uncoupled likelihood. Prefer the
        // family's own derivative and fall back to the working-set
        // construction when the family returns `None`.
        let blockwise_is_exact = specs.len() <= 1 || self.likelihood_blocks_uncoupled();
        if blockwise_is_exact {
            let family_drift =
                self.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)?;
            if family_drift.is_some() {
                return Ok(family_drift);
            }
            return exact_newton_joint_hessian_directional_derivative_from_working_sets(
                self,
                block_states,
                specs,
                d_beta_flat,
            );
        }

        // Tier 2: coupled family, either explicitly marked or detected by the
        // structural probe (consulted only when the marker is absent). Its
        // own directional derivative is taken as the cross-block `D_β H[u]`.
        let family_is_trusted = self.has_explicit_joint_hessian()
            || self.joint_hessian_is_structurally_coupled(block_states)?;
        if !family_is_trusted {
            // Tier 3: nothing trustworthy to report.
            return Ok(None);
        }
        self.exact_newton_joint_hessian_directional_derivative(block_states, d_beta_flat)
    }

    /// Optional spec-aware exact second directional derivative of the joint Hessian.
    ///
    /// This is the spec-aware analogue of
    /// `exact_newton_joint_hessiansecond_directional_derivative`. For
    /// rho/rho outer Hessian entries it supplies the exact joint second-order
    /// likelihood-curvature drift
    ///
    ///   D_beta^2 H_L[u_l, u_k],
    ///
    /// which combines with
    ///
    ///   dot H_k = A_k + D_beta H_L[u_k]
    ///
    /// and the second mode response
    ///
    ///   H u_{k,l}
    ///   = -(A_k u_l + A_l u_k + B_{k,l} beta + D_beta H_L[u_l] u_k)
    ///
    /// to form
    ///
    ///   ddot H_{k,l}
    ///   = B_{k,l} + D_beta H_L[u_{k,l}] + D_beta^2 H_L[u_l, u_k].
    fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Trust gate shared with the Hessian and first-derivative paths. The
        // delegate's default is block-diagonal-from-blocks, which is wrong
        // for outer trace assembly on coupled families. There is no
        // working-sets fallback at this order, so one predicate covers both
        // trusted cases: the marker-based check, supplemented by the
        // marker-free structural probe (evaluated only when the marker-based
        // check fails) so auto-routed coupled families are treated the same
        // way at every derivative order.
        let delegate_is_trusted = self.outer_default_trustworthy_for_joint_hessian(specs)
            || self.joint_hessian_is_structurally_coupled(block_states)?;
        if delegate_is_trusted {
            self.exact_newton_joint_hessiansecond_directional_derivative(
                block_states,
                d_beta_u_flat,
                d_betav_flat,
            )
        } else {
            Ok(None)
        }
    }

    /// Optional joint multi-block outer-hyper surrogate Hessian over the
    /// flattened coefficient vector.
    ///
    /// This hook exists for families whose inner working representation is
    /// block-diagonal/diagonal in `evaluate(...)`, but whose outer profiled
    /// smoothing derivatives are still joint because the fitted mode response
    /// couples blocks. The generic blockwise outer-hyper surrogate only sees
    /// per-block working sets, so it cannot recover missing cross-block
    /// curvature on its own.
    ///
    /// Families that can construct a mathematically valid joint surrogate
    /// `H_L(beta)` for the current realized `specs` may override this and the
    /// two directional derivative hooks below. Generic code then reuses the
    /// same joint rho-calculus as the exact path, but on the family-supplied
    /// surrogate curvature instead of the exact Newton Hessian.
    ///
    /// Default behavior is to reuse the spec-aware exact joint curvature when
    /// the family already provides it. That is the mathematically correct
    /// repair for the old broken multi-block blockwise surrogate path: if the
    /// family knows the full coupled Hessian and its beta-drifts, generic code
    /// should use that joint information instead of pretending per-block
    /// working sets are enough.
    fn joint_outer_hyper_surrogate_hessian_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
    ) -> Result<Option<Array2<f64>>, String> {
        // Default surrogate: whatever the spec-aware exact joint Hessian hook
        // reports, passed through unchanged.
        let exact_joint = self.exact_newton_joint_hessian_with_specs(block_states, specs)?;
        Ok(exact_joint)
    }

    /// Optional first beta-directional derivative of the joint surrogate
    /// outer-hyper Hessian.
    fn joint_outer_hyper_surrogate_hessian_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Default surrogate drift: the spec-aware exact joint-Hessian
        // directional derivative, passed through unchanged.
        let exact_drift = self.exact_newton_joint_hessian_directional_derivative_with_specs(
            block_states,
            specs,
            d_beta_flat,
        )?;
        Ok(exact_drift)
    }

    /// Optional second beta-directional derivative of the joint surrogate
    /// outer-hyper Hessian.
    fn joint_outer_hyper_surrogate_hessian_second_directional_derivative_with_specs(
        &self,
        block_states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        d_beta_u_flat: &Array1<f64>,
        d_betav_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // Default surrogate second drift: the spec-aware exact joint-Hessian
        // second directional derivative, passed through unchanged.
        let drift = self.exact_newton_joint_hessian_second_directional_derivative_with_specs(
            block_states,
            specs,
            d_beta_u_flat,
            d_betav_flat,
        )?;
        Ok(drift)
    }

    /// Optional exact directional derivative of diagonal working weights along
    /// a predictor-space direction `d_eta` for `BlockWorkingSet::Diagonal`.
    ///
    /// This callback supplies the `dw` term in
    ///
    ///   D_beta J[u] = X^T diag(dw) X
    ///
    /// for diagonal working-set blocks with
    ///
    ///   J = X^T W X + S.
    ///
    /// Default `None` means no exact working-weight directional derivative is
    /// available. Exact REML/LAML derivative paths should not silently replace
    /// this with zero unless the family truly has constant working weights.
    fn diagonalworking_weights_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        // NOTE(review): judging by the doc comment, `arr` is presumably the
        // predictor-space direction `d_eta` and `idx` the block index —
        // confirm against the call sites.
        //
        // These checks are the only use of the arguments in the default body.
        // The length bound should be unreachable for a slice of
        // non-zero-sized elements, and the index bound fails only for
        // `idx == usize::MAX`.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        // One linear pass over `arr`; a NaN entry panics here instead of
        // surfacing through the `Result` return type.
        assert!(arr.iter().all(|v| !v.is_nan()));
        // Default: no exact working-weight directional derivative available.
        Ok(None)
    }

    /// Optional exact second directional derivative of diagonal working weights.
    ///
    /// This callback supplies the `d²w` term for static-design single-block
    /// generic fallback Hessian drift:
    ///
    ///   D²_beta H_L[u, v] = X^T diag(D²w[D eta_u, D eta_v]) X.
    ///
    /// Families with coefficient-dependent block geometry must use an exact
    /// Newton Hessian path or a joint outer path until second-order geometry
    /// hooks are available; the generic diagonal fallback will reject nonzero
    /// first-order geometry while building `d²H`.
    fn diagonalworking_weights_second_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        idx: usize,
        arr: &Array1<f64>,
        arr2: &Array1<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        // NOTE(review): judging by the doc comment, `arr` / `arr2` are
        // presumably the two predictor-space directions and `idx` the block
        // index — confirm against the call sites.
        //
        // These checks are the only use of the arguments in the default body.
        // The length bound should be unreachable for a slice of
        // non-zero-sized elements, and the index bound fails only for
        // `idx == usize::MAX`.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        // One linear pass per direction; a NaN entry panics here instead of
        // surfacing through the `Result` return type.
        assert!(arr.iter().all(|v| !v.is_nan()));
        assert!(arr2.iter().all(|v| !v.is_nan()));
        // Default: no exact second directional derivative available.
        Ok(None)
    }

    /// Optional exact first-order joint psi terms over the flattened
    /// coefficient vector.
    ///
    /// Families with coupled exact-joint curvature must provide psi objects in
    /// the same flattened coefficient space used by the existing joint Hessian
    /// hooks:
    ///
    ///   objective_psi = V_psi^explicit,
    ///   score_psi     = g_psi^explicit,
    ///   hessian_psi   = H_psi^explicit.
    ///
    /// Generic code then adds the realized penalty surface, solves
    ///
    ///   beta_i = -H^{-1} g_i,
    ///
    /// forms
    ///
    ///   dot H_i = H_i + D_beta H[beta_i],
    ///
    /// and plugs those objects into the unified profiled/Laplace gradient
    ///
    ///   J_i = V_i + 0.5 tr(H^{-1} dot H_i) - 0.5 partial_i log|S(theta)|_+.
    ///
    /// The current block-local exact-Newton psi hooks are not sufficient for a
    /// full joint hyper Hessian on coupled families; joint exact-joint hyper
    /// evaluation must use this flattened-coefficient hook instead.
    fn exact_newton_joint_psi_terms(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        idx: usize,
    ) -> Result<Option<ExactNewtonJointPsiTerms>, String> {
        // NOTE(review): `idx` presumably selects the psi coordinate — confirm
        // against the call sites.
        //
        // These checks are the only use of the arguments in the default body.
        // The length bounds should be unreachable for slices of
        // non-zero-sized elements, and the index bound fails only for
        // `idx == usize::MAX`.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        assert!(derivative_blocks.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        // Default: the family supplies no first-order joint psi terms.
        Ok(None)
    }

    /// Optional exact second-order joint psi terms over the flattened
    /// coefficient vector.
    ///
    /// For two outer coordinates theta_i, theta_j the exact profiled/Laplace
    /// Hessian uses fixed-beta second partials
    ///
    ///   V_{ij}^explicit, g_{ij}^explicit, H_{ij}^explicit.
    ///
    /// For psi/psi blocks this callback returns those explicit family terms in
    /// flattened coefficient coordinates. Generic code adds penalty
    /// contributions and profile/Laplace corrections.
    fn exact_newton_joint_psisecond_order_terms(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        idx: usize,
        idx2: usize,
    ) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String> {
        // NOTE(review): `idx` / `idx2` presumably select the two psi
        // coordinates of the second-order term — confirm against the call
        // sites.
        //
        // These checks are the only use of the arguments in the default body.
        // The length bounds should be unreachable for slices of
        // non-zero-sized elements, and each index bound fails only for
        // `usize::MAX`.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        assert!(derivative_blocks.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        assert!(idx2 < usize::MAX);
        // Default: the family supplies no second-order joint psi terms.
        Ok(None)
    }

    /// Optional per-evaluation workspace for exact joint ψ derivatives.
    ///
    /// Families with expensive exact ψ calculus can override this hook to
    /// precompute shared state once per outer evaluation and serve:
    ///
    /// - exact fixed-β ψψ second-order terms, and
    /// - exact mixed β/ψ Hessian drifts `D_β H_ψ[u]`
    ///
    /// from one cached workspace. Generic code falls back to the direct hooks
    /// above when no workspace is provided.
    fn exact_newton_joint_psi_workspace(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        // These checks are the only use of the arguments in the default body.
        // NOTE(review): they should be unreachable for slices of
        // non-zero-sized elements — confirm before simplifying.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        assert!(derivative_blocks.len() <= isize::MAX as usize);
        // Default: no cached workspace is provided.
        Ok(None)
    }

    /// Outer-aware variant of `exact_newton_joint_psi_workspace`.
    ///
    /// Families that consume the optional outer-only stratified row subsample
    /// (`options.outer_score_subsample`) override this method so the workspace
    /// can be constructed with the subsample mask attached. Generic families
    /// can stick with the default implementation, which simply forwards to
    /// the legacy no-options method and ignores the options. This keeps full
    /// backward compatibility with existing implementors while letting the
    /// marginal-slope families thread the subsample down into the cached
    /// per-evaluation ψ calculus.
    fn exact_newton_joint_psi_workspace_with_options(
        &self,
        states: &[ParameterBlockState],
        specs: &[ParameterBlockSpec],
        derivs: &[Vec<CustomFamilyBlockPsiDerivative>],
        options: &BlockwiseFitOptions,
    ) -> Result<Option<Arc<dyn ExactNewtonJointPsiWorkspace>>, String> {
        // The options are checked, then otherwise unused: the default builds
        // the workspace through the options-free hook.
        assert_valid_options(options, "exact Newton joint psi workspace");
        let workspace = self.exact_newton_joint_psi_workspace(states, specs, derivs)?;
        Ok(workspace)
    }

    /// Whether the family's exact joint ψ workspace should also be built for
    /// first-order ψ terms during outer gradient evaluation.
    ///
    /// Default `false` avoids forcing every family to pay workspace setup cost
    /// on gradient-only outer evaluations. Families with expensive shared state
    /// that is reused by both first- and second-order ψ calculus can opt in.
    fn exact_newton_joint_psi_workspace_for_first_order_terms(&self) -> bool {
        // Opt-in hook: the default reports that the workspace is not wanted
        // for first-order ψ terms.
        false
    }

    /// Optional mixed beta/psi Hessian drift D_beta H_psi[u].
    ///
    /// This is the missing T_i[u] object in the full exact joint profiled
    /// Hessian:
    ///
    ///   ddot H_{ij}
    ///   = H_{ij}
    ///     + D_beta H_i[beta_j]
    ///     + D_beta H_j[beta_i]
    ///     + D_beta H[beta_{ij}]
    ///     + D_beta^2 H[beta_i, beta_j].
    ///
    /// For i = psi_a this hook supplies D_beta H_{psi_a}[u].
    ///
    /// This direct hook is dense-only. Families that can keep the drift in an
    /// operator-backed or block-local form should expose it through
    /// `exact_newton_joint_psi_workspace()` instead.
    fn exact_newton_joint_psihessian_directional_derivative(
        &self,
        block_states: &[ParameterBlockState],
        block_specs: &[ParameterBlockSpec],
        derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
        idx: usize,
        arr: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        // NOTE(review): judging by the doc comment, `idx` presumably selects
        // the psi coordinate and `arr` is the coefficient-space direction `u`
        // — confirm against the call sites.
        //
        // These checks are the only use of the arguments in the default body.
        // The length bounds should be unreachable for slices of
        // non-zero-sized elements, and the index bound fails only for
        // `idx == usize::MAX`.
        assert!(block_states.len() <= isize::MAX as usize);
        assert!(block_specs.len() <= isize::MAX as usize);
        assert!(derivative_blocks.len() <= isize::MAX as usize);
        assert!(idx < usize::MAX);
        // One linear pass over `arr`; a NaN entry panics here instead of
        // surfacing through the `Result` return type.
        assert!(arr.iter().all(|v| !v.is_nan()));
        // Default: the family supplies no dense mixed β/ψ Hessian drift.
        Ok(None)
    }

    /// How the penalized Hessian's log-determinant and its derivatives
    /// should handle eigenvalues below the numerical-stability floor.
    ///
    /// See [`PseudoLogdetMode`].  Default: `Smooth`, the stable choice for
    /// full-rank Hessians.  Families whose model structure carries a
    /// numerical null-space direction — e.g. multi-block GAMLSS wiggle
    /// models where `q = q_0 + B(q_0)^⊤ β_w` is not identified from a
    /// threshold shift — should override to `HardPseudo` so the null
    /// direction drops out of both the REML cost and its gradient
    /// consistently, rather than leaking a spurious first-order
    /// contribution through the eigensolver's arbitrary choice of basis
    /// inside the null space.
    fn pseudo_logdet_mode(&self) -> PseudoLogdetMode {
        // Trait default; a family must override this method to select a
        // different mode.
        PseudoLogdetMode::Smooth
    }
}

/// Tag saying which kind of evaluation an outer-evaluation context belongs to.
///
/// A real outer derivative evaluation may let auto-subsample install a fresh
/// stratified mask and emit phase prints. An inner coefficient line-search
/// trial must reuse the outer row measure, so auto-subsample has to stay
/// disabled there.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EvalScope {
    /// A genuine outer derivative evaluation. ρ has advanced, and the
    /// auto-subsample install paths may build or refresh a mask keyed on
    /// this ρ.
    OuterDerivative,
    /// An inner coefficient trial (joint-Newton / line-search) at a fixed
    /// outer ρ. The row measure must match the surrounding outer
    /// evaluation exactly, so no fresh auto-subsample mask may be installed.
    InnerCoefficient,
}

/// Context published by the outer smoothing optimizer for every
/// downstream family evaluation. Carries the current outer ρ and a
/// monotonic per-outer-eval id alongside the [`EvalScope`] tag used to
/// gate auto-subsample installation. See the
/// [`BlockwiseFitOptions::outer_eval_context`] field doc for the bug
/// this prevents.
#[derive(Clone, Debug)]
pub struct OuterEvalContext {
    /// Current outer ρ vector; held behind an `Arc` so cloning the context
    /// does not copy the array.
    pub rho: Arc<Array1<f64>>,
    /// Monotonic identifier of the outer evaluation this context belongs to.
    pub eval_id: usize,
    /// Whether this is a real outer derivative evaluation or an inner
    /// coefficient trial.
    pub scope: EvalScope,
}

/// Options for a blockwise custom-family fit.
///
/// Bundles the inner-solve and outer smoothing-optimizer budgets and
/// tolerances, the ridge handling, the optional outer-score subsampling
/// controls, the warm-start cache sessions, the cross-block penalties, and
/// the seed-screening switches. The `Default` impl supplies the baseline
/// configuration; `Clone` is derived because the options are duplicated per
/// evaluation.
#[derive(Clone)]
pub struct BlockwiseFitOptions {
    /// Cap on inner-solve cycles. Defaults to
    /// [`DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES`].
    pub inner_max_cycles: usize,
    /// Tolerance for the inner solve (default `1e-6`).
    /// NOTE(review): the exact quantity compared against this tolerance is
    /// not visible from this declaration — confirm at the use sites.
    pub inner_tol: f64,
    /// Iteration cap for the outer smoothing optimizer (default `60`).
    /// NOTE(review): semantics inferred from the name — confirm.
    pub outer_max_iter: usize,
    /// Tolerance for the outer smoothing optimizer (default `1e-5`).
    pub outer_tol: f64,
    /// Weight floor; defaults to `CUSTOM_FAMILY_WEIGHT_FLOOR`.
    /// NOTE(review): presumably applied to working weights — confirm.
    pub minweight: f64,
    /// Ridge magnitude; defaults to `CUSTOM_FAMILY_RIDGE_FLOOR`. Which
    /// terms it enters is decided by `ridge_policy`.
    pub ridge_floor: f64,
    /// Shared ridge semantics used by solve/quadratic/logdet terms.
    pub ridge_policy: RidgePolicy,
    /// If true, outer smoothing optimization uses a Laplace/REML-style objective:
    ///   -loglik + penalty + 0.5(log|H| - log|S|_+)
    /// where H is blockwise working curvature and S is blockwise penalty.
    pub use_remlobjective: bool,
    /// If false, the outer smoothing optimizer uses exact gradients but does
    /// not request an analytic outer Hessian from the family.
    pub use_outer_hessian: bool,
    /// If false, skip post-fit joint covariance assembly.
    pub compute_covariance: bool,
    /// Shared cap engaged during seed screening so cost-only evaluations can
    /// stop inner iterations early without affecting the full solve.
    pub screening_max_inner_iterations: Option<Arc<AtomicUsize>>,
    /// Shared cap engaged during regular outer iterations. Unlike screening,
    /// this is only a budget: capped solves still have to earn the ordinary
    /// KKT certificate before derivatives may be exposed.
    pub outer_inner_max_iterations: Option<Arc<AtomicUsize>>,
    /// Optional line-search objective ceiling for lazy log-likelihood-only
    /// evaluations. Families whose per-row log-likelihood contributions are
    /// non-positive may stop once the partial negative log-likelihood is already
    /// above this ceiling, because the unvisited rows cannot improve the trial
    /// objective enough to be accepted. Default `None` preserves exact full-sum
    /// behavior and is the only mode used outside backtracking rejection tests.
    pub early_exit_threshold: Option<f64>,
    /// Stable public API for installing outer-score subsampling.
    ///
    /// Optional stratified row subsample used by outer-only score/gradient
    /// passes. When `Some(s)`, outer score/gradient hot loops should iterate
    /// only over `s.rows` and multiply each contribution by that row's
    /// Horvitz-Thompson inverse-inclusion weight. Inner-PIRLS and final
    /// covariance passes always run on the full data, so this field is
    /// consulted only by outer-only call sites. Default `None` preserves the
    /// full-data behavior. Wrapping in `Arc` keeps `Clone` cheap across the
    /// many places `BlockwiseFitOptions` is duplicated per-eval.
    pub outer_score_subsample:
        Option<Arc<crate::families::marginal_slope_shared::OuterScoreSubsample>>,
    /// Gate for marginal-slope families to auto-derive a stratified
    /// outer-score subsample at large scale (see
    /// [`crate::families::marginal_slope_shared::auto_outer_score_subsample`]).
    ///
    /// **Default `true`.** Auto-subsampling makes the early rho-gradient
    /// evaluations unbiased stochastic estimators with bounded relative
    /// variance (≈ 1 % at the conservative defaults), then the family switches
    /// back to full-data gradients for the remaining outer iterations. That
    /// keeps large marginal-slope fits fast during the high-motion part of the
    /// trajectory while preserving the default tight `outer_tol` polish on
    /// exact gradients. For small datasets the auto path declines to install a
    /// mask and the fit remains full-data throughout.
    ///
    /// When `outer_score_subsample` is already `Some(...)` the auto
    /// path is bypassed entirely (caller-provided masks always win).
    pub auto_outer_subsample: bool,
    /// Outer-evaluation context populated by the smoothing optimizer at
    /// the top of each real outer derivative evaluation. Used by
    /// auto-subsample install paths to key the stratified mask on the
    /// outer ρ rather than the inner β proxy: during the inner trust-
    /// region / coefficient line search β changes on every trial step,
    /// so keying on β re-fires phase prints (and re-shuffles the mask)
    /// inside a single outer eval. Keying on (rho, eval_id) instead
    /// keeps the mask stable across the inner Newton at one ρ, and
    /// suppresses auto-subsample entirely on inner trial evaluations via
    /// the [`EvalScope::InnerCoefficient`] tag set by
    /// [`coefficient_line_search_options`].
    ///
    /// `None` preserves legacy behavior (no context — install paths fall
    /// back to "no auto-subsample"). Default `None`.
    pub outer_eval_context: Option<OuterEvalContext>,
    /// Optional persistent warm-start cache session. When `Some`, the
    /// outer smoothing optimizer consults the on-disk cache before
    /// starting (to seed θ from the last accepted iterate) and writes
    /// checkpoints + a final entry on completion. When `None`, the fit
    /// runs cold and writes nothing — the default for unit tests and
    /// any caller that pinned a deterministic optimum.
    ///
    /// The session is opened at the workflow-level `fit_model`
    /// dispatcher so every family flows through one chokepoint; family
    /// code never has to remember to wire it. This mirrors the standard
    /// REML cache wiring in `solver/estimate.rs:2701`.
    pub cache_session: Option<Arc<crate::cache::Session>>,
    /// Optional mirror sessions that receive a copy of the final-result
    /// finalize() write. Used by the workflow dispatcher to broadcast a
    /// converged ρ to additional keyspace(s) — notably the data-
    /// independent seed prefix — so future fits with related structure
    /// can warm-start from this run. Writes still pass through the session
    /// rate limiter, so mirroring checkpoints does not add unbounded I/O.
    pub cache_mirror_sessions: Vec<Arc<crate::cache::Session>>,
    /// Optional bundle of cross-block (full-width) penalties, paired with
    /// their current `log λ` values from the outer ρ vector. When `Some`,
    /// the inner joint-Newton primitives add the contributions
    ///
    /// * objective: `½ Σ_j exp(ρ_j) βᵀ S_j β`
    /// * gradient:  `Σ_j exp(ρ_j) S_j β`
    /// * Hessian:   `Σ_j exp(ρ_j) S_j`
    ///
    /// in addition to the per-block penalty stack assembled from
    /// `ParameterBlockSpec.penalties`. The per-block path is unchanged.
    /// `None` preserves legacy behaviour for every existing caller.
    pub joint_penalties: Option<Arc<crate::families::joint_penalty::JointPenaltyBundle>>,
    /// Whether the outer smoothing optimizer screens the explicit
    /// `initial_rho` seed through the seed-screening cascade before the
    /// solver starts.
    ///
    /// **Default `true`** — the general path benefits from ranking the
    /// initial seed against the generated exploration seeds via cheap
    /// capped proxy fits.
    ///
    /// A caller sets this `false` when `initial_rho` is already the correct,
    /// identified optimum for its regime so that re-screening it adds only
    /// cost. The survival location-scale constant-scale (parametric-AFT)
    /// path uses this: its time-warp ρ seed is pinned AT the inner ρ box
    /// bound (the affine-baseline limit), where the REML/LAML profile is a
    /// dead-flat unidentified ridge. Running the screening cascade there
    /// drives each proxy fit (and, when every capped stage collapses to
    /// non-finite cost, the uncapped final stage) into a full inner solve on
    /// the near-singular flat Hessian — the source of the multi-minute
    /// no-iteration-log stall (#736, #735, #721). Skipping screening lets the
    /// already-correct seed flow straight to the outer solver, which certifies
    /// box-constraint stationarity at iteration 0. Genuinely flexible regimes
    /// (smooth scale / spatial) leave this `true` and keep full screening.
    pub screen_initial_rho: bool,
    /// Set ONLY while the inner solve is invoked from the seed-screening proxy
    /// (`custom_family_seed_screening_proxy_labeled`), which RANKS candidate
    /// seeds by their penalized objective and never produces the final fit.
    ///
    /// When `true`, the inner joint-Newton skips the full per-axis
    /// Jeffreys/Firth curvature (`custom_family_joint_jeffreys_term`'s
    /// `for k in 0..p` directional-derivative loop, O(p · per-axis-Hdot) per
    /// cycle), keeping ONLY the cheap value-only Jeffreys term
    /// (`custom_family_joint_jeffreys_value`, one reduced-info eigendecomposition)
    /// in the screening score. The per-axis gradient/curvature is what the inner
    /// Newton step needs to *converge* a near-separating fit; the screening proxy
    /// is capped and only ranks, so it does not need step convergence — it needs
    /// a finite, separation-aware score cheaply. For a K-block coupled family
    /// (Dirichlet/multinomial) each per-axis directional derivative is itself
    /// O(K²·n·p), so running the full term for every cascade candidate over the
    /// joint width `p` is the wrong cost class and made the coupled fit
    /// non-completing during screening alone (gam#729/#808). The actual fit
    /// (after a seed is selected) runs with this `false`, so the load-bearing
    /// Firth curvature is fully present where it matters.
    ///
    /// **Default `false`** — only the screening proxy sets it `true`.
    pub seed_screening: bool,
}

/// Default value of [`BlockwiseFitOptions::inner_max_cycles`]: the cap on
/// inner-solve cycles used when the caller does not set one.
pub const DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES: usize = 1200;

impl Default for BlockwiseFitOptions {
    /// Baseline option set. Every field initialiser below is a constant or a
    /// side-effect-free constructor, so the fields are grouped by the part of
    /// the fit they configure rather than by declaration order.
    fn default() -> Self {
        Self {
            // ── Inner solve ────────────────────────────────────────────────
            // The cycle cap is deliberately production-sized. Large-scale
            // custom-family marginal-slope fits can show a long, monotone
            // joint-Newton tail in which objective and step size keep
            // shrinking while the exact KKT residual still needs several
            // hundred cycles beyond the former 300-cycle cap. Because the
            // outer REML/LAML derivative path is only correct at a stationary
            // inner mode, an iterate that has merely descended must not be
            // reported as converged. Well-conditioned Gaussian, logistic and
            // small-n fits leave early through the KKT/objective certificates.
            inner_max_cycles: DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES,
            inner_tol: 1e-6,
            minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,

            // ── Outer solve ────────────────────────────────────────────────
            outer_max_iter: 60,
            outer_tol: 1e-5,
            use_remlobjective: true,
            // Enabled by default: a family exposes its exact outer Hessian
            // whenever the analytic dense or operator form is implemented.
            use_outer_hessian: true,
            early_exit_threshold: None,
            outer_score_subsample: None,
            auto_outer_subsample: true,
            outer_eval_context: None,

            // ── Stabilization ──────────────────────────────────────────────
            // `ridge_floor` counts as an ExplicitPrior in the canonical
            // stabilization ledger taxonomy
            // (`StabilizationKind::ExplicitPrior`): its δ appears in the
            // quadratic term, the Laplace Hessian and the penalty
            // log-determinant, and `ridge_policy` is the live policy that
            // decides which of those terms receive it. The default pos-part
            // policy switches on every inclusion flag, so a caller that only
            // wants solver damping should build a custom policy (or, better,
            // a `StabilizationLedger::numerical_perturbation`) instead of
            // reusing this field.
            ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
            ridge_policy: RidgePolicy::explicit_stabilization_pospart(),

            // ── Seed screening and inner-iteration caps ────────────────────
            screen_initial_rho: true,
            seed_screening: false,
            screening_max_inner_iterations: None,
            outer_inner_max_iterations: None,

            // ── Outputs, caches and joint penalties ────────────────────────
            compute_covariance: false,
            cache_session: None,
            cache_mirror_sessions: Vec::new(),
            joint_penalties: None,
        }
    }
}

/// Outcome of one inner (fixed-ρ) blockwise solve, as consumed by the outer
/// objective evaluation and by the warm-start helpers in this file.
#[derive(Clone)]
pub struct BlockwiseInnerResult {
    /// Per-block states at the returned iterate. Warm starts copy each
    /// state's `beta` out of this vector.
    pub block_states: Vec<ParameterBlockState>,
    /// One optional index list per block; carried into warm starts unchanged.
    /// NOTE(review): presumably the active inequality-constraint rows per
    /// block — confirm against the inner solver.
    pub active_sets: Vec<Option<Vec<usize>>>,
    /// Log-likelihood term handed to `checked_penalizedobjective`.
    pub log_likelihood: f64,
    /// Penalty term handed to `checked_penalizedobjective`.
    pub penalty_value: f64,
    /// Inner cycles reported by the solve; together with `converged` it sizes
    /// the next outer-loop inner-iteration cap.
    pub cycles: usize,
    /// Whether the inner solve reported convergence.
    pub converged: bool,
    /// Hessian log-determinant term; enters the penalized objective as
    /// `+0.5 * block_logdet_h` when that term is requested.
    pub block_logdet_h: f64,
    /// Penalty log-determinant term; enters the penalized objective as
    /// `-0.5 * block_logdet_s` when that term is requested.
    pub block_logdet_s: f64,
    /// Cached assembled penalty matrices S(ρ) = Σ_k exp(ρ_k) S_k per block.
    /// Avoids redundant re-assembly in the outer objective evaluation.
    pub s_lambdas: Vec<Array2<f64>>,
    /// Optional shared joint-Hessian workspace. It is a trait object, so the
    /// `Debug` impl prints only a placeholder for it.
    pub joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    /// Projected KKT residual at the converged inner iterate, propagated to
    /// the unified evaluator's `InnerAssembly::kkt_residual` for the
    /// outer REML/LAML scoring path. `None` when the solver path doesn't
    /// produce a typed KKT diagnostic (blockwise NR fallback, eager-stop).
    pub kkt_residual: Option<crate::estimate::reml::unified::ProjectedKktResidual>,
    /// Active linear-inequality constraint rows at the converged inner
    /// iterate. When `Some`, the unified evaluator builds the
    /// constraint-aware kernel `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`
    /// for per-coordinate mode responses `v_k = ∂β/∂ρ_k`.
    pub active_constraints:
        Option<Arc<crate::estimate::reml::unified::ActiveLinearConstraintBlock>>,
}

impl std::fmt::Debug for BlockwiseInnerResult {
    /// Formats every field of the result.
    ///
    /// `joint_workspace` is a trait object, and `kkt_residual` /
    /// `active_constraints` are diagnostics whose `Debug` support is not
    /// required here, so those three are rendered as presence placeholders
    /// (`Some("<…>")` / `None`). Listing them keeps the dump honest: a reader
    /// can see whether each diagnostic was attached instead of the field
    /// silently disappearing from the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BlockwiseInnerResult")
            .field("block_states", &self.block_states)
            .field("active_sets", &self.active_sets)
            .field("log_likelihood", &self.log_likelihood)
            .field("penalty_value", &self.penalty_value)
            .field("cycles", &self.cycles)
            .field("converged", &self.converged)
            .field("block_logdet_h", &self.block_logdet_h)
            .field("block_logdet_s", &self.block_logdet_s)
            .field("s_lambdas", &self.s_lambdas)
            .field(
                "joint_workspace",
                &self.joint_workspace.as_ref().map(|_| "<workspace>"),
            )
            .field(
                "kkt_residual",
                &self.kkt_residual.as_ref().map(|_| "<kkt-residual>"),
            )
            .field(
                "active_constraints",
                &self
                    .active_constraints
                    .as_ref()
                    .map(|_| "<active-constraints>"),
            )
            .finish()
    }
}

/// Seed for an inner solve: the ρ it was produced at, the per-block
/// coefficients, the per-block active sets, and (optionally) a summary of the
/// inner solve that produced them.
#[derive(Clone)]
struct ConstrainedWarmStart {
    /// ρ vector associated with this seed. Helpers in this file compare it by
    /// length (`screened_outer_warm_start`) or by exact value
    /// (`warm_start_matches_block_log_lambdas`).
    rho: Array1<f64>,
    /// Coefficient vector for each block, in block order.
    block_beta: Vec<Array1<f64>>,
    /// One optional index list per block, in block order.
    active_sets: Vec<Option<Vec<usize>>>,
    /// Summary of the inner solve behind this seed; `None` when the seed was
    /// assembled from a bare coefficient vector or deliberately stripped.
    cached_inner: Option<CachedInnerMode>,
}

/// Scalar summary (plus optional diagnostics) of a finished inner solve,
/// copied field-for-field from a `BlockwiseInnerResult` or rebuilt from a
/// persistent warm-start record.
#[derive(Clone)]
struct CachedInnerMode {
    /// Log-likelihood reported by the inner solve.
    log_likelihood: f64,
    /// Penalty value reported by the inner solve.
    penalty_value: f64,
    /// Number of inner cycles the solve used.
    cycles: usize,
    /// Whether the inner solve reported convergence.
    converged: bool,
    /// Hessian log-determinant term reported by the inner solve.
    block_logdet_h: f64,
    /// Penalty log-determinant term reported by the inner solve.
    block_logdet_s: f64,
    /// Shared joint-Hessian workspace; always `None` for records restored
    /// from the persistent cache.
    joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    /// Projected KKT residual; always `None` for records restored from the
    /// persistent cache.
    kkt_residual: Option<crate::estimate::reml::unified::ProjectedKktResidual>,
    /// Active linear-constraint block; always `None` for records restored
    /// from the persistent cache.
    active_constraints: Option<Arc<crate::estimate::reml::unified::ActiveLinearConstraintBlock>>,
}

/// Passes `warm_start` through only when its stored ρ has the same length as
/// `rho`; any other case (no seed, or a length mismatch) yields `None`.
fn screened_outer_warm_start<'a>(
    warm_start: Option<&'a ConstrainedWarmStart>,
    rho: &Array1<f64>,
) -> Option<&'a ConstrainedWarmStart> {
    match warm_start {
        Some(candidate) if candidate.rho.len() == rho.len() => Some(candidate),
        _ => None,
    }
}

/// Returns `true` when the seed's ρ is exactly the concatenation, in block
/// order, of the per-block log-λ vectors.
///
/// The comparison is exact floating-point equality, so a NaN entry on either
/// side never matches.
fn warm_start_matches_block_log_lambdas(
    seed: &ConstrainedWarmStart,
    block_log_lambdas: &[Array1<f64>],
) -> bool {
    let flattened_len: usize = block_log_lambdas.iter().map(|block| block.len()).sum();
    if seed.rho.len() != flattened_len {
        return false;
    }
    // Lengths agree, so zipping against the flattened blocks visits every
    // entry of both sides exactly once.
    seed.rho
        .iter()
        .zip(block_log_lambdas.iter().flat_map(|block| block.iter()))
        .all(|(stored, wanted)| stored == wanted)
}

fn cached_inner_mode_from_result(result: &BlockwiseInnerResult) -> CachedInnerMode {
    CachedInnerMode {
        log_likelihood: result.log_likelihood,
        penalty_value: result.penalty_value,
        cycles: result.cycles,
        converged: result.converged,
        block_logdet_h: result.block_logdet_h,
        block_logdet_s: result.block_logdet_s,
        joint_workspace: result.joint_workspace.clone(),
        kkt_residual: result.kkt_residual.clone(),
        active_constraints: result.active_constraints.clone(),
    }
}

/// Builds a warm start at `rho` from a finished inner solve: per-block
/// coefficients, active sets, and the cached inner summary.
fn constrained_warm_start_from_inner(
    rho: &Array1<f64>,
    inner: &BlockwiseInnerResult,
) -> ConstrainedWarmStart {
    let mut block_beta = Vec::with_capacity(inner.block_states.len());
    for state in &inner.block_states {
        block_beta.push(state.beta.clone());
    }
    let cached_inner = Some(cached_inner_mode_from_result(inner));
    ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta,
        active_sets: inner.active_sets.clone(),
        cached_inner,
    }
}

/// Splits a flat cached coefficient vector into per-block pieces and wraps
/// them in a warm start with a zero ρ of length `rho_dim`, no active sets and
/// no cached inner summary.
///
/// # Errors
/// Fails when `beta` does not have one entry per design column across all
/// `specs`, or when the non-finite check on `beta` fails.
fn constrained_warm_start_from_cached_beta(
    rho_dim: usize,
    specs: &[ParameterBlockSpec],
    beta: &Array1<f64>,
) -> Result<ConstrainedWarmStart, EstimationError> {
    let widths: Vec<usize> = specs.iter().map(|spec| spec.design.ncols()).collect();
    let expected: usize = widths.iter().sum();
    if beta.len() != expected {
        crate::bail_invalid_estim!(
            "cached inner beta has length {}, but custom-family blocks require length {}",
            beta.len(),
            expected
        );
    }
    crate::families::marginal_slope_shared::bail_if_cached_beta_non_finite(beta)?;

    // Walk the flat vector with a running cursor, one block width at a time.
    let mut cursor = 0usize;
    let block_beta: Vec<Array1<f64>> = widths
        .iter()
        .map(|&width| {
            let start = cursor;
            cursor += width;
            beta.slice(s![start..cursor]).to_owned()
        })
        .collect();

    Ok(ConstrainedWarmStart {
        rho: Array1::zeros(rho_dim),
        block_beta,
        active_sets: vec![None; specs.len()],
        cached_inner: None,
    })
}

/// Penalized objective of an inner result, with the two log-determinant
/// corrections switched on independently.
///
/// The REML term passed on is `0.5·block_logdet_h` (if `include_logdet_h`)
/// minus `0.5·block_logdet_s` (if `include_logdet_s`); a disabled half
/// contributes `0.0`. The final value and any error come from
/// `checked_penalizedobjective`.
fn inner_penalized_objective(
    inner: &BlockwiseInnerResult,
    include_logdet_h: bool,
    include_logdet_s: bool,
    context: &str,
) -> Result<f64, String> {
    let half_logdet_h = if include_logdet_h {
        0.5 * inner.block_logdet_h
    } else {
        0.0
    };
    let half_logdet_s = if include_logdet_s {
        0.5 * inner.block_logdet_s
    } else {
        0.0
    };
    checked_penalizedobjective(
        inner.log_likelihood,
        inner.penalty_value,
        half_logdet_h - half_logdet_s,
        context,
    )
}

/// EFS evaluation reported for an inner solve that did not converge.
///
/// The returned tuple holds an `EfsEval` carrying only the penalized cost and
/// an all-zero step of length `theta_dim` (every optional payload is `None`),
/// a warm start captured from `inner` at `rho`, and the flag `false`.
///
/// # Errors
/// Propagates the error of `inner_penalized_objective`.
fn nonconverged_outer_efs_result(
    inner: &BlockwiseInnerResult,
    rho: &Array1<f64>,
    theta_dim: usize,
    include_logdet_h: bool,
    include_logdet_s: bool,
    context: &str,
) -> Result<
    (
        crate::solver::outer_strategy::EfsEval,
        ConstrainedWarmStart,
        bool,
    ),
    String,
> {
    let cost = inner_penalized_objective(inner, include_logdet_h, include_logdet_s, context)?;
    let eval = crate::solver::outer_strategy::EfsEval {
        cost,
        steps: vec![0.0; theta_dim],
        beta: None,
        psi_gradient: None,
        psi_indices: None,
        inner_hessian_scale: None,
        logdet_enclosure_gap: None,
    };
    let seed = constrained_warm_start_from_inner(rho, inner);
    Ok((eval, seed, false))
}

/// When ψ-derivatives are in play, returns a copy of the warm start with its
/// cached inner summary removed; otherwise (or without a warm start) returns
/// `None`.
fn warm_start_without_cached_inner_for_psi_derivatives(
    warm_start: Option<&ConstrainedWarmStart>,
    has_psi_derivatives: bool,
) -> Option<ConstrainedWarmStart> {
    match warm_start {
        Some(seed) if has_psi_derivatives => {
            let mut stripped = seed.clone();
            stripped.cached_inner = None;
            Some(stripped)
        }
        _ => None,
    }
}

/// Feeds a 1-D view into the fingerprint: its length first, then every entry
/// in order.
fn hash_cf_array_view(hasher: &mut Fingerprinter, values: ndarray::ArrayView1<'_, f64>) {
    hasher.write_usize(values.len());
    values.iter().for_each(|&entry| hasher.write_f64(entry));
}

/// Feeds a 2-D array into the fingerprint: row count, column count, then
/// every entry in the array's logical iteration order.
fn hash_cf_array2(hasher: &mut Fingerprinter, values: &Array2<f64>) {
    let (rows, cols) = values.dim();
    hasher.write_usize(rows);
    hasher.write_usize(cols);
    values.iter().for_each(|&entry| hasher.write_f64(entry));
}

/// Feeds a design matrix into the fingerprint: its dimensions, then its rows
/// in consecutive chunks, each hashed with `hash_cf_array2`.
///
/// A chunk holds as many rows as fit in roughly 8 MiB of `f64`s, but never
/// fewer than 1 and never more than 4096.
///
/// # Errors
/// Returns a message when a row chunk cannot be produced.
fn hash_cf_design_matrix(hasher: &mut Fingerprinter, design: &DesignMatrix) -> Result<(), String> {
    const TARGET_CHUNK_BYTES: usize = 8 * 1024 * 1024;
    const MAX_CHUNK_ROWS: usize = 4096;

    let (n, p) = (design.nrows(), design.ncols());
    hasher.write_usize(n);
    hasher.write_usize(p);

    // `.max(1)` keeps the division below well-defined for a zero-column design.
    let row_bytes = p.saturating_mul(std::mem::size_of::<f64>()).max(1);
    let rows_per_chunk = (TARGET_CHUNK_BYTES / row_bytes).clamp(1, MAX_CHUNK_ROWS);

    let mut start = 0usize;
    while start < n {
        let end = n.min(start + rows_per_chunk);
        let chunk = design
            .try_row_chunk(start..end)
            .map_err(|e| format!("custom-family persistent warm-start design hash failed: {e}"))?;
        hash_cf_array2(hasher, &chunk);
        start = end;
    }
    Ok(())
}

/// Feeds a penalty matrix into the fingerprint: a variant tag string first,
/// then the variant's payload. Wrapper variants (`Labeled`, `Fixed`) hash
/// their own data and then recurse into the wrapped penalty.
fn hash_cf_penalty(hasher: &mut Fingerprinter, penalty: &PenaltyMatrix) {
    // The tag always precedes the payload.
    let tag = match penalty {
        PenaltyMatrix::Dense(_) => "dense",
        PenaltyMatrix::KroneckerFactored { .. } => "kron",
        PenaltyMatrix::Blockwise { .. } => "blockwise",
        PenaltyMatrix::Labeled { .. } => "labeled",
        PenaltyMatrix::Fixed { .. } => "fixed",
    };
    hasher.write_str(tag);

    match penalty {
        PenaltyMatrix::Dense(dense) => hash_cf_array2(hasher, dense),
        PenaltyMatrix::KroneckerFactored {
            left: lhs,
            right: rhs,
        } => {
            hash_cf_array2(hasher, lhs);
            hash_cf_array2(hasher, rhs);
        }
        PenaltyMatrix::Blockwise {
            local: block,
            col_range: cols,
            total_dim: width,
        } => {
            hasher.write_usize(cols.start);
            hasher.write_usize(cols.end);
            hasher.write_usize(*width);
            hash_cf_array2(hasher, block);
        }
        PenaltyMatrix::Labeled {
            label: tag_text,
            inner: wrapped,
        } => {
            hasher.write_str(tag_text);
            hash_cf_penalty(hasher, wrapped);
        }
        PenaltyMatrix::Fixed {
            log_lambda: fixed_rho,
            inner: wrapped,
        } => {
            // Bit pattern rather than the float value.
            hasher.write_u64(fixed_rho.to_bits());
            hash_cf_penalty(hasher, wrapped);
        }
    }
}

/// Derives the persistent warm-start cache key (`"cf-<hex>"`) for a
/// custom-family fit.
///
/// The key is a fingerprint over, in this exact order: a fixed domain string,
/// the cache schema tag, the family's type name, the family's own
/// fingerprint, every block spec (name, design matrix, offset, penalties,
/// nullspace dimensions, initial log-λ), and a subset of the fit options.
/// The order of the writes is part of the key, so reordering any of them
/// changes every key.
///
/// Returns `None` when the family supplies no fingerprint or when a design
/// matrix cannot be hashed; in both cases the caller gets no key.
fn persistent_custom_family_key<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Option<String> {
    let mut hasher = Fingerprinter::new();
    hasher.write_str("gamfit-persistent-block-warm-start");
    hasher.write_str(&crate::solver::persistent_warm_start::cache_schema_tag());
    hasher.write_str(type_name::<F>());
    // `?` — a family that returns no fingerprint yields no key.
    hasher.write_str(&family.persistent_warm_start_fingerprint(specs, options)?);
    hasher.write_usize(specs.len());
    for spec in specs {
        hasher.write_str(&spec.name);
        // A design-hash failure is converted to `None` (no key) rather than
        // surfaced as an error.
        hash_cf_design_matrix(&mut hasher, &spec.design).ok()?;
        hash_cf_array_view(&mut hasher, spec.offset.view());
        hasher.write_usize(spec.penalties.len());
        for penalty in &spec.penalties {
            hash_cf_penalty(&mut hasher, penalty);
        }
        hasher.write_usize(spec.nullspace_dims.len());
        for &dim in &spec.nullspace_dims {
            hasher.write_usize(dim);
        }
        hash_cf_array_view(&mut hasher, spec.initial_log_lambdas.view());
    }
    hasher.write_usize(options.inner_max_cycles);
    hasher.write_f64(options.inner_tol);
    hasher.write_usize(options.outer_max_iter);
    hasher.write_f64(options.outer_tol);
    hasher.write_f64(options.minweight);
    hasher.write_f64(options.ridge_floor);
    // The ridge policy is fingerprinted through its `Debug` rendering.
    hasher.write_str(&format!("{:?}", options.ridge_policy));
    hasher.write_bool(options.use_remlobjective);
    hasher.write_bool(options.use_outer_hessian);
    hasher.write_bool(options.compute_covariance);
    // Presence flag first, then the value when present.
    hasher.write_bool(options.early_exit_threshold.is_some());
    if let Some(value) = options.early_exit_threshold {
        hasher.write_f64(value);
    }
    // Only the presence of a subsample is hashed, not its contents.
    hasher.write_bool(options.outer_score_subsample.is_some());
    hasher.write_bool(options.auto_outer_subsample);
    Some(format!("cf-{}", hasher.finish_hex()))
}

/// Shape triple stored alongside a cached warm start: the row count of the
/// first block's design (`0` when there are no blocks), the block names, and
/// each block's design column count.
fn custom_family_cache_shape(specs: &[ParameterBlockSpec]) -> (usize, Vec<String>, Vec<usize>) {
    let n_rows = specs.first().map_or(0, |spec| spec.design.nrows());
    let mut block_names = Vec::with_capacity(specs.len());
    let mut block_dims = Vec::with_capacity(specs.len());
    for spec in specs {
        block_names.push(spec.name.clone());
        block_dims.push(spec.design.ncols());
    }
    (n_rows, block_names, block_dims)
}

fn load_persistent_custom_family_warm_start<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_len: usize,
) -> (Option<String>, Option<ConstrainedWarmStart>) {
    let Some(key) = persistent_custom_family_key::<F>(family, specs, options) else {
        return (None, None);
    };
    let (n_rows, block_names, block_dims) = custom_family_cache_shape(specs);
    let Some(record) = load_block_record(&key) else {
        return (Some(key), None);
    };
    if !record.is_compatible(&key, n_rows, &block_names, &block_dims, rho_len) {
        return (Some(key), None);
    }
    let active_sets = normalize_active_sets(record.active_sets);
    let cached_inner = record.inner.map(|inner| CachedInnerMode {
        log_likelihood: inner.log_likelihood,
        penalty_value: inner.penalty_value,
        cycles: inner.cycles,
        converged: inner.converged,
        block_logdet_h: inner.block_logdet_h,
        block_logdet_s: inner.block_logdet_s,
        joint_workspace: None,
        // Persistent warm-start records don't carry the KKT-residual or
        // active-constraint diagnostics (they're not serialized on disk;
        // they're rebuilt from the inner solve on next visit), so a
        // restored cache replay forces the unified evaluator's IFT
        // correction path to degrade to its no-data branch until a fresh
        // joint-Newton pass produces them.
        kkt_residual: None,
        active_constraints: None,
    });
    let inner_status = cached_inner.as_ref().map_or("missing", |inner| {
        if inner.converged {
            "converged"
        } else {
            "partial"
        }
    });
    log::info!(
        "[warm-start-cache] restored custom-family persistent warm start key={key} inner={inner_status}"
    );
    (
        Some(key),
        Some(ConstrainedWarmStart {
            rho: Array1::from_vec(record.rho),
            block_beta: record
                .block_beta
                .into_iter()
                .map(Array1::from_vec)
                .collect(),
            active_sets,
            cached_inner,
        }),
    )
}

/// Serializable summary of the warm start's cached inner solve.
///
/// Yields `None` when there is no cached inner solve, or when any of its four
/// scalar values (log-likelihood, penalty value, both log-determinants) is
/// not finite.
fn persistent_block_inner_summary(
    warm_start: &ConstrainedWarmStart,
) -> Option<PersistentBlockInnerSummary> {
    let cached = warm_start.cached_inner.as_ref()?;
    let scalars = [
        cached.log_likelihood,
        cached.penalty_value,
        cached.block_logdet_h,
        cached.block_logdet_s,
    ];
    if scalars.iter().any(|value| !value.is_finite()) {
        return None;
    }
    Some(PersistentBlockInnerSummary {
        log_likelihood: cached.log_likelihood,
        penalty_value: cached.penalty_value,
        cycles: cached.cycles,
        converged: cached.converged,
        block_logdet_h: cached.block_logdet_h,
        block_logdet_s: cached.block_logdet_s,
    })
}

/// Best-effort persistence of a warm start under `key`.
///
/// Nothing is written when `key` is `None`, when the warm start's block
/// layout disagrees with `specs`, when any coefficient or ρ entry is
/// non-finite, or when any `|ρ_i|` has reached the saturation threshold.
/// A failing store is logged as a warning and otherwise ignored.
fn store_persistent_custom_family_warm_start(
    key: Option<&str>,
    specs: &[ParameterBlockSpec],
    warm_start: &ConstrainedWarmStart,
) {
    let key = match key {
        Some(key) => key,
        None => return,
    };
    let (n_rows, block_names, block_dims) = custom_family_cache_shape(specs);

    let layout_ok = warm_start.block_beta.len() == block_dims.len()
        && warm_start
            .block_beta
            .iter()
            .zip(block_dims.iter())
            .all(|(beta, dim)| beta.len() == *dim && beta.iter().all(|v| v.is_finite()));
    let rho_finite = warm_start.rho.iter().all(|v| v.is_finite());
    if !(layout_ok && rho_finite) {
        return;
    }

    // Saturation gate: a ρ that reached the outer optimizer's box
    // (|ρ_i| ≥ 9) is never persisted. Such an iterate sits on an active bound
    // or is a non-converged intermediate, and it is poor seed material either
    // way because the load-side clamp drags it back into the interior (see
    // `outer_strategy.rs` `[CACHE] hit-clamp`).
    const SATURATION_THRESHOLD: f64 = 9.0;
    let saturated = warm_start
        .rho
        .iter()
        .any(|&v| v.abs() >= SATURATION_THRESHOLD);
    if saturated {
        log::debug!(
            "[warm-start-cache] skip persist custom-family key={} \
             reason=rho-saturated threshold=±{:.1} rho_inf_norm={:.3e}",
            key,
            SATURATION_THRESHOLD,
            warm_start
                .rho
                .iter()
                .fold(0.0_f64, |acc, &v| acc.max(v.abs())),
        );
        return;
    }

    let mut record =
        PersistentBlockWarmStartRecord::new(key.to_string(), n_rows, block_names, block_dims);
    record.updated_unix_secs = record.created_unix_secs;
    record.rho = warm_start.rho.to_vec();
    record.block_beta = warm_start
        .block_beta
        .iter()
        .map(|beta| beta.to_vec())
        .collect();
    record.active_sets = warm_start.active_sets.clone();
    record.inner = persistent_block_inner_summary(warm_start);

    match store_block_record(&record) {
        Ok(_) => {}
        Err(err) => {
            log::warn!("[warm-start-cache] failed to persist custom-family warm start: {err}");
        }
    }
}

/// Extra inner cycles granted on top of the previous solve's cycle count when
/// `update_custom_outer_inner_cap_from_warm_start` sizes the next inner cap.
const CUSTOM_OUTER_INNER_CAP_MARGIN: usize = 5;

/// Re-sizes the shared inner-iteration cap used during outer iterations.
///
/// * No cap handle in `options` → nothing happens.
/// * No cached inner solve in `warm_start` → the cap is reset to the full
///   budget (`inner_max_cycles`, at least 1).
/// * A finite, positive `gradient_norm` is remembered as the baseline on its
///   first sighting; once the current norm is below 1% of a positive
///   baseline, the cap is reset to the full budget.
/// * Otherwise the cap becomes `cycles + margin` after a converged solve, or
///   `max(2·cycles, cycles + margin)` after a non-converged one, clamped to
///   `[1, full budget]`.
fn update_custom_outer_inner_cap_from_warm_start(
    options: &BlockwiseFitOptions,
    warm_start: &ConstrainedWarmStart,
    gradient_norm: Option<f64>,
    initial_gradient_norm: &mut Option<f64>,
) {
    let outer_cap = match options.outer_inner_max_iterations.as_ref() {
        Some(cap) => cap,
        None => return,
    };
    let full_budget = options.inner_max_cycles.max(1);
    let cached_inner = match warm_start.cached_inner.as_ref() {
        Some(cached) => cached,
        None => {
            outer_cap.store(full_budget, Ordering::Relaxed);
            return;
        }
    };

    let usable_norm = gradient_norm.filter(|value| value.is_finite() && *value > 0.0);
    if let Some(norm) = usable_norm {
        // The first usable norm becomes the baseline; later calls reuse it.
        let baseline = *initial_gradient_norm.get_or_insert(norm);
        if baseline > 0.0 && norm / baseline < 0.01 {
            outer_cap.store(full_budget, Ordering::Relaxed);
            return;
        }
    }

    let with_margin = cached_inner
        .cycles
        .saturating_add(CUSTOM_OUTER_INNER_CAP_MARGIN);
    let proposed = if cached_inner.converged {
        with_margin
    } else {
        cached_inner.cycles.saturating_mul(2).max(with_margin)
    };
    outer_cap.store(proposed.clamp(1, full_budget), Ordering::Relaxed);
}

/// Plain bundle of blockwise-fit outputs handed to `blockwise_fit_from_parts`,
/// which destructures every field and validates several of them before
/// building the unified fit result.
pub struct BlockwiseFitResultParts {
    /// Final per-block states; `blockwise_fit_from_parts` rejects an empty
    /// vector.
    pub block_states: Vec<ParameterBlockState>,
    /// Log-likelihood of the fit; must be finite.
    pub log_likelihood: f64,
    /// Log smoothing parameters; every entry must be finite.
    pub log_lambdas: Array1<f64>,
    /// Smoothing parameters; every entry must be finite, and entry `i` must
    /// agree with `exp(log_lambdas[i])` (checked by
    /// `validate_lambda_pair_consistency`).
    pub lambdas: Array1<f64>,
    /// Optional conditional covariance matrix.
    /// NOTE(review): presumably `None` when covariance was not requested —
    /// confirm against the caller.
    pub covariance_conditional: Option<Array2<f64>>,
    /// Penalty term supplied by the caller.
    pub stable_penalty_term: f64,
    /// Penalized objective supplied by the caller; checked for finiteness.
    pub penalized_objective: f64,
    /// Outer-iteration count supplied by the caller.
    pub outer_iterations: usize,
    /// `None` = no gradient measured at termination (cache-hit, gradient-free,
    /// or trivial early-exit); `Some(g)` = measured norm. `outer_converged`
    /// is the authoritative convergence signal.
    pub outer_gradient_norm: Option<f64>,
    /// First-order optimality certificate from the outer smoothing solve
    /// (#934); `None` when no outer ran (fixed-λ, one-cycle probe) or the
    /// audit could not evaluate.
    pub criterion_certificate: Option<crate::solver::outer_strategy::CriterionCertificate>,
    /// Inner-cycle count supplied by the caller.
    pub inner_cycles: usize,
    /// Whether the outer solve converged; the authoritative convergence
    /// signal (see `outer_gradient_norm`).
    pub outer_converged: bool,
    /// Optional fit geometry supplied by the caller.
    pub geometry: Option<FitGeometry>,
    /// Effective degrees of freedom computed by the caller in the *reduced*
    /// (canonical) coefficient space, where the penalized Hessian is full rank,
    /// as `(edf_total, edf_by_penalty, block_edf)`. The trace edf is invariant
    /// under the canonical reparameterization, so computing it in the reduced
    /// space and reporting it on the raw fit is exact — and it avoids the
    /// `tr((H_raw + εI)⁻¹ S_raw)` blow-up that a rank-deficient raw-lifted
    /// Hessian (zero rows/cols on canonicalization-dropped directions) would
    /// otherwise inject. `None` when the caller has no reduced geometry (e.g.
    /// the one-cycle inner probe), in which case `blockwise_fit_from_parts`
    /// falls back to computing edf from whatever geometry it was handed.
    pub precomputed_edf: Option<(f64, Vec<f64>, Vec<f64>)>,
}

/// Checks that a block state's `beta` and then its `eta` contain only finite
/// values. The first failure is returned as a string; the entries are
/// reported under `"{label}.beta"` and `"{label}.eta"` respectively.
fn validate_parameter_block_state_finiteness(
    label: &str,
    state: &ParameterBlockState,
) -> Result<(), String> {
    let beta_label = format!("{label}.beta");
    if let Err(err) = validate_all_finite_estimation(&beta_label, state.beta.iter().copied()) {
        return Err(err.to_string());
    }
    let eta_label = format!("{label}.eta");
    match validate_all_finite_estimation(&eta_label, state.eta.iter().copied()) {
        Ok(_) => Ok(()),
        Err(err) => Err(err.to_string()),
    }
}

/// Verifies that `lambdas[i]` equals `exp(log_lambdas[i])` for every `i`.
///
/// * Different lengths are reported as a `CustomFamilyError::DimensionMismatch`
///   (converted to a string).
/// * When `exp(log_lambda)` is finite, the pair is consistent if the absolute
///   difference is at most `1e-10 · max(|expected|, 1)`.
/// * When `exp(log_lambda)` is not finite (overflow to `+inf`, or NaN input),
///   only exact equality counts as consistent, so NaN never passes.
///
/// The comparison is written as "difference `<=` tolerance" rather than
/// "difference `>` tolerance means failure": every comparison with NaN is
/// false, so the latter form would silently accept a NaN `lambda`, and an
/// infinite tolerance would accept any finite `lambda` against an overflowed
/// `expected`. For finite inputs the outcome is identical to the `>` form.
///
/// `label` prefixes both error messages.
fn validate_lambda_pair_consistency(
    log_lambdas: &Array1<f64>,
    lambdas: &Array1<f64>,
    label: &str,
) -> Result<(), String> {
    if log_lambdas.len() != lambdas.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label} length mismatch: log_lambdas={}, lambdas={}",
                log_lambdas.len(),
                lambdas.len()
            ),
        }
        .into());
    }
    for (idx, (&log_lambda, &lambda)) in log_lambdas.iter().zip(lambdas.iter()).enumerate() {
        let expected = log_lambda.exp();
        let consistent = if expected.is_finite() {
            let tolerance = 1e-10 * expected.abs().max(1.0);
            // NaN-safe: false whenever `lambda` is NaN.
            (lambda - expected).abs() <= tolerance
        } else {
            // +inf matches only +inf; NaN matches nothing.
            lambda == expected
        };
        if !consistent {
            return Err(format!(
                "{label}[{idx}] inconsistent with exp(log_lambda): got {lambda}, expected {expected}",
            ));
        }
    }
    Ok(())
}

/// Effective degrees of freedom for a converged blockwise custom-family fit,
/// computed from the joint penalized Hessian `H = X'W_HX + S(λ)` and the
/// per-penalty matrices `S_k` with the same trace formula as the standard GAM
/// path and mgcv. Writing `t_k = λ_k · tr(H⁻¹ S_k)` (taken as `0` when
/// `λ_k ≤ 0`) and `cols_b` for the column count of the block that owns
/// penalty `k`:
///
/// ```text
/// edf_total      = (p      − Σ_k   t_k)   clamped to [0, p]
/// edf_penalty[k] = (cols_b −       t_k)   clamped to [0, cols_b]
/// block_edf[b]   = (cols_b − Σ_k∈b t_k)   clamped to [0, cols_b]
/// ```
///
/// `S_k` here is the *unscaled* penalty (its `λ_k` factor is applied here).
/// `S_k.to_dense()` may come back either already embedded in the joint
/// `p × p` layout or as a local `cols_b × cols_b` block; the function accepts
/// both, placing a local block at its block's column range before the trace
/// solve, and rejects any other shape.
///
/// The custom-family path (CTN transformation-normal, Dirichlet, …) builds its
/// fit through `blockwise_fit_from_parts` and previously left `inference` at
/// `None`, so `edf_total` was unavailable for every custom family even though
/// the converged geometry already carries the penalized Hessian. This mirrors
/// the survival-path repair (`survival_transformation_edf`, #565) for the
/// blockwise engine: the same trace formula, factorized with the same
/// ridge-retry stabilization so a marginally indefinite Hessian at a boundary
/// optimum still yields a usable trace instead of dropping inference.
///
/// # Returns
/// `(edf_total, edf_by_penalty, block_edf)`. `edf_by_penalty` is aligned 1:1
/// with the flattened `lambdas` (one entry per penalty across all blocks),
/// matching the `FitInference::edf_by_block` ↔ `lambdas` length invariant.
/// `block_edf` has one entry per spec; a block without penalties reports its
/// full column count.
///
/// # Errors
/// Returns a message when the Hessian is not square or its size differs from
/// the total block width, when `lambdas` does not have one entry per penalty,
/// when the Hessian cannot be factorized after 8 attempts, when a penalty
/// materializes to an unexpected shape, when a trace solve fails, or when any
/// resulting edf is non-finite.
fn custom_family_blockwise_edf(
    penalized_hessian: &Array2<f64>,
    specs: &[ParameterBlockSpec],
    lambdas: &ndarray::ArrayView1<'_, f64>,
) -> Result<(f64, Vec<f64>, Vec<f64>), String> {
    let p = penalized_hessian.nrows();
    let total_cols: usize = specs.iter().map(|s| s.design.ncols()).sum();
    if penalized_hessian.ncols() != p || total_cols != p {
        return Err(format!(
            "custom-family edf: penalized Hessian {}x{} inconsistent with total block width {}",
            penalized_hessian.nrows(),
            penalized_hessian.ncols(),
            total_cols
        ));
    }
    let expected_rho: usize = specs.iter().map(|s| s.penalties.len()).sum();
    if lambdas.len() != expected_rho {
        return Err(format!(
            "custom-family edf: lambdas length {} does not match total penalty count {}",
            lambdas.len(),
            expected_rho
        ));
    }

    let h_sym = SymmetricMatrix::Dense(penalized_hessian.clone());
    // Factorization with ridge retry (mirrors estimate.rs and
    // survival_transformation_edf): a boundary-constrained optimum can leave
    // the penalized Hessian marginally indefinite, in which case we add a
    // small diagonal shift — starting at 1e-10 of the largest absolute
    // diagonal entry and growing tenfold per failed attempt — so the trace
    // solve succeeds rather than dropping inference for the whole fit.
    let factor = {
        let scale = h_sym.max_abs_diag();
        let min_step = scale * 1e-10;
        let mut ridge = 0.0_f64;
        let mut attempts = 0_usize;
        loop {
            // If adding the ridge itself fails, retry with the unshifted matrix.
            let candidate = if ridge > 0.0 {
                h_sym.addridge(ridge).unwrap_or_else(|_| h_sym.clone())
            } else {
                h_sym.clone()
            };
            if let Ok(f) = candidate.factorize() {
                break f;
            }
            attempts += 1;
            if attempts >= 8 {
                return Err(
                    "custom-family edf: penalized Hessian could not be factorized".to_string(),
                );
            }
            ridge = if ridge <= 0.0 { min_step } else { ridge * 10.0 };
        }
    };

    let mut edf_by_penalty = vec![0.0_f64; expected_rho];
    let mut block_edf = Vec::with_capacity(specs.len());
    let mut total_trace = 0.0_f64;
    // Running offsets: index of the block's first penalty in the flattened
    // `lambdas`, and index of the block's first column in the joint layout.
    let mut penalty_offset = 0usize;
    let mut block_col_start = 0usize;
    for spec in specs.iter() {
        let block_cols = spec.design.ncols();
        let mut block_edf_acc = block_cols as f64;
        for (local_k, penalty) in spec.penalties.iter().enumerate() {
            let global_k = penalty_offset + local_k;
            let lambda = lambdas[global_k];
            // Embed S_k into the full p×p joint layout. `PenaltyMatrix::to_dense`
            // returns the *local* block matrix for the `Dense` variant but the
            // already-embedded full-width matrix for `Blockwise`/`Kronecker`, so
            // dispatch on the materialized dimension: a local (block_cols-wide)
            // penalty is placed at this block's column range, a full-width
            // penalty is used as-is (mirrors `survival_transformation_edf`'s
            // explicit block placement).
            let s_local = penalty.to_dense();
            let mut s_full = Array2::<f64>::zeros((p, p));
            if s_local.nrows() == p && s_local.ncols() == p {
                s_full.assign(&s_local);
            } else if s_local.nrows() == block_cols && s_local.ncols() == block_cols {
                let r = block_col_start..block_col_start + block_cols;
                s_full.slice_mut(ndarray::s![r.clone(), r]).assign(&s_local);
            } else {
                return Err(format!(
                    "custom-family edf: penalty {global_k} materialized to {}x{}, expected {p}x{p} or {block_cols}x{block_cols}",
                    s_local.nrows(),
                    s_local.ncols()
                ));
            }
            // tr(H⁻¹ S_k) via H Z = S_k, summing the diagonal of Z.
            let z = factor.solvemulti(&s_full).map_err(|e| {
                format!("custom-family edf trace solve failed for penalty {global_k}: {e}")
            })?;
            let mut trace = 0.0_f64;
            for d in 0..p {
                trace += z[[d, d]];
            }
            // A non-positive λ contributes no trace at all.
            let lam_trace = if lambda > 0.0 { lambda * trace } else { 0.0 };
            total_trace += lam_trace;
            // Per-penalty edf is bounded by the columns this penalty acts on,
            // i.e. its block's column count (a `Blockwise` penalty reports the
            // full joint width from `dim()`, so cap at `block_cols`, not `dim()`).
            let penalty_cols = block_cols as f64;
            let edf_k = (penalty_cols - lam_trace).clamp(0.0, penalty_cols);
            edf_by_penalty[global_k] = edf_k;
            // The block's edf is the column count minus the total trace this
            // block's penalties spend (so multiple penalties on one block
            // compose), clamped to the block's column count.
            block_edf_acc -= lam_trace;
        }
        block_edf.push(block_edf_acc.clamp(0.0, block_cols as f64));
        penalty_offset += spec.penalties.len();
        block_col_start += block_cols;
    }

    let edf_total = (p as f64 - total_trace).clamp(0.0, p as f64);
    // Clamping a NaN yields NaN, so a non-finite trace surfaces here.
    if !edf_total.is_finite()
        || edf_by_penalty.iter().any(|v| !v.is_finite())
        || block_edf.iter().any(|v| !v.is_finite())
    {
        return Err("custom-family edf: non-finite effective degrees of freedom".to_string());
    }
    Ok((edf_total, edf_by_penalty, block_edf))
}

/// Effective degrees of freedom evaluated in the reduced (canonical)
/// coefficient space, for `BlockwiseFitResultParts::precomputed_edf`.
///
/// In the reduced geometry the penalized Hessian is full rank and
/// `canonical.reduced_specs` hold the pulled-back penalties `T_iᵀ S_k T_i`,
/// so the trace edf is exact there (no rank-deficiency ridge bias). The trace
/// edf is invariant under the canonical reparameterization, which is why the
/// reduced-space `edf_total` / per-penalty / per-block values can be reported
/// as-is on the lifted raw fit.
///
/// Returns `None` when there is no reduced geometry, and also — after logging
/// a warning — when the edf computation fails; in both cases the caller
/// leaves `precomputed_edf` unset and the raw-geometry fallback applies.
fn reduced_blockwise_edf(
    reduced_geometry: Option<&FitGeometry>,
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    lambdas: &Array1<f64>,
) -> Option<(f64, Vec<f64>, Vec<f64>)> {
    let hessian = reduced_geometry?.penalized_hessian.as_array();
    custom_family_blockwise_edf(hessian, &canonical.reduced_specs, &lambdas.view())
        .map_err(|err| {
            log::warn!(
                "[custom-family inference] reduced-space effective degrees of freedom unavailable: {err}"
            );
        })
        .ok()
}

/// Build a `UnifiedFitResult` from blockwise-specific fields.
///
/// The parts are validated, effective degrees of freedom are resolved, and
/// the per-block and inference records are assembled before everything is
/// handed to `UnifiedFitResult::try_from_parts`.
///
/// Validation runs in a fixed order and the first failure is returned:
/// 1. at least one block state is present;
/// 2. the scalar summaries and the `log_lambdas` / `lambdas` vectors are
///    finite and pass `validate_lambda_pair_consistency`;
/// 3. `block_states` and `specs` have equal length, every state passes
///    `validate_parameter_block_state_finiteness`, and each `eta` has as many
///    entries as its spec's solver design has rows;
/// 4. the optional conditional covariance and the optional geometry are
///    finite and `total_p x total_p` (with `total_p` the summed `beta`
///    lengths), and the geometry working vectors share one length that is
///    either `n` or a stacked multiple of `n` (`n` = rows of the first
///    block's design);
/// 5. `lambdas` has exactly one entry per penalty across all `specs`.
///
/// Effective degrees of freedom are taken from `parts.precomputed_edf` when
/// present, otherwise computed with `custom_family_blockwise_edf` from the
/// supplied geometry. A failure of that computation is logged and leaves
/// `inference` unset (block `edf` values reported as `0.0`) instead of
/// failing the whole fit.
///
/// # Errors
///
/// Returns a `String` describing the first failed validation above, or the
/// stringified error reported by `UnifiedFitResult::try_from_parts`.
pub fn blockwise_fit_from_parts(
    parts: BlockwiseFitResultParts,
    specs: &[ParameterBlockSpec],
) -> Result<crate::solver::estimate::UnifiedFitResult, String> {
    let BlockwiseFitResultParts {
        block_states,
        log_likelihood,
        log_lambdas,
        lambdas,
        covariance_conditional,
        stable_penalty_term,
        penalized_objective,
        outer_iterations,
        outer_gradient_norm,
        criterion_certificate,
        inner_cycles,
        outer_converged,
        geometry,
        precomputed_edf,
    } = parts;

    if block_states.is_empty() {
        return Err(CustomFamilyError::UnsupportedConfiguration {
            reason: "blockwise fit requires at least one block state".to_string(),
        }
        .into());
    }
    ensure_finite_scalar_estimation("blockwise_fit.log_likelihood", log_likelihood)
        .map_err(|e| e.to_string())?;
    validate_all_finite_estimation("blockwise_fit.log_lambdas", log_lambdas.iter().copied())
        .map_err(|e| e.to_string())?;
    validate_all_finite_estimation("blockwise_fit.lambdas", lambdas.iter().copied())
        .map_err(|e| e.to_string())?;
    validate_lambda_pair_consistency(&log_lambdas, &lambdas, "blockwise_fit.lambdas")?;
    ensure_finite_scalar_estimation("blockwise_fit.penalized_objective", penalized_objective)
        .map_err(|e| e.to_string())?;
    ensure_finite_scalar_estimation("blockwise_fit.stable_penalty_term", stable_penalty_term)
        .map_err(|e| e.to_string())?;
    if let Some(g) = outer_gradient_norm {
        ensure_finite_scalar_estimation("blockwise_fit.outer_gradient_norm", g)
            .map_err(|e| e.to_string())?;
    }

    if block_states.len() != specs.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "blockwise_fit.block_states length ({}) does not match specs length ({})",
                block_states.len(),
                specs.len()
            ),
        }
        .into());
    }
    // `specs` is non-empty here: `block_states` is non-empty and the two
    // lengths were just checked to agree, so indexing `specs[0]` cannot panic.
    let n = specs[0].design.nrows();
    let total_p = block_states
        .iter()
        .map(|state| state.beta.len())
        .sum::<usize>();
    for (idx, state) in block_states.iter().enumerate() {
        validate_parameter_block_state_finiteness(
            &format!("blockwise_fit.block_states[{idx}]"),
            state,
        )?;
        let expected_rows = specs[idx].solver_design().nrows();
        if state.eta.len() != expected_rows {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.block_states[{idx}] eta length mismatch: got {}, expected {} (solver design rows)",
                state.eta.len(),
                expected_rows
            ) }.into());
        }
    }

    if let Some(cov) = covariance_conditional.as_ref() {
        validate_all_finite_estimation("blockwise_fit.covariance_conditional", cov.iter().copied())
            .map_err(|e| e.to_string())?;
        let (rows, cols) = cov.dim();
        if rows != total_p || cols != total_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "blockwise_fit.covariance_conditional must be {}x{}, got {}x{}",
                    total_p, total_p, rows, cols
                ),
            }
            .into());
        }
    }

    if let Some(geom) = geometry.as_ref() {
        geom.validate_numeric_finiteness()
            .map_err(|e| e.to_string())?;
        let (rows, cols) = geom.penalized_hessian.dim();
        if rows != total_p || cols != total_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "blockwise_fit.geometry.penalized_hessian must be {}x{}, got {}x{}",
                    total_p, total_p, rows, cols
                ),
            }
            .into());
        }
        let geom_len = geom.working_weights.len();
        if geom_len != geom.working_response.len() {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.geometry working vector length mismatch: weights={}, response={}",
                geom.working_weights.len(),
                geom.working_response.len(),
            ) }.into());
        }
        // `working_response` has exactly `geom_len` entries (checked just
        // above), so this single test covers both working vectors; a second,
        // identical test on `working_response` could never fire.
        if geom_len != n && (n == 0 || geom_len % n != 0) {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.geometry.working_weights length mismatch: got {geom_len}, expected {n} or a stacked multiple of {n}",
            ) }.into());
        }
    }

    // Build unified blocks from the blockwise states.
    use crate::solver::estimate::{FittedBlock, FittedLinkState, UnifiedFitResultParts};
    let expected_rho: usize = specs.iter().map(|s| s.penalties.len()).sum();
    if lambdas.len() != expected_rho {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "blockwise_fit.lambdas length ({}) does not match sum of per-block penalty counts ({})",
            lambdas.len(),
            expected_rho
        ) }.into());
    }
    // Effective degrees of freedom and the inference block. When the
    // converged geometry carries the joint penalized Hessian we compute the
    // mgcv trace edf `p − Σ_k λ_k·tr(H⁻¹ S_k)` here so every custom-family fit
    // (CTN transformation-normal, Dirichlet, …) reports `edf_total` /
    // per-block `edf` like the standard GAM path, instead of leaving inference
    // unpopulated. A factorization failure is non-fatal: the fit still returns
    // with `edf=0`/`inference=None` rather than aborting, but in practice the
    // ridge-retry inside `custom_family_blockwise_edf` recovers any boundary
    // indefiniteness.
    let (edf_total_opt, edf_by_penalty, block_edf): (Option<f64>, Vec<f64>, Vec<f64>) =
        match precomputed_edf {
            // Reduced-space edf supplied by the caller (the principled path:
            // the trace is computed where the Hessian is full rank, then
            // reported on the raw fit — exact because the trace edf is
            // reparameterization-invariant).
            Some((edf_total, edf_by_penalty, block_edf)) => {
                (Some(edf_total), edf_by_penalty, block_edf)
            }
            // Fallback: compute from whatever geometry we were handed. Used
            // only when the caller did not precompute (no reduced geometry);
            // the ridge-retry factorization makes this robust to a marginally
            // indefinite Hessian.
            None => match geometry.as_ref() {
                Some(geom) => {
                    match custom_family_blockwise_edf(
                        geom.penalized_hessian.as_array(),
                        specs,
                        &lambdas.view(),
                    ) {
                        Ok((edf_total, edf_by_penalty, block_edf)) => {
                            (Some(edf_total), edf_by_penalty, block_edf)
                        }
                        Err(err) => {
                            log::warn!(
                                "[custom-family inference] effective degrees of freedom unavailable: {err}"
                            );
                            (None, Vec::new(), vec![0.0; block_states.len()])
                        }
                    }
                }
                None => (None, Vec::new(), vec![0.0; block_states.len()]),
            },
        };

    // `lambdas` is laid out block after block; `lambda_offset` walks that
    // layout so each block receives its own contiguous slice. The total
    // length was validated against `expected_rho` above, so the slices are
    // always in bounds.
    let mut lambda_offset = 0usize;
    let blocks: Vec<FittedBlock> = block_states
        .iter()
        .enumerate()
        .map(|(i, bs)| {
            let role = custom_family_block_role(&specs[i].name, i, block_states.len());
            let k = specs[i].penalties.len();
            let block_lambdas = lambdas
                .slice(s![lambda_offset..lambda_offset + k])
                .to_owned();
            lambda_offset += k;
            FittedBlock {
                beta: bs.beta.clone(),
                role,
                // A caller-supplied `precomputed_edf` may carry fewer block
                // entries than there are blocks; missing entries read as 0.
                edf: block_edf.get(i).copied().unwrap_or(0.0),
                lambdas: block_lambdas,
            }
        })
        .collect();
    let deviance = -2.0 * log_likelihood;

    // Assemble the inference block from the converged geometry. CTN and other
    // custom families estimate their own likelihood scale, so the penalized
    // Hessian is reported unscaled (dispersion = 1) — the EDF trace is
    // dispersion-free, and downstream covariance scaling pairs `H` with the
    // family's own dispersion where needed.
    let inference = match (edf_total_opt, geometry.as_ref()) {
        (Some(edf_total), Some(geom)) => Some(crate::solver::estimate::FitInference {
            edf_by_block: edf_by_penalty,
            edf_total,
            smoothing_correction: None,
            penalized_hessian: geom.penalized_hessian.clone(),
            working_weights: geom.working_weights.clone(),
            working_response: geom.working_response.clone(),
            reparam_qs: None,
            dispersion: crate::solver::estimate::Dispersion::Known(1.0),
            beta_covariance: None,
            beta_standard_errors: None,
            beta_covariance_corrected: None,
            beta_standard_errors_corrected: None,
            beta_covariance_frequentist: None,
            coefficient_influence: None,
            bias_correction_beta: None,
        }),
        _ => None,
    };

    // `log_lambdas` and `lambdas` are owned and not read again after this
    // point, so they are moved into the result instead of being cloned.
    crate::solver::estimate::UnifiedFitResult::try_from_parts(UnifiedFitResultParts {
        blocks,
        log_lambdas,
        lambdas,
        likelihood_family: None,
        likelihood_scale: crate::types::LikelihoodScaleMetadata::Unspecified,
        log_likelihood_normalization: crate::types::LogLikelihoodNormalization::UserProvided,
        log_likelihood,
        deviance,
        reml_score: penalized_objective,
        stable_penalty_term,
        penalized_objective,
        outer_iterations,
        outer_converged,
        outer_gradient_norm,
        standard_deviation: 1.0,
        covariance_conditional,
        covariance_corrected: None,
        inference,
        fitted_link: FittedLinkState::Standard(None),
        geometry,
        block_states,
        pirls_status: crate::pirls::PirlsStatus::Converged,
        max_abs_eta: 0.0,
        constraint_kkt: None,
        artifacts: crate::solver::estimate::FitArtifacts {
            pirls: None,
            criterion_certificate,
            ..Default::default()
        },
        inner_cycles,
    })
    .map_err(|e| e.to_string())
}

/// Combine the three objective pieces as
/// `-log_likelihood + penalty_value + reml_term` and return the sum.
///
/// # Errors
///
/// When the sum is not finite, returns the stringified
/// `CustomFamilyError::NumericalFailure`, whose message is prefixed with
/// `context` and lists every input alongside the resulting value.
fn checked_penalizedobjective(
    log_likelihood: f64,
    penalty_value: f64,
    reml_term: f64,
    context: &str,
) -> Result<f64, String> {
    let objective = -log_likelihood + penalty_value + reml_term;
    if !objective.is_finite() {
        let failure = CustomFamilyError::NumericalFailure {
            reason: format!(
                "{context}: non-finite penalized objective \
             (log_likelihood={log_likelihood}, penalty_value={penalty_value}, \
             reml_term={reml_term}, objective={objective})"
            ),
        };
        return Err(failure.into());
    }
    Ok(objective)
}

/// Derivative data attached to one ψ hyper-coordinate of one parameter block.
///
/// Dense matrices and an optional matrix-free operator can both be carried;
/// `CustomFamilyBlockPsiDerivative::new` fills only the dense fields.
/// NOTE(review): the exact mathematical meaning of each field is inferred from
/// its name and is not established in this part of the file — confirm against
/// the producers of this struct.
#[derive(Clone)]
pub struct CustomFamilyBlockPsiDerivative {
    /// Optional penalty index tied to this coordinate.
    /// NOTE(review): block-local vs. global indexing is not visible here.
    pub penalty_index: Option<usize>,
    /// Dense first-derivative matrix; presumably ∂X/∂ψ of the design — confirm.
    pub x_psi: Array2<f64>,
    /// Dense first-derivative matrix; presumably ∂S/∂ψ of the penalty — confirm.
    pub s_psi: Array2<f64>,
    /// Optional `(index, matrix)` pairs; presumably a per-penalty split of
    /// `s_psi` keyed by penalty index — confirm.
    pub s_psi_components: Option<Vec<(usize, Array2<f64>)>>,
    /// Same pairing as `s_psi_components`, with `PenaltyMatrix` payloads.
    /// Left as `None` by `new`.
    pub s_psi_penalty_components: Option<Vec<(usize, PenaltyMatrix)>>,
    /// Optional list of dense second-derivative design matrices; presumably
    /// one entry per partner ψ coordinate — confirm.
    pub x_psi_psi: Option<Vec<Array2<f64>>>,
    /// Optional list of dense second-derivative penalty matrices; presumably
    /// one entry per partner ψ coordinate — confirm.
    pub s_psi_psi: Option<Vec<Array2<f64>>>,
    /// Optional nested `(index, matrix)` pairs for the second derivatives;
    /// presumably outer index = partner coordinate — confirm.
    pub s_psi_psi_components: Option<Vec<Vec<(usize, Array2<f64>)>>>,
    /// Same nesting as `s_psi_psi_components`, with `PenaltyMatrix` payloads.
    /// Left as `None` by `new`.
    pub s_psi_psi_penalty_components: Option<Vec<Vec<(usize, PenaltyMatrix)>>>,
    /// Optional shared matrix-free derivative operator. Left as `None` by `new`.
    pub(crate) implicit_operator: Option<Arc<dyn CustomFamilyPsiDerivativeOperator>>,
    /// Axis index; presumably the `axis` argument to pass to
    /// `implicit_operator` — confirm. Set to `0` by `new`.
    pub implicit_axis: usize,
    /// Optional group identifier; presumably groups coordinates sharing one
    /// implicit operator — confirm. Left as `None` by `new`.
    pub implicit_group_id: Option<usize>,
}

/// Reference-counted, shareable two-level collection of ψ-derivative records.
/// NOTE(review): the nesting is presumably "per block, then per ψ coordinate"
/// — confirm against the code that builds it.
pub(crate) type SharedDerivativeBlocks = Arc<Vec<Vec<CustomFamilyBlockPsiDerivative>>>;

impl CustomFamilyBlockPsiDerivative {
    /// Build a record from dense derivative pieces only.
    ///
    /// Meant for tests and external consumers. Everything the caller cannot
    /// pass in is given its "absent" value: both `PenaltyMatrix` component
    /// lists, `implicit_operator`, and `implicit_group_id` are `None`, and
    /// `implicit_axis` is `0`.
    pub fn new(
        penalty_index: Option<usize>,
        x_psi: Array2<f64>,
        s_psi: Array2<f64>,
        s_psi_components: Option<Vec<(usize, Array2<f64>)>>,
        x_psi_psi: Option<Vec<Array2<f64>>>,
        s_psi_psi: Option<Vec<Array2<f64>>>,
        s_psi_psi_components: Option<Vec<Vec<(usize, Array2<f64>)>>>,
    ) -> Self {
        Self {
            // Caller-supplied first-order data.
            penalty_index,
            x_psi,
            s_psi,
            s_psi_components,
            // Caller-supplied second-order data.
            x_psi_psi,
            s_psi_psi,
            s_psi_psi_components,
            // Fields this constructor never populates.
            s_psi_penalty_components: None,
            s_psi_psi_penalty_components: None,
            implicit_operator: None,
            implicit_group_id: None,
            implicit_axis: 0,
        }
    }
}

/// Matrix-free access to first and second ψ-derivatives of a design matrix.
///
/// Every product and row accessor takes an `axis` (and, for cross terms, a
/// second axis) selecting the ψ coordinate(s) the derivative is taken with
/// respect to. Shapes, as enforced by `ZeroPsiDerivativeOperator` in this
/// file: "forward" products map a coefficient vector of length `p_out()` to a
/// data vector of length `n_data()`, "transpose" products map the other way,
/// and row chunks are `(rows.len(), p_out())`.
/// NOTE(review): the valid range of `axis` is implementation-defined and not
/// visible from this trait — confirm per implementor.
pub(crate) trait CustomFamilyPsiDerivativeOperator: Send + Sync + Any {
    /// Upcast to `&dyn Any`, allowing a downcast to the concrete operator type.
    fn as_any(&self) -> &dyn Any;
    /// Number of data rows of the derivative matrix.
    fn n_data(&self) -> usize;
    /// Number of coefficient columns of the derivative matrix.
    fn p_out(&self) -> usize;
    /// Transposed first-derivative product for `axis`: data-length `v` in,
    /// coefficient-length vector out.
    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
    /// First-derivative product for `axis`: coefficient-length `u` in,
    /// data-length vector out.
    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
    /// Transposed product with the second derivative taken twice along `axis`.
    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
    /// Transposed product with the mixed second derivative along `axis_d`
    /// and `axis_e`.
    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
    /// Forward product with the second derivative taken twice along `axis`.
    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
    /// Forward product with the mixed second derivative along `axis_d` and
    /// `axis_e`.
    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>;
    /// Dense rows `rows` of the first derivative along `axis`.
    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
    /// Single-row specialization of `row_chunk_first`. Default implementation
    /// delegates to `row_chunk_first(axis, row..row+1)` and copies the
    /// resulting row into the output buffer; implementations that can avoid
    /// the temporary matrix allocation should override this method.
    fn row_vector_first_into(
        &self,
        axis: usize,
        row: usize,
        mut out: ArrayViewMut1<'_, f64>,
    ) -> Result<(), crate::terms::basis::BasisError> {
        // One-row chunk, then copy its only row into the caller's buffer.
        let chunk = self.row_chunk_first(axis, row..row + 1)?;
        out.assign(&chunk.row(0));
        Ok(())
    }
    /// Dense rows `rows` of the second derivative taken twice along `axis`.
    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
    /// Dense rows `rows` of the mixed second derivative along `axis_d` and
    /// `axis_e`.
    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError>;

    /// Optional upcast to the dense materialization surface. Production exact
    /// paths should prefer the analytic matvec / row-chunk methods above and
    /// avoid forming the full derivative matrix; implementations that *do*
    /// support dense materialization (used by diagnostics, tests, and
    /// small-data fallbacks) should override this to return `Some(self)`.
    /// The default returns `None`.
    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        None
    }
}

/// Diagnostic / small-data extension that exposes dense materialization of
/// `\partial X / \partial \psi`. Production exact-Hessian code MUST NOT depend
/// on dense second-derivative materialization; second-order paths use the
/// row-chunk and matvec methods on [`CustomFamilyPsiDerivativeOperator`].
///
/// Only a first-derivative materializer is declared; the trait has no dense
/// second-derivative method.
pub(crate) trait MaterializablePsiDerivativeOperator:
    CustomFamilyPsiDerivativeOperator
{
    /// Form the full dense first-derivative matrix for `axis`.
    /// NOTE(review): the result is presumably `n_data() x p_out()` — confirm
    /// per implementor.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError>;
}

// Adapter exposing `ImplicitDesignPsiDerivative` through the crate-internal
// operator trait. Every method forwards to the item of the same name reached
// through the fully qualified type path.
// NOTE(review): this relies on each path resolving to an inherent method of
// `ImplicitDesignPsiDerivative` (inherent items take precedence over trait
// items in path resolution); if an inherent method were missing, the call
// would resolve back to this trait method and recurse — confirm the inherent
// methods exist in `crate::terms::basis`.
impl CustomFamilyPsiDerivativeOperator for crate::terms::basis::ImplicitDesignPsiDerivative {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn n_data(&self) -> usize {
        crate::terms::basis::ImplicitDesignPsiDerivative::n_data(self)
    }

    fn p_out(&self) -> usize {
        crate::terms::basis::ImplicitDesignPsiDerivative::p_out(self)
    }

    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::transpose_mul(self, axis, v)
    }

    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::forward_mul(self, axis, u)
    }

    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        // The target is first coerced to a `fn` pointer with an explicit
        // signature and then called through it.
        // NOTE(review): presumably this pins the call to one concrete
        // signature (e.g. fixing a generic parameter or disambiguating the
        // target) — confirm before simplifying to a direct call.
        let f: fn(
            &crate::terms::basis::ImplicitDesignPsiDerivative,
            usize,
            Range<usize>,
        ) -> Result<Array2<f64>, crate::terms::basis::BasisError> =
            crate::terms::basis::ImplicitDesignPsiDerivative::row_chunk_first;
        f(self, axis, rows)
    }

    // Overrides the trait default (which goes through `row_chunk_first`) by
    // forwarding to the type's own single-row routine.
    fn row_vector_first_into(
        &self,
        axis: usize,
        row: usize,
        out: ArrayViewMut1<'_, f64>,
    ) -> Result<(), crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::row_vector_first_into(
            self, axis, row, out,
        )
    }

    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        // Same explicit `fn`-pointer coercion as in `row_chunk_first`.
        let f: fn(
            &crate::terms::basis::ImplicitDesignPsiDerivative,
            usize,
            Range<usize>,
        ) -> Result<Array2<f64>, crate::terms::basis::BasisError> =
            crate::terms::basis::ImplicitDesignPsiDerivative::row_chunk_second_diag;
        f(self, axis, rows)
    }

    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        // Same explicit `fn`-pointer coercion as in `row_chunk_first`, with
        // the extra axis argument.
        let f: fn(
            &crate::terms::basis::ImplicitDesignPsiDerivative,
            usize,
            usize,
            Range<usize>,
        ) -> Result<Array2<f64>, crate::terms::basis::BasisError> =
            crate::terms::basis::ImplicitDesignPsiDerivative::row_chunk_second_cross;
        f(self, axis_d, axis_e, rows)
    }

    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::transpose_mul_second_diag(self, axis, v)
    }

    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::transpose_mul_second_cross(
            self, axis_d, axis_e, v,
        )
    }

    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::forward_mul_second_diag(self, axis, u)
    }

    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::forward_mul_second_cross(
            self, axis_d, axis_e, u,
        )
    }

    // This type supports dense materialization, so expose that surface.
    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        Some(self)
    }
}

// Dense-materialization surface for `ImplicitDesignPsiDerivative`.
impl MaterializablePsiDerivativeOperator for crate::terms::basis::ImplicitDesignPsiDerivative {
    // Forwards to `materialize_first` reached through the fully qualified
    // type path.
    // NOTE(review): presumably resolves to an inherent method of the type;
    // otherwise this call would recurse into itself — confirm.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        crate::terms::basis::ImplicitDesignPsiDerivative::materialize_first(self, axis)
    }
}

/// Wraps a block-local implicit ψ-derivative operator so that it acts on a
/// wider coefficient vector: the block's columns occupy `global_range` inside
/// a total width of `total_p`, and every other column is zero.
pub(crate) struct EmbeddedImplicitPsiDerivativeOperator {
    /// Shared block-local operator; its `p_out()` equals `global_range.len()`
    /// (enforced by `new`).
    base: Arc<crate::terms::basis::ImplicitDesignPsiDerivative>,
    /// Width of the embedding coefficient space; reported by `p_out()`.
    total_p: usize,
    /// Column positions of the block inside the embedding space; `new`
    /// guarantees `global_range.end <= total_p`.
    global_range: Range<usize>,
}

impl EmbeddedImplicitPsiDerivativeOperator {
    /// Wrap `base` so its columns sit at `global_range` within `total_p`
    /// columns.
    ///
    /// # Errors
    ///
    /// Returns a stringified `CustomFamilyError::DimensionMismatch` when the
    /// width of `base` differs from `global_range.len()`, or when
    /// `global_range` extends beyond `total_p`.
    pub(crate) fn new(
        base: Arc<crate::terms::basis::ImplicitDesignPsiDerivative>,
        global_range: Range<usize>,
        total_p: usize,
    ) -> Result<Self, String> {
        let local_width = base.p_out();
        let range_width = global_range.len();
        if local_width != range_width {
            let mismatch = CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded implicit psi operator width mismatch: got {}, expected {}",
                    local_width, range_width
                ),
            };
            return Err(mismatch.into());
        }
        if total_p < global_range.end {
            let overflow = CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded implicit psi operator range {}..{} exceeds total width {total_p}",
                    global_range.start, global_range.end
                ),
            };
            return Err(overflow.into());
        }
        Ok(Self {
            base,
            total_p,
            global_range,
        })
    }

    /// Scatter a block-local vector into a zero vector of length `total_p`,
    /// placing it at `global_range`.
    fn embed_vector(&self, local: Array1<f64>) -> Array1<f64> {
        let mut widened = Array1::<f64>::zeros(self.total_p);
        let mut target = widened.slice_mut(ndarray::s![self.global_range.clone()]);
        target.assign(&local);
        widened
    }

    /// Extract the block-local coefficients from a full-width vector `u`.
    ///
    /// Fails with `BasisError::Other` (message prefixed by `context`) when
    /// `u` does not have exactly `total_p` entries.
    fn local_coeffs(
        &self,
        u: &ArrayView1<'_, f64>,
        context: &str,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        if u.len() == self.total_p {
            return Ok(u.slice(ndarray::s![self.global_range.clone()]).to_owned());
        }
        Err(crate::terms::basis::BasisError::Other(format!(
            "{context} expected coefficient length {}, got {}",
            self.total_p,
            u.len()
        )))
    }
}

// Operator-trait view of the embedded block. Transposed products and row
// chunks are computed block-locally and widened to `total_p` columns; forward
// products first cut the block-local coefficients out of the full-width input.
impl CustomFamilyPsiDerivativeOperator for EmbeddedImplicitPsiDerivativeOperator {
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Row count is whatever the wrapped operator reports.
    fn n_data(&self) -> usize {
        self.base.n_data()
    }

    /// Column count is the full embedding width, not the block width.
    fn p_out(&self) -> usize {
        self.total_p
    }

    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let block_local = self.base.transpose_mul(axis, v)?;
        Ok(self.embed_vector(block_local))
    }

    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.local_coeffs(u, "embedded implicit psi forward_mul")
            .and_then(|block_coeffs| self.base.forward_mul(axis, &block_coeffs.view()))
    }

    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let block_local = self.base.transpose_mul_second_diag(axis, v)?;
        Ok(self.embed_vector(block_local))
    }

    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let block_local = self.base.transpose_mul_second_cross(axis_d, axis_e, v)?;
        Ok(self.embed_vector(block_local))
    }

    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.local_coeffs(u, "embedded implicit psi forward_mul_second_diag")
            .and_then(|block_coeffs| {
                self.base
                    .forward_mul_second_diag(axis, &block_coeffs.view())
            })
    }

    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.local_coeffs(u, "embedded implicit psi forward_mul_second_cross")
            .and_then(|block_coeffs| {
                self.base
                    .forward_mul_second_cross(axis_d, axis_e, &block_coeffs.view())
            })
    }

    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let block_rows = self.base.row_chunk_first(axis, rows)?;
        let widened =
            EmbeddedColumnBlock::new(&block_rows, self.global_range.clone(), self.total_p);
        Ok(widened.materialize())
    }

    fn row_vector_first_into(
        &self,
        axis: usize,
        row: usize,
        mut out: ArrayViewMut1<'_, f64>,
    ) -> Result<(), crate::terms::basis::BasisError> {
        // Clear the whole buffer, then let the wrapped operator write only
        // the columns that belong to this block.
        out.fill(0.0);
        let block_window = out.slice_mut(ndarray::s![self.global_range.clone()]);
        self.base.row_vector_first_into(axis, row, block_window)
    }

    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let block_rows = self.base.row_chunk_second_diag(axis, rows)?;
        let widened =
            EmbeddedColumnBlock::new(&block_rows, self.global_range.clone(), self.total_p);
        Ok(widened.materialize())
    }

    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let block_rows = self.base.row_chunk_second_cross(axis_d, axis_e, rows)?;
        let widened =
            EmbeddedColumnBlock::new(&block_rows, self.global_range.clone(), self.total_p);
        Ok(widened.materialize())
    }

    /// Dense materialization is available for the embedded operator.
    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        Some(self)
    }
}

impl MaterializablePsiDerivativeOperator for EmbeddedImplicitPsiDerivativeOperator {
    /// Materialize the wrapped operator's dense first derivative for `axis`
    /// and widen it to `total_p` columns at `global_range`.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let block_dense = self.base.materialize_first(axis)?;
        let widened =
            EmbeddedColumnBlock::new(&block_dense, self.global_range.clone(), self.total_p);
        Ok(widened.materialize())
    }
}

/// Non-allocating zero operator for `\partial X / \partial \psi` derivative
/// blocks whose ψ coordinate does not move the design matrix at all (e.g.
/// the spatial-adaptive overlay's mass / tension / stiffness / ε
/// hyperparameters, which act through the penalty stack alone).
///
/// All matvec/transpose_mul methods return zero vectors of the correct
/// length, all row-chunk methods return chunk-sized zero matrices. The
/// operator never allocates an `(n, p)` dense buffer, which saves ~1.45 GiB
/// at the large-scale spatial-adaptive overlay (n ≈ 320 000, p ≈ 101,
/// six hyperparameters).
pub(crate) struct ZeroPsiDerivativeOperator {
    /// Number of data rows; reported by `n_data()`.
    n: usize,
    /// Number of coefficient columns; reported by `p_out()`.
    p: usize,
}

impl ZeroPsiDerivativeOperator {
    /// Create a zero operator with `n` data rows and `p` coefficient columns.
    /// Only the two dimensions are stored.
    pub(crate) fn new(n: usize, p: usize) -> Self {
        Self { n, p }
    }
}

// Operator-trait view of the all-zero derivative. Every method checks its
// arguments with assertions (panicking on a violated precondition) and then
// returns zeros of the matching shape. `as_materializable` is not overridden,
// so the trait default applies.
impl CustomFamilyPsiDerivativeOperator for ZeroPsiDerivativeOperator {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn n_data(&self) -> usize {
        self.n
    }

    fn p_out(&self) -> usize {
        self.p
    }

    /// Data-length input, zero vector of coefficient length out.
    fn transpose_mul(
        &self,
        idx: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert_eq!(v.len(), self.n, "zero psi transpose_mul length mismatch");
        let zeros = Array1::zeros(self.p);
        Ok(zeros)
    }

    /// Coefficient-length input, zero vector of data length out.
    fn forward_mul(
        &self,
        idx: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert_eq!(u.len(), self.p, "zero psi forward_mul length mismatch");
        let zeros = Array1::zeros(self.n);
        Ok(zeros)
    }

    fn transpose_mul_second_diag(
        &self,
        idx: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert_eq!(
            v.len(),
            self.n,
            "zero psi transpose_mul_second_diag length mismatch"
        );
        let zeros = Array1::zeros(self.p);
        Ok(zeros)
    }

    fn transpose_mul_second_cross(
        &self,
        idx: usize,
        idx2: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert!(idx2 < usize::MAX);
        assert_eq!(
            v.len(),
            self.n,
            "zero psi transpose_mul_second_cross length mismatch"
        );
        let zeros = Array1::zeros(self.p);
        Ok(zeros)
    }

    fn forward_mul_second_diag(
        &self,
        idx: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert_eq!(
            u.len(),
            self.p,
            "zero psi forward_mul_second_diag length mismatch"
        );
        let zeros = Array1::zeros(self.n);
        Ok(zeros)
    }

    fn forward_mul_second_cross(
        &self,
        idx: usize,
        idx2: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert!(idx2 < usize::MAX);
        assert_eq!(
            u.len(),
            self.p,
            "zero psi forward_mul_second_cross length mismatch"
        );
        let zeros = Array1::zeros(self.n);
        Ok(zeros)
    }

    /// Zero matrix with one row per requested row and `p` columns.
    fn row_chunk_first(
        &self,
        idx: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        let Range { start, end } = rows;
        assert!(
            start <= end && end <= self.n,
            "zero psi row_chunk_first row range out of bounds"
        );
        let zeros = Array2::zeros((end - start, self.p));
        Ok(zeros)
    }

    /// Overwrites `out` with zeros after checking the row index and the
    /// buffer length.
    fn row_vector_first_into(
        &self,
        idx: usize,
        row: usize,
        mut out: ArrayViewMut1<'_, f64>,
    ) -> Result<(), crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert!(
            row < self.n,
            "zero psi row_vector_first_into row out of bounds"
        );
        assert_eq!(
            out.len(),
            self.p,
            "zero psi row_vector_first_into output length mismatch"
        );
        out.fill(0.0);
        Ok(())
    }

    fn row_chunk_second_diag(
        &self,
        idx: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        let Range { start, end } = rows;
        assert!(
            start <= end && end <= self.n,
            "zero psi row_chunk_second_diag row range out of bounds"
        );
        let zeros = Array2::zeros((end - start, self.p));
        Ok(zeros)
    }

    fn row_chunk_second_cross(
        &self,
        idx: usize,
        idx2: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        assert!(idx < usize::MAX);
        assert!(idx2 < usize::MAX);
        let Range { start, end } = rows;
        assert!(
            start <= end && end <= self.n,
            "zero psi row_chunk_second_cross row range out of bounds"
        );
        let zeros = Array2::zeros((end - start, self.p));
        Ok(zeros)
    }
}

/// Vertically stacks `blocks` into one dense matrix.
///
/// The column count is taken from the first block (0 when `blocks` is empty);
/// a block with a different column count triggers a panic.
fn stack_dense_row_blocks(blocks: &[Array2<f64>]) -> Array2<f64> {
    let height: usize = blocks.iter().map(|block| block.nrows()).sum();
    let width = match blocks.first() {
        Some(block) => block.ncols(),
        None => 0,
    };
    let mut stacked = Array2::<f64>::zeros((height, width));
    let mut offset = 0usize;
    for block in blocks.iter() {
        assert_eq!(block.ncols(), width);
        let next_offset = offset + block.nrows();
        let mut target = stacked.slice_mut(ndarray::s![offset..next_offset, ..]);
        target.assign(block);
        offset = next_offset;
    }
    stacked
}

/// Dense psi-derivative operator whose local columns are embedded at
/// `global_range` inside a `total_p`-wide coefficient space.
struct EmbeddedDensePsiDerivativeOperator {
    /// The single axis index this operator answers for; other axes are rejected.
    axis: usize,
    /// Width of the full coefficient vector (the operator's `p_out`).
    total_p: usize,
    /// Columns of the full coefficient space occupied by the local block.
    global_range: Range<usize>,
    /// Dense first-derivative block, `global_range.len()` columns wide.
    first_local: Array2<f64>,
    /// Dense second diagonal-derivative block, `global_range.len()` columns wide.
    second_diag_local: Array2<f64>,
    /// Dense second cross-derivative blocks, keyed by the other axis index.
    second_cross_local: HashMap<usize, Array2<f64>>,
}

impl EmbeddedDensePsiDerivativeOperator {
    /// Builds the operator after validating its dimensions.
    ///
    /// # Errors
    /// Returns a dimension-mismatch message when `global_range` is reversed or
    /// does not fit inside `0..total_p`, or when any derivative block does not
    /// have exactly `global_range.len()` columns.
    fn new(
        axis: usize,
        total_p: usize,
        global_range: Range<usize>,
        first_local: Array2<f64>,
        second_diag_local: Array2<f64>,
        second_cross_local: HashMap<usize, Array2<f64>>,
    ) -> Result<Self, String> {
        // Reject a malformed embedding up front: every later slice with
        // `global_range` into a `total_p`-long vector relies on this.
        if global_range.start > global_range.end || global_range.end > total_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "embedded dense psi operator column range {}..{} is invalid for total width {total_p}",
                    global_range.start, global_range.end
                ),
            }
            .into());
        }
        let local_p = global_range.len();
        if first_local.ncols() != local_p {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "embedded dense psi operator first-derivative width mismatch: got {}, expected {local_p}",
                first_local.ncols()
            ) }.into());
        }
        if second_diag_local.ncols() != local_p {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "embedded dense psi operator second-diag width mismatch: got {}, expected {local_p}",
                second_diag_local.ncols()
            ) }.into());
        }
        for (cross_axis, local) in &second_cross_local {
            if local.ncols() != local_p {
                return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                    "embedded dense psi operator cross axis {cross_axis} width mismatch: got {}, expected {local_p}",
                    local.ncols()
                ) }.into());
            }
        }
        Ok(Self {
            axis,
            total_p,
            global_range,
            first_local,
            second_diag_local,
            second_cross_local,
        })
    }

    /// Succeeds only when `axis` equals the axis this operator was built for;
    /// `context` prefixes the error message otherwise.
    fn validate_axis(
        &self,
        axis: usize,
        context: &str,
    ) -> Result<(), crate::terms::basis::BasisError> {
        if axis == self.axis {
            Ok(())
        } else {
            Err(crate::terms::basis::BasisError::Other(format!(
                "{context} expected axis {}, got {axis}",
                self.axis
            )))
        }
    }

    /// Scatters a local-width vector into a zero vector of length `total_p`
    /// at `global_range`.
    fn embed_vector(&self, local: Array1<f64>) -> Array1<f64> {
        let mut out = Array1::<f64>::zeros(self.total_p);
        out.slice_mut(ndarray::s![self.global_range.clone()])
            .assign(&local);
        out
    }

    /// Extracts the `global_range` entries of a full coefficient vector.
    ///
    /// # Errors
    /// Returns an error when `u` is not `total_p` long.
    fn local_coeffs(
        &self,
        u: &ArrayView1<'_, f64>,
        context: &str,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        if u.len() != self.total_p {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "{context} expected coefficient length {}, got {}",
                self.total_p,
                u.len()
            )));
        }
        Ok(u.slice(ndarray::s![self.global_range.clone()]).to_owned())
    }

    /// Looks up the cross-derivative block stored for `axis_e`.
    ///
    /// # Errors
    /// Returns an error when no block was registered for that axis.
    fn cross_local(
        &self,
        axis_e: usize,
        context: &str,
    ) -> Result<&Array2<f64>, crate::terms::basis::BasisError> {
        self.second_cross_local.get(&axis_e).ok_or_else(|| {
            crate::terms::basis::BasisError::Other(format!(
                "{context} is missing cross-derivative data for axis {}",
                axis_e
            ))
        })
    }
}

impl CustomFamilyPsiDerivativeOperator for EmbeddedDensePsiDerivativeOperator {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn n_data(&self) -> usize {
        self.first_local.nrows()
    }

    fn p_out(&self) -> usize {
        self.total_p
    }

    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi transpose_mul")?;
        if v.len() != self.n_data() {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "embedded dense psi transpose_mul expected {} rows, got {}",
                self.n_data(),
                v.len()
            )));
        }
        Ok(self.embed_vector(crate::faer_ndarray::fast_atv(&self.first_local, v)))
    }

    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi forward_mul")?;
        Ok(self
            .first_local
            .dot(&self.local_coeffs(u, "embedded dense psi forward_mul")?))
    }

    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi transpose_mul_second_diag")?;
        if v.len() != self.second_diag_local.nrows() {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "embedded dense psi transpose_mul_second_diag expected {} rows, got {}",
                self.second_diag_local.nrows(),
                v.len()
            )));
        }
        Ok(self.embed_vector(crate::faer_ndarray::fast_atv(&self.second_diag_local, v)))
    }

    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis_d, "embedded dense psi transpose_mul_second_cross")?;
        let local = self.cross_local(axis_e, "embedded dense psi transpose_mul_second_cross")?;
        if v.len() != local.nrows() {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "embedded dense psi transpose_mul_second_cross expected {} rows, got {}",
                local.nrows(),
                v.len()
            )));
        }
        Ok(self.embed_vector(crate::faer_ndarray::fast_atv(local, v)))
    }

    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi forward_mul_second_diag")?;
        Ok(self
            .second_diag_local
            .dot(&self.local_coeffs(u, "embedded dense psi forward_mul_second_diag")?))
    }

    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis_d, "embedded dense psi forward_mul_second_cross")?;
        Ok(self
            .cross_local(axis_e, "embedded dense psi forward_mul_second_cross")?
            .dot(&self.local_coeffs(u, "embedded dense psi forward_mul_second_cross")?))
    }

    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi row_chunk_first")?;
        let local = self.first_local.slice(ndarray::s![rows, ..]).to_owned();
        Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
    }

    fn row_vector_first_into(
        &self,
        axis: usize,
        row: usize,
        mut out: ArrayViewMut1<'_, f64>,
    ) -> Result<(), crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi row_vector_first_into")?;
        if row >= self.first_local.nrows() {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "embedded dense psi row_vector_first_into row {row} out of bounds for {}",
                self.first_local.nrows()
            )));
        }
        if out.len() != self.total_p {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "embedded dense psi row_vector_first_into expected length {}, got {}",
                self.total_p,
                out.len()
            )));
        }
        out.fill(0.0);
        out.slice_mut(ndarray::s![self.global_range.clone()])
            .assign(&self.first_local.row(row));
        Ok(())
    }

    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi row_chunk_second_diag")?;
        let local = self
            .second_diag_local
            .slice(ndarray::s![rows, ..])
            .to_owned();
        Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
    }

    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis_d, "embedded dense psi row_chunk_second_cross")?;
        let local = self
            .cross_local(axis_e, "embedded dense psi row_chunk_second_cross")?
            .slice(ndarray::s![rows, ..])
            .to_owned();
        Ok(EmbeddedColumnBlock::new(&local, self.global_range.clone(), self.total_p).materialize())
    }

    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        Some(self)
    }
}

impl MaterializablePsiDerivativeOperator for EmbeddedDensePsiDerivativeOperator {
    /// Materializes the whole first-derivative block at full width, after
    /// checking that `axis` is the axis this operator was built for.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        self.validate_axis(axis, "embedded dense psi materialize_first")?;
        let columns = self.global_range.clone();
        let dense = EmbeddedColumnBlock::new(&self.first_local, columns, self.total_p).materialize();
        Ok(dense)
    }
}

pub(crate) fn build_embedded_dense_psi_operator(
    first_local: &Array2<f64>,
    second_diag_local: &Array2<f64>,
    second_cross_local: Option<&Vec<(usize, Array2<f64>)>>,
    global_range: Range<usize>,
    total_p: usize,
    axis: usize,
) -> Result<Arc<dyn CustomFamilyPsiDerivativeOperator>, String> {
    let second_cross_local = second_cross_local
        .map(|rows| {
            rows.iter()
                .map(|(axis, local)| (*axis, local.clone()))
                .collect()
        })
        .unwrap_or_default();
    Ok(Arc::new(EmbeddedDensePsiDerivativeOperator::new(
        axis,
        total_p,
        global_range,
        first_local.clone(),
        second_diag_local.clone(),
        second_cross_local,
    )?))
}

/// Psi-derivative operator formed as a row-wise Kronecker product of a base
/// operator with one time basis per row block.
struct RowwiseKroneckerPsiDerivativeOperator {
    /// Base operator; it must report `n_per_block` data rows.
    base: Arc<dyn CustomFamilyPsiDerivativeOperator>,
    /// One time basis per row block, each `n_per_block x p_time`.
    time_bases: Vec<Arc<Array2<f64>>>,
    /// Number of rows in every block (and in the base operator).
    n_per_block: usize,
    /// Number of columns in every time basis.
    p_time: usize,
    /// Lifted coefficient width, `base.p_out() * p_time`.
    p_out: usize,
}

impl RowwiseKroneckerPsiDerivativeOperator {
    fn new(
        base: Arc<dyn CustomFamilyPsiDerivativeOperator>,
        time_bases: Vec<Arc<Array2<f64>>>,
    ) -> Result<Self, String> {
        let first = time_bases.first().ok_or_else(|| {
            "rowwise kronecker psi operator needs at least one time basis".to_string()
        })?;
        let n_per_block = first.nrows();
        let p_time = first.ncols();
        for (idx, basis) in time_bases.iter().enumerate() {
            if basis.nrows() != n_per_block || basis.ncols() != p_time {
                return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                    "rowwise kronecker psi operator time basis {idx} shape mismatch: got {}x{}, expected {}x{}",
                    basis.nrows(),
                    basis.ncols(),
                    n_per_block,
                    p_time
                ) }.into());
            }
        }
        if base.n_data() != n_per_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "rowwise kronecker psi operator base row mismatch: got {}, expected {n_per_block}",
                base.n_data()
            ) }.into());
        }
        Ok(Self {
            p_out: base.p_out() * p_time,
            base,
            time_bases,
            n_per_block,
            p_time,
        })
    }

    fn split_time_columns(&self, u: &ArrayView1<'_, f64>) -> Vec<Array1<f64>> {
        let p_base = self.base.p_out();
        assert_eq!(u.len(), self.p_out);
        let mut cols = vec![Array1::<f64>::zeros(p_base); self.p_time];
        for j in 0..p_base {
            for t in 0..self.p_time {
                cols[t][j] = u[j * self.p_time + t];
            }
        }
        cols
    }

    fn lifted_row_chunk_with_base<F>(
        &self,
        rows: Range<usize>,
        mut base_chunk: F,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError>
    where
        F: FnMut(Range<usize>) -> Result<Array2<f64>, crate::terms::basis::BasisError>,
    {
        if rows.start > rows.end || rows.end > self.n_data() {
            return Err(crate::terms::basis::BasisError::Other(format!(
                "rowwise kronecker psi row chunk {}..{} out of bounds for {} rows",
                rows.start,
                rows.end,
                self.n_data()
            )));
        }
        if rows.is_empty() {
            return Ok(Array2::<f64>::zeros((0, self.p_out)));
        }

        let first_block = rows.start / self.n_per_block;
        let last_block = (rows.end - 1) / self.n_per_block;
        let mut blocks = Vec::with_capacity(last_block + 1 - first_block);
        for block_idx in first_block..=last_block {
            let block_global_start = block_idx * self.n_per_block;
            let local_start = rows.start.saturating_sub(block_global_start);
            let local_end = (rows.end - block_global_start).min(self.n_per_block);
            let local_rows = local_start..local_end;
            let base = base_chunk(local_rows.clone())?;
            let time = self.time_bases[block_idx]
                .slice(ndarray::s![local_rows, ..])
                .to_owned();
            blocks.push(dense_rowwise_kronecker(base.view(), time.view()));
        }
        Ok(stack_dense_row_blocks(&blocks))
    }

    /// Canonical transpose-direction lifted matvec: for each time column `t`,
    /// weight `v` by the time basis column, delegate to the base operator via
    /// `base_op`, and scatter the per-base accumulator into the lifted layout.
    fn lifted_transpose_mul_with_base<F>(
        &self,
        v: &ArrayView1<'_, f64>,
        mut base_op: F,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>
    where
        F: FnMut(&ArrayView1<'_, f64>) -> Result<Array1<f64>, crate::terms::basis::BasisError>,
    {
        assert_eq!(v.len(), self.n_data());
        let p_base = self.base.p_out();
        let mut out = Array1::<f64>::zeros(self.p_out);
        for t in 0..self.p_time {
            let mut accum = Array1::<f64>::zeros(p_base);
            for (block_idx, time_basis) in self.time_bases.iter().enumerate() {
                let row_start = block_idx * self.n_per_block;
                let row_end = row_start + self.n_per_block;
                let weighted = &v.slice(ndarray::s![row_start..row_end]).to_owned()
                    * &time_basis.column(t).to_owned();
                accum += &base_op(&weighted.view())?;
            }
            for j in 0..p_base {
                out[j * self.p_time + t] = accum[j];
            }
        }
        Ok(out)
    }

    /// Canonical forward-direction lifted matvec: split `u` into per-time-column
    /// coefficient vectors, delegate each to the base operator via `base_op`, and
    /// accumulate the time-basis-weighted contributions into the block rows.
    fn lifted_forward_mul_with_base<F>(
        &self,
        u: &ArrayView1<'_, f64>,
        mut base_op: F,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError>
    where
        F: FnMut(&ArrayView1<'_, f64>) -> Result<Array1<f64>, crate::terms::basis::BasisError>,
    {
        let time_cols = self.split_time_columns(u);
        let mut out = Array1::<f64>::zeros(self.n_data());
        for (t, coeffs) in time_cols.iter().enumerate() {
            let base_eval = base_op(&coeffs.view())?;
            for (block_idx, time_basis) in self.time_bases.iter().enumerate() {
                let row_start = block_idx * self.n_per_block;
                let row_end = row_start + self.n_per_block;
                let contrib = &base_eval * &time_basis.column(t).to_owned();
                let mut out_block = out.slice_mut(ndarray::s![row_start..row_end]);
                out_block += &contrib;
            }
        }
        Ok(out)
    }
}

impl CustomFamilyPsiDerivativeOperator for RowwiseKroneckerPsiDerivativeOperator {
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Total lifted row count: one block of `n_per_block` rows per time basis.
    fn n_data(&self) -> usize {
        self.n_per_block * self.time_bases.len()
    }

    /// Lifted coefficient width.
    fn p_out(&self) -> usize {
        self.p_out
    }

    /// Lifted transposed first-derivative product.
    fn transpose_mul(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_transpose_mul_with_base(v, |scaled| base.transpose_mul(axis, scaled))
    }

    /// Lifted forward first-derivative product.
    fn forward_mul(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_forward_mul_with_base(u, |column| base.forward_mul(axis, column))
    }

    /// Lifted transposed second diagonal-derivative product.
    fn transpose_mul_second_diag(
        &self,
        axis: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_transpose_mul_with_base(v, |scaled| {
            base.transpose_mul_second_diag(axis, scaled)
        })
    }

    /// Lifted transposed second cross-derivative product.
    fn transpose_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        v: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_transpose_mul_with_base(v, |scaled| {
            base.transpose_mul_second_cross(axis_d, axis_e, scaled)
        })
    }

    /// Lifted forward second diagonal-derivative product.
    fn forward_mul_second_diag(
        &self,
        axis: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_forward_mul_with_base(u, |column| base.forward_mul_second_diag(axis, column))
    }

    /// Lifted forward second cross-derivative product.
    fn forward_mul_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        u: &ArrayView1<'_, f64>,
    ) -> Result<Array1<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_forward_mul_with_base(u, |column| {
            base.forward_mul_second_cross(axis_d, axis_e, column)
        })
    }

    /// Lifted first-derivative rows for `rows`.
    fn row_chunk_first(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_row_chunk_with_base(rows, |block_rows| base.row_chunk_first(axis, block_rows))
    }

    /// Lifted second diagonal-derivative rows for `rows`.
    fn row_chunk_second_diag(
        &self,
        axis: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_row_chunk_with_base(rows, |block_rows| {
            base.row_chunk_second_diag(axis, block_rows)
        })
    }

    /// Lifted second cross-derivative rows for `rows`.
    fn row_chunk_second_cross(
        &self,
        axis_d: usize,
        axis_e: usize,
        rows: Range<usize>,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let base = &self.base;
        self.lifted_row_chunk_with_base(rows, |block_rows| {
            base.row_chunk_second_cross(axis_d, axis_e, block_rows)
        })
    }

    fn as_materializable(&self) -> Option<&dyn MaterializablePsiDerivativeOperator> {
        Some(self)
    }
}

impl MaterializablePsiDerivativeOperator for RowwiseKroneckerPsiDerivativeOperator {
    /// Materializes the lifted first-derivative matrix by combining the base
    /// operator's dense first derivative with each time basis and stacking the
    /// per-block results.
    ///
    /// Fails when the base operator cannot be materialized.
    fn materialize_first(
        &self,
        axis: usize,
    ) -> Result<Array2<f64>, crate::terms::basis::BasisError> {
        let Some(materializable) = self.base.as_materializable() else {
            return Err(crate::terms::basis::BasisError::Other(
                "rowwise kronecker psi operator: base operator does not support materialization"
                    .to_string(),
            ));
        };
        let base_dense = materializable.materialize_first(axis)?;
        let mut per_block: Vec<Array2<f64>> = Vec::with_capacity(self.time_bases.len());
        for time_basis in &self.time_bases {
            per_block.push(dense_rowwise_kronecker(
                base_dense.view(),
                time_basis.view(),
            ));
        }
        Ok(stack_dense_row_blocks(&per_block))
    }
}

/// Wraps `base` and its per-block `time_bases` in a row-wise Kronecker psi
/// operator and returns it as a shared trait object; construction errors are
/// propagated unchanged.
pub(crate) fn build_rowwise_kronecker_psi_operator(
    base: Arc<dyn CustomFamilyPsiDerivativeOperator>,
    time_bases: Vec<Arc<Array2<f64>>>,
) -> Result<Arc<dyn CustomFamilyPsiDerivativeOperator>, String> {
    let lifted = RowwiseKroneckerPsiDerivativeOperator::new(base, time_bases)?;
    let shared: Arc<dyn CustomFamilyPsiDerivativeOperator> = Arc::new(lifted);
    Ok(shared)
}

/// Row-windowed view of the first psi-derivative of a design matrix, backed
/// by an implicit derivative operator.
#[derive(Clone)]
pub(crate) struct CustomFamilyPsiDesignAction {
    /// Shared operator that performs the actual products and row extraction.
    operator: Arc<dyn CustomFamilyPsiDerivativeOperator>,
    /// Axis index forwarded to every operator call.
    axis: usize,
    /// Window of operator rows exposed by this action (absolute row indices).
    row_range: Range<usize>,
    /// Coefficient width expected for inputs and produced for rows.
    p: usize,
}

impl CustomFamilyPsiDesignAction {
    /// Builds an action over `row_range` from the implicit operator carried by
    /// `deriv`.
    ///
    /// # Errors
    /// Returns an error when `row_range` is reversed or exceeds `total_rows`,
    /// or when `deriv` has no implicit operator of shape `total_rows x p`.
    pub(crate) fn from_first_derivative(
        deriv: &CustomFamilyBlockPsiDerivative,
        total_rows: usize,
        p: usize,
        row_range: Range<usize>,
        label: &str,
    ) -> Result<Self, String> {
        // A reversed window would make `end - start` arithmetic meaningless
        // everywhere downstream, so reject it at construction time.
        if row_range.start > row_range.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} is reversed",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if row_range.end > total_rows {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} exceeds total rows {total_rows}",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if let Some(op) = deriv.implicit_operator.as_ref()
            && op.n_data() == total_rows
            && op.p_out() == p
        {
            return Ok(Self {
                operator: Arc::clone(op),
                axis: deriv.implicit_axis,
                row_range,
                p,
            });
        }
        Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
            "{label} is missing an implicit x_psi operator with shape {}x{}; got dense payload {}x{} instead",
            total_rows,
            p,
            deriv.x_psi.nrows(),
            deriv.x_psi.ncols(),
        ) }.into())
    }

    /// Always `true`: this action is only ever backed by an implicit operator.
    pub(crate) fn is_implicit(&self) -> bool {
        true
    }

    /// Number of rows in this action's window.
    pub(crate) fn nrows(&self) -> usize {
        // `Range::len` cannot underflow, unlike a manual `end - start`.
        self.row_range.len()
    }

    /// Checks that a window-relative row range is well formed and in bounds.
    fn validate_relative_rows(&self, rows: &Range<usize>) -> Result<(), String> {
        if rows.start > rows.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row range {}..{} is reversed",
                    rows.start, rows.end
                ),
            }
            .into());
        }
        if rows.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row range {}..{} exceeds available rows {}",
                    rows.start,
                    rows.end,
                    self.nrows()
                ),
            }
            .into());
        }
        Ok(())
    }

    /// Returns a new action restricted to `row_range`, given relative to this
    /// action's own window.
    ///
    /// # Errors
    /// Returns an error when `row_range` is reversed or exceeds `nrows()`.
    pub(crate) fn slice_rows(&self, row_range: Range<usize>) -> Result<Self, String> {
        self.validate_relative_rows(&row_range)?;
        Ok(Self {
            operator: Arc::clone(&self.operator),
            axis: self.axis,
            row_range: self.absolute_rows(row_range),
            p: self.p,
        })
    }

    /// Applies the operator to `u` and returns only the rows of this window.
    ///
    /// Panics when `u` is not `p` long or when the operator reports an error.
    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(u.len(), self.p);
        self.operator
            .forward_mul(self.axis, &u)
            .expect("radial scalar evaluation failed during implicit psi forward_mul")
            .slice(ndarray::s![self.row_range.clone()])
            .to_owned()
    }

    /// Applies the transposed operator to `v`, treating rows outside this
    /// window as zero.
    ///
    /// Panics when `v` is not `nrows()` long or when the operator reports an error.
    pub(crate) fn transpose_mul(&self, v: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.nrows());
        if self.row_range.start == 0 && self.row_range.end == self.operator.n_data() {
            self.operator
                .transpose_mul(self.axis, &v)
                .expect("radial scalar evaluation failed during implicit psi transpose_mul")
        } else {
            // Zero-pad `v` to the operator's full row count so the window
            // lines up with the operator's absolute rows.
            let mut expanded = Array1::<f64>::zeros(self.operator.n_data());
            expanded
                .slice_mut(ndarray::s![self.row_range.clone()])
                .assign(&v);
            self.operator
                .transpose_mul(self.axis, &expanded.view())
                .expect("radial scalar evaluation failed during implicit psi transpose_mul")
        }
    }

    /// Translates a window-relative row range into operator (absolute) rows.
    fn absolute_rows(&self, rows: Range<usize>) -> Range<usize> {
        (self.row_range.start + rows.start)..(self.row_range.start + rows.end)
    }

    /// Materializes the rows `rows` (relative to this window).
    ///
    /// # Errors
    /// Returns an error when `rows` is reversed or exceeds `nrows()`, or when
    /// the operator fails.
    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        self.validate_relative_rows(&rows)?;
        self.operator
            .row_chunk_first(self.axis, self.absolute_rows(rows))
            .map_err(|e| e.to_string())
    }

    /// Materializes a single row (relative to this window) as a `p`-long vector.
    ///
    /// # Errors
    /// Returns an error when `row` is not below `nrows()`, or when the
    /// operator fails.
    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        if row >= self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi design row {row} exceeds available rows {}",
                    self.nrows()
                ),
            }
            .into());
        }
        let absolute_row = self.row_range.start + row;
        let mut out = Array1::<f64>::zeros(self.p);
        self.operator
            .row_vector_first_into(self.axis, absolute_row, out.view_mut())
            .map_err(|e| e.to_string())?;
        Ok(out)
    }
}

/// Selects which second-derivative family of the operator an action uses.
#[derive(Clone, Copy)]
enum CustomFamilyPsiSecondDesignLevel {
    /// Second derivative with respect to one axis twice; carries that axis.
    Diag(usize),
    /// Mixed second derivative; carries the two distinct axes in call order.
    Cross(usize, usize),
}

/// Row-windowed view of a second psi-derivative of a design matrix, backed by
/// an implicit derivative operator.
#[derive(Clone)]
pub(crate) struct CustomFamilyPsiSecondDesignAction {
    /// Shared operator that performs the actual products and row extraction.
    operator: Arc<dyn CustomFamilyPsiDerivativeOperator>,
    /// Whether the diagonal or a cross second derivative is requested.
    level: CustomFamilyPsiSecondDesignLevel,
    /// Window of operator rows exposed by this action (absolute row indices).
    row_range: Range<usize>,
    /// Coefficient width expected for inputs.
    p: usize,
}

impl CustomFamilyPsiSecondDesignAction {
    /// Builds a second-derivative action for the pair (`deriv_i`, `deriv_j`).
    ///
    /// Returns `Ok(None)` when `deriv_i` has no implicit operator or when the
    /// two derivatives do not share a (present) implicit group id. Equal axes
    /// select the diagonal second derivative, different axes the cross one.
    ///
    /// # Errors
    /// Returns an error when `row_range` is reversed or exceeds `total_rows`,
    /// or when the implicit operator is not `total_rows x p`.
    pub(crate) fn from_second_derivative(
        deriv_i: &CustomFamilyBlockPsiDerivative,
        deriv_j: &CustomFamilyBlockPsiDerivative,
        total_rows: usize,
        p: usize,
        row_range: Range<usize>,
        label: &str,
    ) -> Result<Option<Self>, String> {
        // A reversed window would make `end - start` arithmetic meaningless
        // everywhere downstream, so reject it at construction time.
        if row_range.start > row_range.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} is reversed",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        if row_range.end > total_rows {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "{label} row range {}..{} exceeds total rows {total_rows}",
                    row_range.start, row_range.end
                ),
            }
            .into());
        }
        let Some(op) = deriv_i.implicit_operator.as_ref() else {
            return Ok(None);
        };
        if op.n_data() != total_rows || op.p_out() != p {
            return Err(CustomFamilyError::UnsupportedConfiguration {
                reason: format!(
                    "{label} is missing an implicit x_psi_psi operator with shape {}x{}",
                    total_rows, p
                ),
            }
            .into());
        }
        let same_group = deriv_i.implicit_group_id.is_some()
            && deriv_i.implicit_group_id == deriv_j.implicit_group_id;
        if !same_group {
            return Ok(None);
        }
        let level = if deriv_i.implicit_axis == deriv_j.implicit_axis {
            CustomFamilyPsiSecondDesignLevel::Diag(deriv_i.implicit_axis)
        } else {
            CustomFamilyPsiSecondDesignLevel::Cross(deriv_i.implicit_axis, deriv_j.implicit_axis)
        };
        Ok(Some(Self {
            operator: Arc::clone(op),
            level,
            row_range,
            p,
        }))
    }

    /// Number of rows in this action's window.
    pub(crate) fn nrows(&self) -> usize {
        // `Range::len` cannot underflow, unlike a manual `end - start`.
        self.row_range.len()
    }

    /// Applies the selected second-derivative operator to `u` and returns only
    /// the rows of this window.
    ///
    /// Panics when `u` is not `p` long or when the operator reports an error.
    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(u.len(), self.p);
        let out = match self.level {
            CustomFamilyPsiSecondDesignLevel::Diag(axis) => self
                .operator
                .forward_mul_second_diag(axis, &u)
                .expect("radial scalar evaluation failed during implicit psi second forward_mul"),
            CustomFamilyPsiSecondDesignLevel::Cross(axis_d, axis_e) => self
                .operator
                .forward_mul_second_cross(axis_d, axis_e, &u)
                .expect("radial scalar evaluation failed during implicit psi second forward_mul"),
        };
        out.slice(ndarray::s![self.row_range.clone()]).to_owned()
    }

    /// Applies the transposed second-derivative operator to `v`, treating rows
    /// outside this window as zero.
    ///
    /// Panics when `v` is not `nrows()` long or when the operator reports an error.
    pub(crate) fn transpose_mul(&self, v: ArrayView1<'_, f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.nrows());
        // Only allocate a zero-padded copy when the window is a strict subset
        // of the operator's rows.
        let expanded = if self.row_range.start == 0 && self.row_range.end == self.operator.n_data()
        {
            None
        } else {
            let mut expanded = Array1::<f64>::zeros(self.operator.n_data());
            expanded
                .slice_mut(ndarray::s![self.row_range.clone()])
                .assign(&v);
            Some(expanded)
        };
        let full_v = expanded.as_ref().map_or(v, |arr| arr.view());
        match self.level {
            CustomFamilyPsiSecondDesignLevel::Diag(axis) => self
                .operator
                .transpose_mul_second_diag(axis, &full_v)
                .expect("radial scalar evaluation failed during implicit psi second transpose_mul"),
            CustomFamilyPsiSecondDesignLevel::Cross(axis_d, axis_e) => self
                .operator
                .transpose_mul_second_cross(axis_d, axis_e, &full_v)
                .expect("radial scalar evaluation failed during implicit psi second transpose_mul"),
        }
    }

    /// Translates a window-relative row range into operator (absolute) rows.
    fn absolute_rows(&self, rows: Range<usize>) -> Range<usize> {
        (self.row_range.start + rows.start)..(self.row_range.start + rows.end)
    }

    /// Materializes the rows `rows` (relative to this window) of the selected
    /// second derivative.
    ///
    /// # Errors
    /// Returns an error when `rows` is reversed or exceeds `nrows()`, or when
    /// the operator fails.
    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        if rows.start > rows.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi second-design row range {}..{} is reversed",
                    rows.start, rows.end
                ),
            }
            .into());
        }
        if rows.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi second-design row range {}..{} exceeds available rows {}",
                    rows.start,
                    rows.end,
                    self.nrows()
                ),
            }
            .into());
        }
        match self.level {
            CustomFamilyPsiSecondDesignLevel::Diag(axis) => self
                .operator
                .row_chunk_second_diag(axis, self.absolute_rows(rows))
                .map_err(|e| e.to_string()),
            CustomFamilyPsiSecondDesignLevel::Cross(axis_d, axis_e) => self
                .operator
                .row_chunk_second_cross(axis_d, axis_e, self.absolute_rows(rows))
                .map_err(|e| e.to_string()),
        }
    }

    /// Materializes a single row by requesting the one-row chunk `row..row + 1`.
    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        self.row_chunk(row..row + 1).map(|m| m.row(0).to_owned())
    }
}

/// Borrowed linear map used for psi-derivative designs: a dense matrix, a
/// first- or second-derivative action, or an implicit all-zero map.
#[derive(Clone, Copy)]
pub(crate) enum CustomFamilyPsiLinearMapRef<'a> {
    /// Explicit dense matrix.
    Dense(&'a Array2<f64>),
    /// Operator-backed first psi-derivative action.
    First(&'a CustomFamilyPsiDesignAction),
    /// Operator-backed second psi-derivative action.
    Second(&'a CustomFamilyPsiSecondDesignAction),
    /// All-zero map of the given shape; nothing is stored.
    Zero { nrows: usize, ncols: usize },
}

/// Read-only linear-map API; every method dispatches on the variant.
impl CustomFamilyPsiLinearMapRef<'_> {
    /// Number of rows of the represented map.
    pub(crate) fn nrows(&self) -> usize {
        match self {
            Self::Dense(mat) => mat.nrows(),
            Self::First(action) => action.nrows(),
            Self::Second(action) => action.nrows(),
            Self::Zero { nrows, .. } => *nrows,
        }
    }

    /// Number of columns of the represented map.
    pub(crate) fn ncols(&self) -> usize {
        match self {
            Self::Dense(mat) => mat.ncols(),
            Self::First(action) => action.p,
            Self::Second(action) => action.p,
            Self::Zero { ncols, .. } => *ncols,
        }
    }

    /// Apply the map to `u`, returning a vector with `nrows()` entries for the
    /// `Dense` and `Zero` variants and whatever the action returns otherwise.
    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Array1<f64> {
        match self {
            Self::Dense(mat) => mat.dot(&u),
            Self::First(action) => action.forward_mul(u),
            Self::Second(action) => action.forward_mul(u),
            Self::Zero { nrows, .. } => Array1::<f64>::zeros(*nrows),
        }
    }

    /// Apply the transposed map to `v`.
    pub(crate) fn transpose_mul(&self, v: ArrayView1<'_, f64>) -> Array1<f64> {
        match self {
            Self::Dense(mat) => crate::faer_ndarray::fast_atv(mat, &v),
            Self::First(action) => action.transpose_mul(v),
            Self::Second(action) => action.transpose_mul(v),
            Self::Zero { ncols, .. } => Array1::<f64>::zeros(*ncols),
        }
    }

    /// Return row `row` of the map as an owned vector.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when `row >= nrows()`, or the
    /// error reported by the underlying action.
    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        if row >= self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi linear-map row {row} out of bounds for {} rows",
                    self.nrows()
                ),
            }
            .into());
        }
        Ok(match self {
            Self::Dense(mat) => mat.row(row).to_owned(),
            Self::First(action) => action.row_vector(row)?,
            Self::Second(action) => action.row_vector(row)?,
            Self::Zero { ncols, .. } => Array1::<f64>::zeros(*ncols),
        })
    }

    /// Materialize the rows `rows` of the map as a dense matrix.
    ///
    /// # Errors
    /// Returns a `DimensionMismatch` message when the range is reversed
    /// (`rows.start > rows.end`) or when `rows.end > nrows()`, or the error
    /// reported by the underlying action.
    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        // A reversed range would underflow `rows.end - rows.start` in the
        // `Zero` arm (panic in debug builds, wrapped huge shape in release);
        // report it as an error instead.
        if rows.start > rows.end {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi linear-map row range {}..{} is reversed",
                    rows.start, rows.end
                ),
            }
            .into());
        }
        if rows.end > self.nrows() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "psi linear-map row range {}..{} out of bounds for {} rows",
                    rows.start,
                    rows.end,
                    self.nrows()
                ),
            }
            .into());
        }
        Ok(match self {
            Self::Dense(mat) => mat.slice(ndarray::s![rows, ..]).to_owned(),
            Self::First(action) => action.row_chunk(rows)?,
            Self::Second(action) => action.row_chunk(rows)?,
            Self::Zero { ncols, .. } => Array2::<f64>::zeros((rows.end - rows.start, *ncols)),
        })
    }
}

/// Owned counterpart of [`CustomFamilyPsiLinearMapRef`]: a ψ design-derivative
/// map that is either identically zero, a shared dense matrix, or an operator
/// action.
#[derive(Clone)]
pub(crate) enum PsiDesignMap {
    /// All-zero map of the recorded shape.
    Zero {
        nrows: usize,
        ncols: usize,
    },
    /// Dense matrix behind an `Arc`, so cloning the map does not copy the data.
    Dense {
        matrix: Arc<Array2<f64>>,
    },
    /// First-derivative design action.
    First {
        action: CustomFamilyPsiDesignAction,
    },
    /// Second-derivative design action.
    Second {
        action: CustomFamilyPsiSecondDesignAction,
    },
}

impl PsiDesignMap {
    pub(crate) fn ncols(&self) -> usize {
        match self {
            Self::Zero { ncols, .. } => *ncols,
            Self::Dense { matrix } => matrix.ncols(),
            Self::First { action } => action.p,
            Self::Second { action } => action.p,
        }
    }

    pub(crate) fn forward_mul(&self, u: ArrayView1<'_, f64>) -> Result<Array1<f64>, String> {
        match self {
            Self::Zero { nrows, .. } => Ok(Array1::<f64>::zeros(*nrows)),
            Self::Dense { matrix } => Ok(matrix.dot(&u)),
            Self::First { action } => Ok(action.forward_mul(u)),
            Self::Second { action } => Ok(action.forward_mul(u)),
        }
    }

    pub(crate) fn row_chunk(&self, rows: Range<usize>) -> Result<Array2<f64>, String> {
        let ncols = self.ncols();
        match self {
            Self::Zero { .. } => Ok(Array2::<f64>::zeros((rows.end - rows.start, ncols))),
            Self::Dense { matrix } => Ok(matrix.slice(ndarray::s![rows, ..]).to_owned()),
            Self::First { action } => action.row_chunk(rows),
            Self::Second { action } => action.row_chunk(rows),
        }
    }

    pub(crate) fn row_vector(&self, row: usize) -> Result<Array1<f64>, String> {
        match self {
            Self::Zero { ncols, .. } => Ok(Array1::<f64>::zeros(*ncols)),
            Self::Dense { matrix } => Ok(matrix.row(row).to_owned()),
            Self::First { action } => action.row_vector(row),
            Self::Second { action } => action.row_vector(row),
        }
    }

    /// Borrow this map as a `CustomFamilyPsiLinearMapRef`, handling every
    /// variant. This is the zero-allocation replacement for the pattern
    /// `first_psi_linear_map(action.as_ref(), dense.as_ref(), n, p)`.
    pub(crate) fn as_linear_map_ref(&self) -> CustomFamilyPsiLinearMapRef<'_> {
        match self {
            Self::Zero { nrows, ncols } => CustomFamilyPsiLinearMapRef::Zero {
                nrows: *nrows,
                ncols: *ncols,
            },
            Self::Dense { matrix } => CustomFamilyPsiLinearMapRef::Dense(matrix.as_ref()),
            Self::First { action } => CustomFamilyPsiLinearMapRef::First(action),
            Self::Second { action } => CustomFamilyPsiLinearMapRef::Second(action),
        }
    }

    /// Return a reference to the first-derivative operator action if this map
    /// holds one. Useful for callers that need to pass ownership of the action
    /// into downstream operator builders.
    pub(crate) fn as_first_action(&self) -> Option<&CustomFamilyPsiDesignAction> {
        match self {
            Self::First { action } => Some(action),
            _ => None,
        }
    }

    /// Clone the first-derivative operator action if this map holds one.
    pub(crate) fn cloned_first_action(&self) -> Option<CustomFamilyPsiDesignAction> {
        self.as_first_action().cloned()
    }
}

/// Report whether every entry of `a` compares equal to `0.0`.
///
/// An empty array counts as zero; any NaN entry makes the result `false`.
fn is_zero_array(a: &Array2<f64>) -> bool {
    let has_nonzero = a.iter().any(|&entry| entry != 0.0);
    !has_nonzero
}

pub(crate) fn weighted_crossprod_psi_maps(
    left: CustomFamilyPsiLinearMapRef<'_>,
    weights: ArrayView1<'_, f64>,
    right: CustomFamilyPsiLinearMapRef<'_>,
) -> Result<Array2<f64>, String> {
    if left.nrows() != weights.len() || right.nrows() != weights.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "psi weighted crossprod row mismatch: left={}, weights={}, right={}",
                left.nrows(),
                weights.len(),
                right.nrows()
            ),
        }
        .into());
    }
    let p_left = left.ncols();
    let p_right = right.ncols();
    if p_left == 0 || p_right == 0 {
        return Ok(Array2::<f64>::zeros((p_left, p_right)));
    }
    // Zero fast path: either operand being the Zero variant makes the full product zero.
    if matches!(left, CustomFamilyPsiLinearMapRef::Zero { .. })
        || matches!(right, CustomFamilyPsiLinearMapRef::Zero { .. })
    {
        return Ok(Array2::<f64>::zeros((p_left, p_right)));
    }

    let mut out = Array2::<f64>::zeros((p_left, p_right));
    // Stream row chunks of both operands so the weighted intermediate is never
    // materialized at full n x p_right size. Chunk size is governed by the
    // resource policy's row_chunk_target_bytes.
    let policy = ResourcePolicy::default_library();
    let rows_per_chunk = crate::resource::rows_for_target_bytes(
        policy.row_chunk_target_bytes,
        p_left.saturating_add(p_right).max(1),
    );

    let n = weights.len();
    for start in (0..n).step_by(rows_per_chunk) {
        let end = (start + rows_per_chunk).min(n);
        let rows = start..end;
        let xl = left.row_chunk(rows.clone())?;
        let mut xr = right.row_chunk(rows.clone())?;
        for local in 0..xr.nrows() {
            let w = weights[start + local];
            if w != 1.0 {
                for j in 0..p_right {
                    xr[[local, j]] *= w;
                }
            }
        }
        out += &fast_atb(&xl, &xr);
    }
    Ok(out)
}

/// Pick the linear-map view for a first ψ design derivative.
///
/// Priority: an operator `action` if present; otherwise the `dense` matrix
/// when its shape is exactly `nrows × ncols`; otherwise a zero map of that
/// shape.
pub(crate) fn first_psi_linear_map<'a>(
    action: Option<&'a CustomFamilyPsiDesignAction>,
    dense: Option<&'a Array2<f64>>,
    nrows: usize,
    ncols: usize,
) -> CustomFamilyPsiLinearMapRef<'a> {
    match (action, dense) {
        (Some(op_action), _) => CustomFamilyPsiLinearMapRef::First(op_action),
        (None, Some(matrix)) if matrix.nrows() == nrows && matrix.ncols() == ncols => {
            CustomFamilyPsiLinearMapRef::Dense(matrix)
        }
        (None, _) => CustomFamilyPsiLinearMapRef::Zero { nrows, ncols },
    }
}

/// Pick the linear-map view for a second ψ design derivative.
///
/// Priority: an operator `action` if present; otherwise the `dense` matrix
/// when its shape is exactly `nrows × ncols`; otherwise a zero map of that
/// shape.
pub(crate) fn second_psi_linear_map<'a>(
    action: Option<&'a CustomFamilyPsiSecondDesignAction>,
    dense: Option<&'a Array2<f64>>,
    nrows: usize,
    ncols: usize,
) -> CustomFamilyPsiLinearMapRef<'a> {
    match (action, dense) {
        (Some(op_action), _) => CustomFamilyPsiLinearMapRef::Second(op_action),
        (None, Some(matrix)) if matrix.nrows() == nrows && matrix.ncols() == ncols => {
            CustomFamilyPsiLinearMapRef::Dense(matrix)
        }
        (None, _) => CustomFamilyPsiLinearMapRef::Zero { nrows, ncols },
    }
}

/// One design channel of a joint ψ operator: a design matrix acting on a
/// contiguous slice of the joint coefficient vector, with an optional
/// ψ-derivative action on the same slice.
pub(crate) struct CustomFamilyJointDesignChannel {
    /// Index range of the joint coefficient vector this channel reads from.
    range: Range<usize>,
    /// Base design applied to the channel's coefficient slice.
    design: DesignMatrix,
    /// ψ-derivative of the design, when this channel has one.
    psi_derivative: Option<CustomFamilyPsiDesignAction>,
}

impl CustomFamilyJointDesignChannel {
    /// Build a channel from its coefficient `range`, anything convertible into
    /// a `DesignMatrix`, and an optional ψ-derivative action.
    pub(crate) fn new<D>(
        range: Range<usize>,
        design: D,
        psi_derivative: Option<CustomFamilyPsiDesignAction>,
    ) -> Self
    where
        D: Into<DesignMatrix>,
    {
        let design: DesignMatrix = design.into();
        Self {
            psi_derivative,
            design,
            range,
        }
    }

    /// Copy this channel's slice out of the joint coefficient vector.
    fn coefficients(&self, full: &Array1<f64>) -> Array1<f64> {
        let window = self.range.clone();
        full.slice(ndarray::s![window]).to_owned()
    }

    /// Apply the base design to this channel's slice of `full`.
    fn apply(&self, full: &Array1<f64>) -> Array1<f64> {
        let local_coefficients = self.coefficients(full);
        self.design.matrixvectormultiply(&local_coefficients)
    }

    /// Apply the transposed base design to a row-space vector.
    fn apply_transpose(&self, values: &Array1<f64>) -> Array1<f64> {
        let design = &self.design;
        design.transpose_vector_multiply(values)
    }
}

/// One (left channel, right channel) term of a joint ψ operator, with the two
/// per-row weight vectors that scale its products.
pub(crate) struct CustomFamilyJointDesignPairContribution {
    /// Index into the operator's channel list; the contribution is written to
    /// this channel's coefficient range.
    left_channel: usize,
    /// Index into the operator's channel list whose values feed the product.
    right_channel: usize,
    /// Per-row weights applied to the terms that involve a ψ-derivative design.
    weights: Array1<f64>,
    /// Per-row weights applied to the base-design × base-design term.
    drift_weights: Array1<f64>,
}

impl CustomFamilyJointDesignPairContribution {
    /// Bundle a channel pair with its two per-row weight vectors. No
    /// validation is performed here.
    pub(crate) fn new(
        left_channel: usize,
        right_channel: usize,
        weights: Array1<f64>,
        drift_weights: Array1<f64>,
    ) -> Self {
        Self {
            drift_weights,
            weights,
            right_channel,
            left_channel,
        }
    }
}

/// Operator over the joint coefficient space, assembled from per-channel
/// designs and weighted channel-pair contributions.
pub(crate) struct CustomFamilyJointPsiOperator {
    /// Dimension of the joint coefficient vector the operator acts on.
    total_dim: usize,
    /// Design channels referenced by index from `pair_contributions`.
    channels: Vec<CustomFamilyJointDesignChannel>,
    /// Weighted channel-pair terms summed by the operator.
    pair_contributions: Vec<CustomFamilyJointDesignPairContribution>,
    /// Optional dense correction for small cross-blocks (e.g. h/w parameters)
    /// that don't warrant their own weighted-Gram channel.
    dense_correction: Option<Array2<f64>>,
}

impl CustomFamilyJointPsiOperator {
    /// Create an operator of dimension `total_dim` from its channels and pair
    /// contributions. The dense correction starts out absent.
    pub(crate) fn new(
        total_dim: usize,
        channels: Vec<CustomFamilyJointDesignChannel>,
        pair_contributions: Vec<CustomFamilyJointDesignPairContribution>,
    ) -> Self {
        let dense_correction = None;
        Self {
            dense_correction,
            pair_contributions,
            channels,
            total_dim,
        }
    }
}

/// Private kernels shared by the `HyperOperator` implementation below.
impl CustomFamilyJointPsiOperator {
    /// Apply every channel's base design to its coefficient slice of `v`.
    fn channel_base_values(&self, v: &Array1<f64>) -> Vec<Array1<f64>> {
        self.channels
            .iter()
            .map(|channel| channel.apply(v))
            .collect()
    }

    /// Apply every channel's ψ-derivative action to its coefficient slice of
    /// `v`; channels without a ψ-derivative yield `None`.
    fn channel_derivative_values(&self, v: &Array1<f64>) -> Vec<Option<Array1<f64>>> {
        self.channels
            .iter()
            .map(|channel| {
                channel
                    .psi_derivative
                    .as_ref()
                    .map(|deriv| deriv.forward_mul(v.slice(ndarray::s![channel.range.clone()])))
            })
            .collect()
    }

    /// Add the pair-contribution part of the operator applied to `v` into
    /// `out` (the dense correction is NOT included).
    ///
    /// For each pair, with `L`/`R` the left/right channels, `w` the pair
    /// weights and `dw` the drift weights, the term added to `out[L.range]` is
    /// `X_Lᵀ(dw ⊙ X_R v) + Xψ_Lᵀ(w ⊙ X_R v) + X_Lᵀ(w ⊙ Xψ_R v)`, where the
    /// ψ-derivative terms are skipped for channels that have none.
    fn accumulate_pair_products(&self, v: &Array1<f64>, out: &mut Array1<f64>) {
        let base_vals = self.channel_base_values(v);
        let deriv_vals = self.channel_derivative_values(v);
        for pair in &self.pair_contributions {
            let left = &self.channels[pair.left_channel];
            let right_base = &base_vals[pair.right_channel];
            let weighted_drift = &pair.drift_weights * right_base;
            let mut contrib = left.apply_transpose(&weighted_drift);

            if let Some(left_deriv) = left.psi_derivative.as_ref() {
                let weighted_right = &pair.weights * right_base;
                contrib += &left_deriv.transpose_mul(weighted_right.view());
            }

            if let Some(right_deriv) = deriv_vals[pair.right_channel].as_ref() {
                let weighted_right = &pair.weights * right_deriv;
                contrib += &left.apply_transpose(&weighted_right);
            }

            let mut out_slice = out.slice_mut(ndarray::s![left.range.clone()]);
            out_slice += &contrib;
        }
    }
}

impl HyperOperator for CustomFamilyJointPsiOperator {
    /// Dimension of the joint coefficient space.
    fn dim(&self) -> usize {
        self.total_dim
    }

    /// Apply the operator to `v`: the dense correction (when present) plus
    /// every pair contribution.
    ///
    /// # Panics
    /// Panics when `v.len() != self.total_dim`.
    fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
        assert_eq!(v.len(), self.total_dim);
        let mut out = if let Some(ref corr) = self.dense_correction {
            corr.dot(v)
        } else {
            Array1::<f64>::zeros(self.total_dim)
        };
        self.accumulate_pair_products(v, &mut out);
        out
    }

    /// Evaluate the bilinear form without forming the operator-vector product:
    /// each pair contributes
    /// `(X_L u)·(dw ⊙ X_R v) + (Xψ_L u)·(w ⊙ X_R v) + (X_L u)·(w ⊙ Xψ_R v)`.
    ///
    /// NOTE(review): the dense-correction term is evaluated as `vᵀ C u` while
    /// the pair terms correspond to `uᵀ (·) v`; the two agree only when the
    /// correction is symmetric — confirm that invariant with the constructor
    /// of `dense_correction`.
    ///
    /// # Panics
    /// Panics when `v` or `u` does not have length `self.total_dim`.
    fn bilinear(&self, v: &Array1<f64>, u: &Array1<f64>) -> f64 {
        assert_eq!(v.len(), self.total_dim);
        assert_eq!(u.len(), self.total_dim);
        let base_v = self.channel_base_values(v);
        let base_u = self.channel_base_values(u);
        let deriv_v = self.channel_derivative_values(v);
        let deriv_u = self.channel_derivative_values(u);

        let mut total = if let Some(ref corr) = self.dense_correction {
            v.dot(&corr.dot(u))
        } else {
            0.0
        };
        for pair in &self.pair_contributions {
            let left_base_u = &base_u[pair.left_channel];
            let right_base_v = &base_v[pair.right_channel];
            total += left_base_u.dot(&(&pair.drift_weights * right_base_v));

            if let Some(left_deriv_u) = deriv_u[pair.left_channel].as_ref() {
                total += left_deriv_u.dot(&(&pair.weights * right_base_v));
            }
            if let Some(right_deriv_v) = deriv_v[pair.right_channel].as_ref() {
                total += left_base_u.dot(&(&pair.weights * right_deriv_v));
            }
        }

        total
    }

    /// Materialize the operator column by column: start from the dense
    /// correction (or zeros) and add the pair-contribution image of each unit
    /// basis vector to the matching column.
    fn to_dense(&self) -> Array2<f64> {
        let mut out = self
            .dense_correction
            .clone()
            .unwrap_or_else(|| Array2::<f64>::zeros((self.total_dim, self.total_dim)));
        let mut basis = Array1::<f64>::zeros(self.total_dim);
        let mut col = Array1::<f64>::zeros(self.total_dim);
        for j in 0..self.total_dim {
            basis[j] = 1.0;
            // Pair part only: the dense correction is already in `out`.
            col.fill(0.0);
            self.accumulate_pair_products(&basis, &mut col);
            out.column_mut(j).scaled_add(1.0, &col);
            basis[j] = 0.0;
        }
        out
    }

    /// The operator counts as implicit only when it carries no dense
    /// correction and at least one channel's ψ-derivative is itself implicit.
    fn is_implicit(&self) -> bool {
        self.dense_correction.is_none()
            && self.channels.iter().any(|channel| {
                channel
                    .psi_derivative
                    .as_ref()
                    .is_some_and(|d| d.is_implicit())
            })
    }
}

/// Process-wide, lazily initialized registry of shared dense matrices, keyed by
/// `(pointer address, nrows, ncols)` and holding only weak references so the
/// registry itself never keeps a matrix alive.
fn shared_dense_design_cache() -> &'static Mutex<HashMap<(usize, usize, usize), Weak<Array2<f64>>>>
{
    static REGISTRY: OnceLock<Mutex<HashMap<(usize, usize, usize), Weak<Array2<f64>>>>> =
        OnceLock::new();
    REGISTRY.get_or_init(Mutex::default)
}

/// Return an `Arc` holding a copy of `x`, reusing a previously shared copy
/// when one is still alive for the same `(pointer address, nrows, ncols)` key
/// and its contents still equal `x`.
///
/// The content check matters because the key identifies a memory location, not
/// a value: if the source array was mutated in place, or was dropped and a
/// different array of the same shape was later allocated at the same address,
/// the live cached copy would otherwise be returned as a stale snapshot. The
/// comparison is one read pass over the data and allocates nothing. Arrays
/// containing NaN never compare equal and therefore always get a fresh copy.
///
/// If the cache mutex is poisoned the function falls back to an unshared
/// clone rather than failing.
pub(crate) fn shared_dense_arc(x: &Array2<f64>) -> Arc<Array2<f64>> {
    let key = (x.as_ptr() as usize, x.nrows(), x.ncols());
    let Ok(mut guard) = shared_dense_design_cache().lock() else {
        // Best effort: a poisoned cache only costs us the sharing.
        return Arc::new(x.clone());
    };
    if let Some(shared) = guard.get(&key).and_then(Weak::upgrade) {
        if *shared == *x {
            return shared;
        }
        // Same address and shape but different contents: fall through and
        // replace the stale entry below.
    }
    // Drop entries whose matrices have already been freed before inserting.
    guard.retain(|_, shared| shared.strong_count() > 0);
    let shared = Arc::new(x.clone());
    guard.insert(key, Arc::downgrade(&shared));
    shared
}

/// Resolve the first ψ design derivative of a block into a [`PsiDesignMap`]
/// restricted to `row_range`.
///
/// Resolution order:
/// 1. an implicit operator whose `n_data()`/`p_out()` match `(n, p)`;
/// 2. the dense `x_psi` matrix when it is exactly `n × p`, subject to
///    `policy.derivative_storage_mode` (under `AnalyticOperatorRequired` only
///    an all-zero matrix is accepted, and it resolves to a zero map);
/// 3. a zero map when `x_psi` has no rows or no columns.
///
/// # Errors
/// Returns a `DimensionMismatch` message when `row_range` is reversed or runs
/// past `n`, or when `x_psi` is non-empty but not `n × p`; an
/// `UnsupportedConfiguration` message when a non-zero dense fallback is
/// disabled by policy; or any error from building the operator action.
pub(crate) fn resolve_custom_family_x_psi_map(
    deriv: &CustomFamilyBlockPsiDerivative,
    n: usize,
    p: usize,
    row_range: Range<usize>,
    label: &str,
    policy: &ResourcePolicy,
) -> Result<PsiDesignMap, String> {
    // A reversed range would underflow the row-count subtraction below.
    if row_range.start > row_range.end {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} is reversed",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    if row_range.end > n {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} exceeds total rows {n}",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    let window_rows = row_range.end - row_range.start;
    let zero_map = || PsiDesignMap::Zero {
        nrows: window_rows,
        ncols: p,
    };

    // Prefer operator action when dimensions match.
    if let Some(op) = deriv.implicit_operator.as_ref()
        && op.n_data() == n
        && op.p_out() == p
    {
        return Ok(PsiDesignMap::First {
            action: CustomFamilyPsiDesignAction::from_first_derivative(
                deriv, n, p, row_range, label,
            )?,
        });
    }

    // Dense fallback guarded by policy.
    if deriv.x_psi.nrows() == n && deriv.x_psi.ncols() == p {
        match policy.derivative_storage_mode {
            DerivativeStorageMode::AnalyticOperatorRequired => {
                if is_zero_array(&deriv.x_psi) {
                    return Ok(zero_map());
                }
                return Err(CustomFamilyError::UnsupportedConfiguration {
                    reason: format!(
                        "{label}: dense x_psi fallback disabled by AnalyticOperatorRequired"
                    ),
                }
                .into());
            }
            DerivativeStorageMode::MaterializeIfSmall | DerivativeStorageMode::DiagnosticsOnly => {
                // Copy the whole matrix when the window covers every row,
                // otherwise only the requested rows.
                let covers_all_rows = row_range.start == 0 && row_range.end == n;
                let matrix = if covers_all_rows {
                    Arc::new(deriv.x_psi.clone())
                } else {
                    Arc::new(
                        deriv
                            .x_psi
                            .slice(ndarray::s![row_range.clone(), ..])
                            .to_owned(),
                    )
                };
                return Ok(PsiDesignMap::Dense { matrix });
            }
        }
    }

    // Empty / zero sentinel.
    if deriv.x_psi.nrows() == 0 || deriv.x_psi.ncols() == 0 {
        return Ok(zero_map());
    }

    Err(CustomFamilyError::DimensionMismatch {
        reason: format!(
            "{label}: x_psi shape {:?} does not match ({n}, {p})",
            deriv.x_psi.dim()
        ),
    }
    .into())
}

/// Resolve the second ψ design derivative for the pair (`deriv_i`, `deriv_j`)
/// into a [`PsiDesignMap`] restricted to `row_range`.
///
/// Resolution order:
/// 1. when `deriv_i` carries an implicit operator whose `n_data()`/`p_out()`
///    match `(n, p)`: a zero map unless both derivatives share the same
///    `implicit_group_id`, otherwise the second-derivative action (or a zero
///    map when the action builder returns `None`);
/// 2. the dense slot `deriv_i.x_psi_psi[local_j]` when it is exactly `n × p`,
///    subject to `policy.derivative_storage_mode` (under
///    `AnalyticOperatorRequired` only an all-zero matrix is accepted);
/// 3. a zero map when the slot is empty or absent.
///
/// # Errors
/// Returns a `DimensionMismatch` message when `row_range` is reversed or runs
/// past `n`, or when the dense slot is non-empty but not `n × p`; an
/// `UnsupportedConfiguration` message when a non-zero dense fallback is
/// disabled by policy; or any error from building the operator action.
pub(crate) fn resolve_custom_family_x_psi_psi_map(
    deriv_i: &CustomFamilyBlockPsiDerivative,
    deriv_j: &CustomFamilyBlockPsiDerivative,
    local_j: usize,
    n: usize,
    p: usize,
    row_range: Range<usize>,
    label: &str,
    policy: &ResourcePolicy,
) -> Result<PsiDesignMap, String> {
    // A reversed range would underflow the row-count subtraction below.
    if row_range.start > row_range.end {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} is reversed",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    if row_range.end > n {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: row range {}..{} exceeds total rows {n}",
                row_range.start, row_range.end
            ),
        }
        .into());
    }
    let window_rows = row_range.end - row_range.start;
    let zero_map = || PsiDesignMap::Zero {
        nrows: window_rows,
        ncols: p,
    };

    // Prefer operator action when dimensions match.
    if let Some(op) = deriv_i.implicit_operator.as_ref()
        && op.n_data() == n
        && op.p_out() == p
    {
        let same_group = deriv_i.implicit_group_id.is_some()
            && deriv_i.implicit_group_id == deriv_j.implicit_group_id;
        if !same_group {
            return Ok(zero_map());
        }
        let maybe_action = CustomFamilyPsiSecondDesignAction::from_second_derivative(
            deriv_i,
            deriv_j,
            n,
            p,
            row_range.clone(),
            label,
        )?;
        return Ok(match maybe_action {
            Some(action) => PsiDesignMap::Second { action },
            None => zero_map(),
        });
    }

    // Dense fallback guarded by policy, reading from the per-second-derivative
    // slot `x_psi_psi[local_j]` if provided.
    if let Some(x_psi_psi) = deriv_i.x_psi_psi.as_ref()
        && let Some(x_ab) = x_psi_psi.get(local_j)
    {
        if x_ab.nrows() == n && x_ab.ncols() == p {
            match policy.derivative_storage_mode {
                DerivativeStorageMode::AnalyticOperatorRequired => {
                    if is_zero_array(x_ab) {
                        return Ok(zero_map());
                    }
                    return Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: format!(
                            "{label}: dense x_psi_psi fallback disabled by AnalyticOperatorRequired"
                        ),
                    }
                    .into());
                }
                DerivativeStorageMode::MaterializeIfSmall
                | DerivativeStorageMode::DiagnosticsOnly => {
                    // Copy the whole matrix when the window covers every row,
                    // otherwise only the requested rows.
                    let covers_all_rows = row_range.start == 0 && row_range.end == n;
                    let matrix = if covers_all_rows {
                        Arc::new(x_ab.clone())
                    } else {
                        Arc::new(x_ab.slice(ndarray::s![row_range.clone(), ..]).to_owned())
                    };
                    return Ok(PsiDesignMap::Dense { matrix });
                }
            }
        }
        if x_ab.is_empty() {
            return Ok(zero_map());
        }
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{label}: x_psi_psi shape {:?} does not match ({n}, {p})",
                x_ab.dim()
            ),
        }
        .into());
    }

    // No operator, no dense slot: treat as zero.
    Ok(zero_map())
}

/// First-order joint ψ terms: a scalar, a vector and a matrix contribution,
/// plus an optional operator-backed form of the matrix part.
///
/// NOTE(review): by naming these are presumably the ψ-derivatives of the
/// objective, the score and the coefficient Hessian — confirm against the
/// family implementations that populate them.
#[derive(Clone)]
pub struct ExactNewtonJointPsiTerms {
    /// Scalar objective contribution.
    pub objective_psi: f64,
    /// Vector (score) contribution over the joint coefficient space.
    pub score_psi: Array1<f64>,
    /// Dense matrix (Hessian) contribution over the joint coefficient space.
    pub hessian_psi: Array2<f64>,
    /// Optional shared operator form of the Hessian contribution.
    pub hessian_psi_operator: Option<Arc<dyn HyperOperator>>,
}

/// Manual `Debug`: the operator is a trait object, so it is rendered as the
/// placeholder `"<operator>"` (wrapped in `Some`) or as `None`.
impl std::fmt::Debug for ExactNewtonJointPsiTerms {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let operator_marker = self.hessian_psi_operator.as_ref().map(|_| "<operator>");
        let mut builder = f.debug_struct("ExactNewtonJointPsiTerms");
        builder.field("objective_psi", &self.objective_psi);
        builder.field("score_psi", &self.score_psi);
        builder.field("hessian_psi", &self.hessian_psi);
        builder.field("hessian_psi_operator", &operator_marker);
        builder.finish()
    }
}

impl ExactNewtonJointPsiTerms {
    /// All-zero terms for a joint coefficient space of dimension `total`:
    /// zero objective, zero score of length `total`, zero `total × total`
    /// Hessian, and no operator.
    fn zeros(total: usize) -> Self {
        let score_psi = Array1::zeros(total);
        let hessian_psi = Array2::zeros((total, total));
        Self {
            objective_psi: 0.0,
            score_psi,
            hessian_psi,
            hessian_psi_operator: None,
        }
    }
}

/// Second-order joint ψ terms for one ψ pair: a scalar, a vector and a matrix
/// contribution, plus an optional operator-backed form of the matrix part.
///
/// NOTE(review): by naming these are presumably the mixed second
/// ψ-derivatives of the objective, the score and the coefficient Hessian —
/// confirm against the family implementations that populate them.
pub struct ExactNewtonJointPsiSecondOrderTerms {
    /// Scalar objective contribution.
    pub objective_psi_psi: f64,
    /// Vector (score) contribution.
    pub score_psi_psi: Array1<f64>,
    /// Dense matrix (Hessian) contribution.
    pub hessian_psi_psi: Array2<f64>,
    /// Optional owned operator form of the Hessian contribution.
    pub hessian_psi_psi_operator: Option<Box<dyn HyperOperator>>,
}

/// Direction-contracted second-order ψ terms for the profiled θ-HVP (#740).
///
/// The per-pair [`ExactNewtonJointPsiSecondOrderTerms`] are the `(ψ_i, ψ_j)`
/// entries of the joint hyper-Hessian; assembling the full outer Hessian from
/// them costs one O(n) family row pass per pair, i.e. `K²·n`. A matrix-free
/// profiled θ-HVP never needs the individual pairs — it needs, for one applied
/// outer direction with ψ-weights `α_ψ`, the `α`-contraction of those pairs
/// against the combined ψ-direction `ψ(α) = Σ_j α_j ψ_j`:
///
/// ```text
///   objective[i] = Σ_j α_j V_{ψ_i ψ_j}
///   score[i]     = Σ_j α_j g_{ψ_i ψ_j}          (a p-vector per output row i)
///   hessian[i]   = Σ_j α_j D²_β H_L[ψ_i, ψ_j]
///                = D²_β H_L[ψ_i, ψ(α)]            (bilinearity)
/// ```
///
/// All `psi_dim` output rows share the SAME contracted second leg `ψ(α)`, so a
/// family that streams its rows once over `ψ(α)` (carrying every fixed first
/// leg `ψ_i` as a batched factor column) produces every row in a SINGLE n-pass.
/// That is the cost the profiled θ-HVP turns into `K·n`-to-densify /
/// `m·n`-in-CG instead of the dense path's `K²·n`.
///
/// Indexing is over the flattened ψ coordinates in the same order as
/// [`ExactNewtonJointPsiWorkspace::second_order_terms`]; `hessian[i]` carries
/// the `D²_β H_L[ψ_i, ψ(α)]` drift as a [`DriftDerivResult`] (dense or
/// operator-backed) plus any block-local `S_{ψ_i ψ_j}` penalty motion folded by
/// the family, exactly mirroring the per-pair `hessian_psi_psi(_operator)`.
///
/// The three fields are parallel: entry `i` of `objective`, row `i` of
/// `score`, and element `i` of `hessian` all describe the same ψ output row.
pub struct ExactNewtonJointPsiSecondOrderContracted {
    /// `objective[i] = Σ_j α_j V_{ψ_i ψ_j}`, one scalar per ψ output row.
    pub objective: Array1<f64>,
    /// `score[i] = Σ_j α_j g_{ψ_i ψ_j}`, the `psi_dim × total` matrix whose
    /// row `i` is the contracted fixed-β score derivative for output row `i`.
    pub score: Array2<f64>,
    /// `hessian[i] = D²_β H_L[ψ_i, ψ(α)]` for each ψ output row `i`.
    pub hessian: Vec<DriftDerivResult>,
}

/// Which representation of the joint coefficient Hessian a workspace prefers
/// to hand to callers that can consume either form.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum JointHessianSourcePreference {
    /// Prefer the dense coefficient Hessian.
    Dense,
    /// Prefer the matrix-free Hessian-vector-product source.
    Operator,
}

/// What the consumer is going to *do* with the joint Hessian. This is the
/// intent half of #738's capability-vs-representation split: the call site
/// states what it needs, and the workspace picks the cheapest representation
/// that serves that need (rather than a single per-workspace preference being
/// applied uniformly regardless of how the result is consumed).
///
/// The distinction matters because the same workspace serves several
/// consumers with opposite ideal representations:
/// - the inner Newton/PCG solve only ever applies `H · v`, so a matrix-free
///   HVP (`Operator`) is ideal and a dense build is pure waste;
/// - the REML logdet term factorizes `H + S_λ` (Cholesky / eigendecomposition),
///   so it must hold a dense matrix anyway — handing it an `Operator` only
///   forces an immediate column-basis (or `dense_forced`) re-materialization,
///   so a workspace with a structural direct-dense build should answer `Dense`
///   here and skip the operator wrapper entirely.
///
/// Workspaces refine their representation choice per intent via
/// [`ExactNewtonJointHessianWorkspace::hessian_source_preference_for_intent`];
/// the default keeps the legacy single-preference behaviour so existing
/// workspaces are unchanged.
///
/// The type is a plain `Copy` tag with equality, so call sites can pass it by
/// value and compare intents directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MaterializationIntent {
    /// Inner Newton / PCG solve — only applies `H · v`. Matrix-free is ideal.
    InnerSolve,
    /// REML/LAML logdet term — factorizes `H + S_λ`, needs a dense matrix.
    LogdetFactorization,
    /// Outer-Hessian / EFS evaluation — builds the joint hyper terms; today
    /// these route through the same source as the gradient path.
    OuterEvaluation,
    /// Outer-gradient / IFT term assembly.
    OuterGradient,
}

/// Workspace exposing the joint coefficient Hessian `H` and its β-directional
/// derivatives in dense and/or matrix-free form.
///
/// Only [`Self::directional_derivative`] is required. Every other method has
/// a default that either reports "not available" (`Ok(None)` / `Ok(false)` /
/// `false`) or adapts another method of this trait.
pub trait ExactNewtonJointHessianWorkspace: Send + Sync {
    /// Pre-build any per-row jet caches the workspace will hand to the
    /// outer-eval directional-derivative path. Called once when the
    /// `compute_dh` / `compute_d2h` closures are wired up at top-level
    /// rayon, *before* the outer ext-coordinate `par_iter` enters. The
    /// alternative — letting the cache materialise lazily on first call
    /// from inside the outer `par_iter` — collapses the build's own
    /// `par_iter` to a single worker (the seven other workers are parked
    /// on the cache's `OnceLock`). Default impl is a no-op for workspaces
    /// with no per-row jet cache.
    ///
    /// Deliberately not called from PIRLS-side workspaces (which never
    /// invoke `directional_derivative_operator` and would pay the prime
    /// cost without ever consuming the cache).
    fn warm_up_outer_caches(&self) -> Result<(), String> {
        Ok(())
    }

    /// Dense joint coefficient Hessian, when the workspace can supply one.
    ///
    /// The default returns `Ok(None)`, i.e. no dense matrix is offered.
    /// [`Self::hessian_dense_forced`] delegates to this method by default.
    fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
        Ok(None)
    }

    /// Preferred representation for callers that can consume either the dense
    /// coefficient Hessian or the matrix-free HVP source.
    fn hessian_source_preference(&self) -> JointHessianSourcePreference {
        JointHessianSourcePreference::Dense
    }

    /// Intent-aware representation choice (#738). Given what the consumer is
    /// about to do with the Hessian ([`MaterializationIntent`]), return the
    /// representation the workspace prefers to hand back. The default keeps the
    /// legacy intent-blind behaviour by delegating to
    /// [`Self::hessian_source_preference`], so existing workspaces are
    /// unchanged. Workspaces with a structural direct-dense build that also
    /// expose a matrix-free HVP override this to answer `Operator` for
    /// [`MaterializationIntent::InnerSolve`] (stream the HVP) and `Dense` for
    /// [`MaterializationIntent::LogdetFactorization`] (the consumer factorizes,
    /// so building the operator wrapper only to re-densify it is pure waste).
    fn hessian_source_preference_for_intent(
        &self,
        intent: MaterializationIntent,
    ) -> JointHessianSourcePreference {
        // Intent-agnostic default: every intent maps to the single legacy
        // preference. Implementors that benefit from per-intent representation
        // (e.g. CTN: dense for logdet, operator for inner solve) override this.
        // The match is spelled out exhaustively (no `_` arm) so adding a new
        // intent variant forces this default to be revisited.
        match intent {
            MaterializationIntent::InnerSolve
            | MaterializationIntent::LogdetFactorization
            | MaterializationIntent::OuterEvaluation
            | MaterializationIntent::OuterGradient => self.hessian_source_preference(),
        }
    }

    /// Forced dense materialization that bypasses any amortization gate the
    /// workspace applies to `hessian_dense`. Callers that genuinely need a
    /// dense matrix (logdet, factorize-based QP solves) use this so they pay
    /// the workspace's structural direct-dense build cost rather than the
    /// caller-side column-basis HVP fallback. Returning `None` means the
    /// workspace has no preferred direct-dense path and the caller should
    /// fall back to column-basis HVP via `hessian_matvec` / `apply`.
    fn hessian_dense_forced(&self) -> Result<Option<Array2<f64>>, String> {
        self.hessian_dense()
    }

    /// Optional joint log-likelihood evaluation owned by the workspace.
    ///
    /// The default returns `Ok(None)` (no value available).
    fn joint_log_likelihood_evaluation(&self) -> Result<Option<f64>, String> {
        Ok(None)
    }

    /// Optional joint gradient evaluation owned by the workspace.
    ///
    /// The default returns `Ok(None)` (no evaluation available).
    fn joint_gradient_evaluation(
        &self,
    ) -> Result<Option<ExactNewtonJointGradientEvaluation>, String> {
        Ok(None)
    }

    /// Whether `hessian_matvec` / `hessian_matvec_into` will return `Some`.
    /// A cheap synchronisation-free flag consulted by
    /// `exact_newton_joint_hessian_source_from_workspace` to decide whether
    /// to construct a matrix-free `JointHessianSource::Operator` variant.
    /// Returning `false` is equivalent to returning `Ok(None)` from
    /// `hessian_matvec` but avoids allocating and running a full HVP sweep
    /// against a zero vector just to discover unavailability.
    /// Default is `false` matching the base-trait `hessian_matvec` returning
    /// `Ok(None)`. Concrete impls that override `hessian_matvec` must also
    /// override this to return `true`.
    fn hessian_matvec_available(&self) -> bool {
        false
    }

    /// Matrix-free Hessian-vector product `H · arr`, returned as an owned
    /// vector, or `Ok(None)` when the workspace has no matrix-free apply.
    ///
    /// The default offers no product and returns `Ok(None)`.
    ///
    /// # Panics
    ///
    /// The default implementation panics if any entry of `arr` is NaN.
    fn hessian_matvec(&self, arr: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
        assert!(arr.iter().all(|v| !v.is_nan()));
        Ok(None)
    }

    /// Write-into variant of `hessian_matvec`. The default implementation
    /// delegates to the legacy owned-return form and copies the result into
    /// `out`, providing back-compat without per-impl work. Concrete impls in
    /// the inner-Newton large-scale hot path (Bernoulli marginal-slope and
    /// survival marginal-slope) override this to write directly into the
    /// caller-owned buffer, eliminating per-PCG-iter `Array1` allocations.
    ///
    /// Returns `Ok(true)` when `out` was written and `Ok(false)` when
    /// `hessian_matvec` reported no product. The default returns a
    /// dimension-mismatch error when the product length differs from
    /// `out.len()`.
    fn hessian_matvec_into(&self, v: &Array1<f64>, out: &mut Array1<f64>) -> Result<bool, String> {
        match self.hessian_matvec(v)? {
            Some(result) => {
                if result.len() != out.len() {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "hessian_matvec_into: result length {} != out length {}",
                            result.len(),
                            out.len()
                        ),
                    }
                    .into());
                }
                out.assign(&result);
                Ok(true)
            }
            None => Ok(false),
        }
    }

    /// Batched multi-RHS Hessian apply: writes `H · V` into `out`, where `V`
    /// and `out` are `(total, n_rhs)` with each column an independent
    /// direction. Returns `Ok(true)` when the apply was performed and
    /// `Ok(false)` when the workspace exposes no matrix-free apply (mirroring
    /// `hessian_matvec_into`).
    ///
    /// The default implementation applies `hessian_matvec_into` column by
    /// column, so every existing workspace gets a correct batched apply for
    /// free and the batched result is, column for column, **numerically
    /// identical** to looping the single-vector HVP. Workspaces whose Hessian
    /// is `Σ_i Jᵢᵀ Hᵢ Jᵢ` over a streamed/tiled per-row primary Hessian `Hᵢ`
    /// (Bernoulli marginal-slope) override this to sweep each row tile **once**
    /// and apply its `Hᵢ` to all `n_rhs` columns in that single pass — the
    /// per-tile `Hᵢ` read and the design-row projection are then amortised
    /// across every RHS instead of paid once per column. This is the
    /// representation that makes dense reconstruction of a matrix-free operator
    /// (`H = H · [e_0 | … | e_{p-1}]`) one tile sweep wide instead of `p`.
    ///
    /// In the default implementation a shape mismatch between `v_cols` and
    /// `out` is reported as an error before any column is applied. When a
    /// column reports `Ok(false)` the method returns `Ok(false)` immediately,
    /// and when `v_cols` has zero columns it returns `Ok(true)` without
    /// consulting the workspace at all.
    fn hessian_apply_mat(
        &self,
        v_cols: &Array2<f64>,
        out: &mut Array2<f64>,
    ) -> Result<bool, String> {
        if v_cols.nrows() != out.nrows() || v_cols.ncols() != out.ncols() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "hessian_apply_mat: v_cols {}x{} != out {}x{}",
                    v_cols.nrows(),
                    v_cols.ncols(),
                    out.nrows(),
                    out.ncols()
                ),
            }
            .into());
        }
        let total = v_cols.nrows();
        // Two scratch columns are reused across all right-hand sides so the
        // loop itself performs no per-column allocation.
        let mut col_in = Array1::<f64>::zeros(total);
        let mut col_out = Array1::<f64>::zeros(total);
        for col in 0..v_cols.ncols() {
            col_in.assign(&v_cols.column(col));
            if !self.hessian_matvec_into(&col_in, &mut col_out)? {
                return Ok(false);
            }
            out.column_mut(col).assign(&col_out);
        }
        Ok(true)
    }

    /// Diagonal of the joint Hessian, when the workspace can supply it.
    ///
    /// The default returns `Ok(None)`.
    fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
        Ok(None)
    }

    /// Exact row-local contractions for
    /// `trace(F^T · D_beta H[d_j] · F)` over many coefficient directions.
    ///
    /// Workspaces that own the current row cache can implement this to avoid
    /// rebuilding row contexts or materializing each `D_beta H[d_j]` as a
    /// coefficient-space operator when the caller only needs its projected
    /// trace against the fixed logdet factor `F`.
    ///
    /// The default returns `Ok(None)`.
    ///
    /// # Panics
    ///
    /// The default implementation panics when `factor` and `directions` do
    /// not have the same number of rows.
    fn projected_directional_derivative_traces(
        &self,
        factor: &Array2<f64>,
        directions: &Array2<f64>,
    ) -> Result<Option<Array1<f64>>, String> {
        assert_eq!(
            factor.nrows(),
            directions.nrows(),
            "projected directional derivative traces require shared coefficient dimension"
        );
        Ok(None)
    }

    /// Dense first β-directional derivative of the Hessian along the
    /// flattened coefficient direction `d_beta_flat` (the `D_beta H[d]`
    /// object), or `Ok(None)` when the workspace cannot provide it.
    ///
    /// This is the only required method of the trait.
    fn directional_derivative(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String>;

    /// Operator form of [`Self::directional_derivative`].
    ///
    /// The default computes the dense derivative and wraps it in a
    /// `DenseMatrixHyperOperator`; `None` is passed through unchanged.
    fn directional_derivative_operator(
        &self,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<Arc<dyn HyperOperator>>, String> {
        Ok(self.directional_derivative(d_beta_flat)?.map(|matrix| {
            Arc::new(crate::solver::estimate::reml::unified::DenseMatrixHyperOperator { matrix })
                as Arc<dyn HyperOperator>
        }))
    }

    /// Batched form of [`Self::directional_derivative_operator`].
    ///
    /// The default evaluates the directions one at a time, in order, and
    /// stops at the first error. The output has one entry per input direction.
    fn directional_derivative_operators(
        &self,
        d_beta_flats: &[Array1<f64>],
    ) -> Result<Vec<Option<Arc<dyn HyperOperator>>>, String> {
        d_beta_flats
            .iter()
            .map(|d_beta_flat| self.directional_derivative_operator(d_beta_flat))
            .collect()
    }

    /// Dense second β-directional derivative of the Hessian along the pair
    /// of flattened coefficient directions (`arr`, `arr2`), or `Ok(None)`
    /// when unavailable.
    ///
    /// The default offers no derivative and returns `Ok(None)`.
    ///
    /// # Panics
    ///
    /// The default implementation panics if either direction contains a NaN.
    fn second_directional_derivative(
        &self,
        arr: &Array1<f64>,
        arr2: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        assert!(arr.iter().all(|v| !v.is_nan()));
        assert!(arr2.iter().all(|v| !v.is_nan()));
        Ok(None)
    }

    /// Operator form of [`Self::second_directional_derivative`].
    ///
    /// The default computes the dense second derivative and wraps it in a
    /// `DenseMatrixHyperOperator`; `None` is passed through unchanged.
    fn second_directional_derivative_operator(
        &self,
        d_beta_u: &Array1<f64>,
        d_beta_v: &Array1<f64>,
    ) -> Result<Option<Arc<dyn HyperOperator>>, String> {
        Ok(self
            .second_directional_derivative(d_beta_u, d_beta_v)?
            .map(|matrix| {
                Arc::new(
                    crate::solver::estimate::reml::unified::DenseMatrixHyperOperator { matrix },
                ) as Arc<dyn HyperOperator>
            }))
    }

    /// Batched form of [`Self::second_directional_derivative_operator`].
    ///
    /// The default evaluates the direction pairs one at a time, in order, and
    /// stops at the first error. The output has one entry per input pair.
    fn second_directional_derivative_operators(
        &self,
        d_beta_pairs: &[(Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<Arc<dyn HyperOperator>>>, String> {
        d_beta_pairs
            .iter()
            .map(|(u, v)| self.second_directional_derivative_operator(u, v))
            .collect()
    }
}

/// Workspace exposing the family-specific joint ψ derivative terms.
///
/// Implementors must provide [`Self::second_order_terms`] and
/// [`Self::hessian_directional_derivative`]; the remaining methods default to
/// "not available" (`Ok(None)`).
pub trait ExactNewtonJointPsiWorkspace: Send + Sync {
    /// First-order ψ terms for the ψ coordinate `idx`.
    ///
    /// The default returns `Ok(None)`.
    ///
    /// # Panics
    ///
    /// The default implementation panics when `idx == usize::MAX`.
    fn first_order_terms(&self, idx: usize) -> Result<Option<ExactNewtonJointPsiTerms>, String> {
        assert!(idx < usize::MAX);
        Ok(None)
    }

    /// First-order ψ terms for all ψ coordinates in one call.
    ///
    /// The default returns `Ok(None)`.
    fn first_order_terms_all(&self) -> Result<Option<Vec<ExactNewtonJointPsiTerms>>, String> {
        Ok(None)
    }

    /// Second-order ψ terms for the coordinate pair (`psi_i`, `psi_j`), or
    /// `Ok(None)` when the workspace cannot provide them.
    fn second_order_terms(
        &self,
        psi_i: usize,
        psi_j: usize,
    ) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String>;

    /// Direction-contracted second-order ψ terms for the profiled θ-HVP (#740).
    ///
    /// Given the ψ-block weights `alpha_psi` (length `psi_dim`, the ψ slice of
    /// one applied outer direction α), return the `α`-contraction of every
    /// `(ψ_i, ψ_j)` second-order term against the combined ψ-direction
    /// `ψ(α) = Σ_j alpha_psi[j] · ψ_j`, as
    /// [`ExactNewtonJointPsiSecondOrderContracted`]. A family that can stream
    /// its rows once over `ψ(α)` overrides this so the profiled outer-Hessian
    /// operator applies one combined-direction n-pass per matvec instead of the
    /// dense path's `K²` per-pair [`Self::second_order_terms`] passes.
    ///
    /// Default returns `None`: the profiled θ-HVP operator is then not built and
    /// the evaluator keeps the exact per-pair assembly (dense
    /// `compute_outer_hessian` / `build_outer_hessian_operator`). Overriding
    /// this method is purely a representation/cost choice — it must produce the
    /// exact same contraction the per-pair terms would, which the
    /// `profiled_theta_hvp_outer_hessian_fd` finite-difference cross-check
    /// guards.
    fn second_order_terms_contracted(
        &self,
        alpha_psi: &[f64],
    ) -> Result<Option<ExactNewtonJointPsiSecondOrderContracted>, String> {
        // NOTE(review): this assert can only fail for a slice of length
        // `usize::MAX`; it appears to exist just to reference the parameter.
        assert!(alpha_psi.len() < usize::MAX);
        Ok(None)
    }

    /// β-directional derivative of the ψ Hessian term for coordinate
    /// `psi_index` along the flattened coefficient direction `d_beta_flat`,
    /// returned as a [`DriftDerivResult`], or `Ok(None)` when unavailable.
    fn hessian_directional_derivative(
        &self,
        psi_index: usize,
        d_beta_flat: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String>;
}

/// Fixed-size, index-addressed cache of lazily computed values shared as
/// `Arc<T>`, with recency tracking used to evict entries once more than
/// `limit` indices are live.
pub(crate) struct ExactNewtonJointPsiDirectCache<T> {
    /// One independently locked slot per index. The outer `Option` records
    /// whether the slot has been filled; the inner `Option` is the cached
    /// initializer result, which may itself be `None`.
    entries: Vec<Mutex<Option<Option<Arc<T>>>>>,
    /// Indices in recency order: least recently touched at the front, most
    /// recently touched at the back.
    lru: Mutex<std::collections::VecDeque<usize>>,
    /// Maximum number of indices kept in `lru` before the oldest are evicted.
    limit: usize,
}

impl<T> ExactNewtonJointPsiDirectCache<T> {
    /// Creates a cache with `len` empty slots whose recency limit equals the
    /// slot count.
    pub(crate) fn new(len: usize) -> Self {
        let entries = (0..len).map(|_| Mutex::new(None)).collect();
        let lru = Mutex::new(std::collections::VecDeque::new());
        Self {
            entries,
            lru,
            limit: len,
        }
    }

    /// Marks `index` as most recently used and clears the slots of any
    /// indices pushed past the recency limit.
    ///
    /// The index being touched is never cleared by its own touch, even if it
    /// is popped from the queue while trimming.
    fn touch_lru(&self, index: usize) -> Result<(), String> {
        let mut order = self
            .lru
            .lock()
            .map_err(|_| "joint psi direct cache lru poisoned".to_string())?;
        // Move `index` to the back of the queue (most recently used).
        order.retain(|&existing| existing != index);
        order.push_back(index);
        while order.len() > self.limit {
            match order.pop_front() {
                None => break,
                Some(victim) if victim == index => {}
                Some(victim) => {
                    if let Some(slot) = self.entries.get(victim) {
                        let mut slot_guard = slot
                            .lock()
                            .map_err(|_| "joint psi direct cache poisoned".to_string())?;
                        *slot_guard = None;
                    }
                }
            }
        }
        Ok(())
    }

    /// Returns the cached value for `index`, running `init` to fill the slot
    /// when it is empty.
    ///
    /// `init` runs without the slot lock held, so concurrent callers may each
    /// compute a value; the first one stored wins and is what every caller
    /// receives. An `Ok(None)` from `init` is cached like any other result.
    /// Errors from `init` are propagated and nothing is cached.
    ///
    /// Returns a dimension-mismatch error when `index` is out of bounds.
    pub(crate) fn get_or_try_init<F>(&self, index: usize, init: F) -> Result<Option<Arc<T>>, String>
    where
        F: FnOnce() -> Result<Option<T>, String>,
    {
        let entry = match self.entries.get(index) {
            Some(entry) => entry,
            None => {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "psi cache index {index} out of bounds for size {}",
                        self.entries.len()
                    ),
                }
                .into());
            }
        };

        // Fast path: the slot guard lives only inside this block, so it is
        // released before the LRU mutex is taken.
        let hit = {
            let guard = entry
                .lock()
                .map_err(|_| "joint psi direct cache poisoned".to_string())?;
            guard.as_ref().cloned()
        };
        if let Some(cached) = hit {
            self.touch_lru(index)?;
            return Ok(cached);
        }

        // Slow path: compute outside the lock, then store unless another
        // caller filled the slot in the meantime.
        let fresh = init()?.map(Arc::new);
        let stored = {
            let mut guard = entry
                .lock()
                .map_err(|_| "joint psi direct cache poisoned".to_string())?;
            guard.get_or_insert_with(|| fresh).clone()
        };
        self.touch_lru(index)?;
        Ok(stored)
    }
}

/// Opaque warm-start payload for custom-family fits. It wraps the internal
/// `ConstrainedWarmStart` so callers can carry it between evaluations without
/// seeing its fields.
#[derive(Clone)]
pub struct CustomFamilyWarmStart {
    /// Wrapped warm-start state (rho, per-block beta, active sets and an
    /// optional cached inner payload).
    inner: ConstrainedWarmStart,
}

impl CustomFamilyWarmStart {
    /// Whether screening this warm start against `rho` yields a usable
    /// warm start.
    pub(crate) fn compatible_with_rho(&self, rho: &Array1<f64>) -> bool {
        let screened = screened_outer_warm_start(Some(&self.inner), rho);
        screened.is_some()
    }

    /// Number of coefficients stored for block `block_idx`, or `None` when
    /// there is no such block.
    pub(crate) fn block_beta_len(&self, block_idx: usize) -> Option<usize> {
        let beta = self.inner.block_beta.get(block_idx)?;
        Some(beta.len())
    }

    /// Finds the coefficient with the largest finite absolute value inside
    /// `range` (clamped to the block length) of block `block_idx`.
    ///
    /// Returns `(index_within_block, abs_value)`, or `None` when the block is
    /// missing, the clamped range is empty, or no entry in it is finite.
    /// Among equal maxima the last index wins.
    pub(crate) fn block_beta_abs_argmax_in_range(
        &self,
        block_idx: usize,
        range: std::ops::Range<usize>,
    ) -> Option<(usize, f64)> {
        let beta = self.inner.block_beta.get(block_idx)?;
        let stop = beta.len().min(range.end);
        if stop <= range.start {
            return None;
        }
        let window = stop - range.start;
        beta.iter()
            .copied()
            .enumerate()
            .skip(range.start)
            .take(window)
            .map(|(position, value)| (position, value.abs()))
            .filter(|candidate| candidate.1.is_finite())
            // Every surviving magnitude is finite and non-negative, so the
            // total order agrees with the partial order here.
            .max_by(|left, right| left.1.total_cmp(&right.1))
    }

    /// Build a warm-start payload from a flat cached β and the per-block
    /// coefficient widths. The returned warm-start carries an empty `rho`
    /// (the outer cache will overwrite it on the next eval) and empty
    /// active sets; only the per-block β slices feed the next inner
    /// PIRLS / Newton solve. Used by the spatial-joint outer cache to
    /// seed the family-owned warm-start slot on cache hits so the inner
    /// solve opens at the prior converged iterate instead of cold β.
    ///
    /// Fails when `beta.len()` differs from the sum of `block_col_counts`, or
    /// when the shared cached-β finiteness check rejects `beta`.
    pub fn from_cached_beta(
        block_col_counts: &[usize],
        beta: &Array1<f64>,
    ) -> Result<Self, EstimationError> {
        let expected = block_col_counts.iter().sum::<usize>();
        if beta.len() != expected {
            crate::bail_invalid_estim!(
                "cached inner beta has length {}, but spatial-joint blocks require length {}",
                beta.len(),
                expected
            );
        }
        crate::families::marginal_slope_shared::bail_if_cached_beta_non_finite(beta)?;

        // Walk the flat vector, carving off one contiguous slice per block.
        let block_beta: Vec<_> = block_col_counts
            .iter()
            .scan(0usize, |cursor, &width| {
                let start = *cursor;
                *cursor = start + width;
                Some(beta.slice(s![start..*cursor]).to_owned())
            })
            .collect();

        let inner = ConstrainedWarmStart {
            rho: Array1::zeros(0),
            block_beta,
            active_sets: vec![None; block_col_counts.len()],
            cached_inner: None,
        };
        Ok(CustomFamilyWarmStart { inner })
    }
}

/// Mutable bookkeeping carried across outer-objective evaluations.
struct CustomOuterState {
    /// Warm start currently in use.
    warm_cache: Option<ConstrainedWarmStart>,
    /// Snapshot that `reset` copies back into `warm_cache`.
    reset_warm_cache: Option<ConstrainedWarmStart>,
    /// Most recent error message, if any; cleared by `seed_cached_beta`.
    last_error: Option<String>,
    /// Starts as `None`; presumably the gradient norm recorded at the first
    /// outer evaluation — confirm against the code that writes it.
    initial_gradient_norm: Option<f64>,
}

impl CustomOuterState {
    /// Creates the state with `warm_start` as both the live warm start and
    /// the reset snapshot; no error or gradient norm is recorded yet.
    fn new(warm_start: Option<ConstrainedWarmStart>) -> Self {
        let reset_warm_cache = warm_start;
        let warm_cache = reset_warm_cache.clone();
        Self {
            warm_cache,
            reset_warm_cache,
            last_error: None,
            initial_gradient_norm: None,
        }
    }

    /// Restores the live warm start from the reset snapshot. The recorded
    /// error and gradient norm are left untouched.
    fn reset(&mut self) {
        let restored = self.reset_warm_cache.clone();
        self.warm_cache = restored;
    }

    /// Builds a warm start from a cached flat β and installs it as both the
    /// live warm start and the reset snapshot, clearing any recorded error.
    ///
    /// On failure the state is left unmodified.
    fn seed_cached_beta(
        &mut self,
        rho_dim: usize,
        specs: &[ParameterBlockSpec],
        beta: &Array1<f64>,
    ) -> Result<(), EstimationError> {
        let seeded = constrained_warm_start_from_cached_beta(rho_dim, specs, beta)?;
        self.warm_cache = Some(seeded.clone());
        self.reset_warm_cache = Some(seeded);
        self.last_error = None;
        Ok(())
    }
}

/// Public result of one joint hyper-parameter objective evaluation.
pub struct CustomFamilyJointHyperResult {
    /// Outer objective value.
    pub objective: f64,
    /// Outer gradient of the objective.
    pub gradient: Array1<f64>,
    /// Outer Hessian in whatever form the evaluation produced it.
    pub outer_hessian: crate::solver::outer_strategy::HessianResult,
    /// Warm-start payload to carry into the next evaluation.
    pub warm_start: CustomFamilyWarmStart,
    /// `false` when the inner blockwise/Newton solve hit its divergence
    /// early-exit or its max-cycle cap. Envelope-theorem outer gradients
    /// and analytic outer Hessians are valid only at a stationary β̂ —
    /// callers that consume `gradient`/`outer_hessian` MUST gate on this
    /// flag and treat non-converged evaluations as inexact (e.g. let ARC
    /// back off the trust region) rather than feeding pathological
    /// derivatives into the outer optimizer.
    pub inner_converged: bool,
}

/// Public result of one joint hyper-parameter EFS evaluation.
pub struct CustomFamilyJointHyperEfsResult {
    /// EFS evaluation payload from the outer strategy.
    pub efs_eval: crate::solver::outer_strategy::EfsEval,
    /// Warm-start payload to carry into the next evaluation.
    pub warm_start: CustomFamilyWarmStart,
    /// See [`CustomFamilyJointHyperResult::inner_converged`]. EFS gradients
    /// also assume a stationary inner solve.
    pub inner_converged: bool,
}

/// Internal counterpart of `CustomFamilyJointHyperResult` that carries the
/// unwrapped `ConstrainedWarmStart`.
struct OuterObjectiveEvalResult {
    /// Outer objective value.
    objective: f64,
    /// Outer gradient of the objective.
    gradient: Array1<f64>,
    /// Outer Hessian in whatever form the evaluation produced it.
    outer_hessian: crate::solver::outer_strategy::HessianResult,
    /// Internal warm-start state; wrapped into `CustomFamilyWarmStart` when
    /// the result is exposed publicly.
    warm_start: ConstrainedWarmStart,
    /// Whether the inner solve converged; forwarded unchanged to the public
    /// result.
    inner_converged: bool,
}

fn outer_eval_result_to_joint_hyper_result(
    result: OuterObjectiveEvalResult,
) -> CustomFamilyJointHyperResult {
    CustomFamilyJointHyperResult {
        objective: result.objective,
        gradient: result.gradient,
        outer_hessian: result.outer_hessian,
        warm_start: CustomFamilyWarmStart {
            inner: result.warm_start,
        },
        inner_converged: result.inner_converged,
    }
}

/// Adapter that exposes an owned dense matrix through the
/// `OuterHessianOperator` interface.
struct OwnedDenseOuterHessianOperator {
    /// The dense outer Hessian; products are computed as `matrix · v`.
    matrix: Array2<f64>,
}

impl crate::solver::outer_strategy::OuterHessianOperator for OwnedDenseOuterHessianOperator {
    /// Operator dimension, taken from the matrix row count.
    fn dim(&self) -> usize {
        self.matrix.nrows()
    }

    /// Returns `matrix · v` as a newly allocated vector, or a
    /// dimension-mismatch error when `v` does not match the column count.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        let expected = self.matrix.ncols();
        if v.len() == expected {
            return Ok(self.matrix.dot(v));
        }
        Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "batched dense outer Hessian matvec length mismatch: got {}, expected {}",
                v.len(),
                expected
            ),
        }
        .into())
    }

    /// Zero-alloc override: write `matrix · v` directly into `out` using a
    /// row-dot loop, avoiding the `matrix.dot(v)` allocation.
    ///
    /// The input length is validated against the column count first, then the
    /// output length against the row count.
    fn apply_into(&self, v: &Array1<f64>, out: &mut Array1<f64>) -> Result<(), String> {
        let (n_rows, n_cols) = self.matrix.dim();
        if v.len() != n_cols {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "batched dense outer Hessian apply_into input length mismatch: got {}, expected {}",
                    v.len(),
                    n_cols
                ),
            }
            .into());
        }
        if out.len() != n_rows {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "batched dense outer Hessian apply_into output length mismatch: got {}, expected {}",
                    out.len(),
                    n_rows
                ),
            }
            .into());
        }
        out.iter_mut()
            .zip(self.matrix.rows())
            .for_each(|(cell, row)| *cell = row.dot(v));
        Ok(())
    }

    /// The matrix is already dense, so materialization is always cheap.
    fn is_cheap_to_materialize(&self) -> bool {
        true
    }
}

/// Wraps an operator acting on "physical" coordinates and exposes it on the
/// "outer" coordinates via an index map: inputs are expanded from outer to
/// physical, and outputs are summed back from physical to outer.
struct LabeledOuterHessianOperator {
    /// Underlying operator on the physical coordinates.
    base: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
    /// For each physical index, the outer index it is tied to. `None` entries
    /// receive a zero input and their output is discarded.
    physical_to_outer: Vec<Option<usize>>,
    /// Dimension of the outer coordinate space.
    outer_dim: usize,
    /// Scratch buffers reused across `apply_into` calls to avoid
    /// per-call allocation of the permuted input and output vectors.
    /// `(physical_in, physical_out)`, each of length `physical_to_outer.len()`.
    scratch: std::sync::Mutex<(ndarray::Array1<f64>, ndarray::Array1<f64>)>,
}

impl LabeledOuterHessianOperator {
    /// Wraps `base` with the index map from `layout`. The outer dimension is
    /// the length of `layout.initial_rho`, and both scratch buffers are
    /// zero-initialised with one entry per physical index.
    fn new(
        base: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
        layout: &PenaltyLabelLayout,
    ) -> Self {
        let physical_to_outer = layout.physical_to_outer.clone();
        let physical_len = physical_to_outer.len();
        let scratch_in = ndarray::Array1::<f64>::zeros(physical_len);
        let scratch_out = ndarray::Array1::<f64>::zeros(physical_len);
        Self {
            base,
            physical_to_outer,
            outer_dim: layout.initial_rho.len(),
            scratch: std::sync::Mutex::new((scratch_in, scratch_out)),
        }
    }
}

impl crate::solver::outer_strategy::OuterHessianOperator for LabeledOuterHessianOperator {
    /// Dimension of the outer coordinate space.
    fn dim(&self) -> usize {
        self.outer_dim
    }

    /// Applies the wrapped operator to an outer-space vector.
    ///
    /// The input is expanded to physical coordinates (unmapped physical
    /// entries receive `0.0`), the base operator is applied, and the physical
    /// output is summed back into outer coordinates, so several physical
    /// entries tied to one outer index accumulate.
    ///
    /// Returns an error when `v` is not of length `outer_dim`, when the base
    /// operator fails, or when it returns a vector of the wrong length.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        if v.len() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian input length mismatch: got {}, expected {}",
                v.len(),
                self.outer_dim
            ));
        }
        let mut physical = Array1::<f64>::zeros(self.physical_to_outer.len());
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            physical[physical_idx] = outer_idx.map_or(0.0, |idx| v[idx]);
        }
        let physical_out = self.base.matvec(&physical)?;
        if physical_out.len() != self.physical_to_outer.len() {
            return Err(format!(
                "labeled outer Hessian physical matvec length mismatch: got {}, expected {}",
                physical_out.len(),
                self.physical_to_outer.len()
            ));
        }
        let mut out = Array1::<f64>::zeros(self.outer_dim);
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                out[outer_idx] += physical_out[physical_idx];
            }
        }
        Ok(out)
    }

    /// Zero-alloc override: reuses hoisted scratch buffers to avoid the
    /// per-call `physical` and `out` allocations in `matvec`.
    ///
    /// The scratch mutex is held for the whole call, including the base
    /// operator's `apply_into`. Returns an error on a length mismatch of `v`
    /// or `out`, on a poisoned scratch lock, or when the base operator fails.
    fn apply_into(
        &self,
        v: &ndarray::Array1<f64>,
        out: &mut ndarray::Array1<f64>,
    ) -> Result<(), String> {
        if v.len() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian apply_into input length mismatch: got {}, expected {}",
                v.len(),
                self.outer_dim
            ));
        }
        if out.len() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian apply_into output length mismatch: got {}, expected {}",
                out.len(),
                self.outer_dim
            ));
        }
        let mut guard = self
            .scratch
            .lock()
            .map_err(|_| "labeled outer Hessian scratch lock poisoned".to_string())?;
        let (physical_in, physical_out) = &mut *guard;
        // Every physical entry is overwritten here, so stale scratch contents
        // from a previous call cannot leak into this one.
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            physical_in[physical_idx] = outer_idx.map_or(0.0, |idx| v[idx]);
        }
        self.base.apply_into(physical_in, physical_out)?;
        if physical_out.len() != self.physical_to_outer.len() {
            return Err(format!(
                "labeled outer Hessian physical apply_into length mismatch: got {}, expected {}",
                physical_out.len(),
                self.physical_to_outer.len()
            ));
        }
        out.fill(0.0);
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                out[outer_idx] += physical_out[physical_idx];
            }
        }
        Ok(())
    }

    /// Multi-column analogue of `matvec`: applies the wrapped operator to
    /// every column of `factor`, whose rows are indexed by outer coordinates.
    ///
    /// Returns an error when `factor` does not have `outer_dim` rows, when the
    /// base operator fails, or when the base output does not have one row per
    /// physical index and one column per column of `factor`.
    fn mul_mat(&self, factor: ndarray::ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
        if factor.nrows() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian factor row mismatch: got {}, expected {}",
                factor.nrows(),
                self.outer_dim
            ));
        }
        let mut physical_factor =
            Array2::<f64>::zeros((self.physical_to_outer.len(), factor.ncols()));
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                physical_factor
                    .row_mut(physical_idx)
                    .assign(&factor.row(outer_idx));
            }
        }
        let physical_out = self.base.mul_mat(physical_factor.view())?;
        if physical_out.nrows() != self.physical_to_outer.len() {
            return Err(format!(
                "labeled outer Hessian physical output row mismatch: got {}, expected {}",
                physical_out.nrows(),
                self.physical_to_outer.len()
            ));
        }
        // Validate the column count too: the row accumulation below would
        // otherwise panic, or silently broadcast a single-column output.
        if physical_out.ncols() != factor.ncols() {
            return Err(format!(
                "labeled outer Hessian physical output column mismatch: got {}, expected {}",
                physical_out.ncols(),
                factor.ncols()
            ));
        }
        let mut out = Array2::<f64>::zeros((self.outer_dim, factor.ncols()));
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                let physical_row = physical_out.row(physical_idx);
                out.row_mut(outer_idx).scaled_add(1.0, &physical_row);
            }
        }
        Ok(out)
    }

    /// Forwards the cheapness hint of the wrapped operator.
    fn is_cheap_to_materialize(&self) -> bool {
        self.base.is_cheap_to_materialize()
    }

    /// Forwards the materialization capability of the wrapped operator.
    fn materialization_capability(
        &self,
    ) -> crate::solver::outer_strategy::OuterHessianMaterialization {
        self.base.materialization_capability()
    }
}

/// Asks the family for its batched outer-Hessian terms and returns them as an
/// outer-Hessian operator.
///
/// Returns `Ok(None)` when `eval_mode` is not
/// `EvalMode::ValueGradientHessian`, when the family supplies no batched
/// terms, or when the supplied Hessian is `Unavailable`. An operator-backed
/// Hessian is passed through as is; an analytic dense matrix is wrapped in
/// `OwnedDenseOuterHessianOperator`.
fn custom_family_batched_outer_hessian_operator<F: CustomFamily>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    rho: &Array1<f64>,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    eval_mode: EvalMode,
) -> Result<Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>, String> {
    use crate::solver::outer_strategy::HessianResult;

    if eval_mode != EvalMode::ValueGradientHessian {
        return Ok(None);
    }
    let maybe_terms =
        family.batched_outer_hessian_terms(states, specs, derivative_blocks, rho, workspace)?;
    let operator = maybe_terms.and_then(|terms| match terms.outer_hessian {
        HessianResult::Operator(operator) => Some(operator),
        HessianResult::Analytic(matrix) => {
            let dense: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator> =
                Arc::new(OwnedDenseOuterHessianOperator { matrix });
            Some(dense)
        }
        HessianResult::Unavailable => None,
    });
    Ok(operator)
}

/// Packages an EFS evaluation with its warm start and convergence flag into
/// the public result type, wrapping the warm start in `CustomFamilyWarmStart`.
fn outer_efs_result_to_joint_hyper_efs_result(
    efs_eval: crate::solver::outer_strategy::EfsEval,
    warm_start: ConstrainedWarmStart,
    inner_converged: bool,
) -> CustomFamilyJointHyperEfsResult {
    let warm_start = CustomFamilyWarmStart { inner: warm_start };
    CustomFamilyJointHyperEfsResult {
        efs_eval,
        warm_start,
        inner_converged,
    }
}

// Unified exact joint hyper-calculus over theta = [rho, psi].
//
// The correct outer problem is not “a rho objective plus a separate psi
// objective”. It is one profiled/Laplace surface over one flattened hypervector
//
//   theta = [rho, psi],
//
// one flattened joint coefficient vector
//
//   beta = [beta_1; ...; beta_B],
//
// and one joint exact mode system
//
//   F(beta, theta) := V_beta(beta, theta) = 0,
//   H(beta, theta) := V_beta_beta(beta, theta).
//
// For every hypercoordinate theta_i we need the fixed-beta objects
//
//   V_i = partial_{theta_i} V,
//   g_i = partial_{theta_i} F,
//   H_i = partial_{theta_i} H,
//
// and for every pair (i, j)
//
//   V_ij, g_ij, H_ij,
//
// together with the beta-curvature contractions
//
//   D_beta H[u],
//   D_beta^2 H[u, v],
//   T_i[u] := D_beta H_i[u].
//
// The exact profiled mode response and total Hessian drifts are then
//
//   beta_i  = -H^{-1} g_i,
//   beta_ij = -H^{-1}(g_ij + H_i beta_j + H_j beta_i + D_beta H[beta_i] beta_j),
//
//   dot H_i
//   = H_i + D_beta H[beta_i],
//
//   ddot H_ij
//   = H_ij
//     + T_i[beta_j]
//     + T_j[beta_i]
//     + D_beta H[beta_ij]
//     + D_beta^2 H[beta_i, beta_j].
//
// Hence the exact joint profiled/Laplace derivatives are
//
//   J_i
//   = V_i + 0.5 tr(H^{-1} dot H_i) - 0.5 partial_i log|S(theta)|_+,
//
//   J_ij
//   = (V_ij - g_i^T H^{-1} g_j)
//     + 0.5 [ tr(H^{-1} ddot H_ij)
//             - tr(H^{-1} dot H_j H^{-1} dot H_i) ]
//     - 0.5 partial^2_{ij} log|S(theta)|_+.
//
// In this unified view rho and psi are the same outer calculus. They differ
// only in where their fixed-beta derivative objects come from:
//
// - rho coordinates often contribute only through the penalty surface, but
//   the generic assembler intentionally treats the penalty as S(theta), not
//   S(rho), so mixed rho/psi penalty terms are allowed whenever realized
//   component penalties move with psi:
//     V_i  = D_i  + 0.5 beta^T S_i beta
//     g_i  = D_beta_i  + S_i beta
//     H_i  = D_beta_beta_i + S_i
//     V_ij = D_ij + 0.5 beta^T S_ij beta
//     g_ij = D_beta_ij + S_ij beta
//     H_ij = D_beta_beta_ij + S_ij.
//
// - psi coordinates come from the family-specific joint exact psi hooks, while
//   the generic assembler still owns any realized-penalty motion through
//   S_i / S_ij:
//     objective_psi            <-> V_i
//     score_psi                <-> g_i
//     hessian_psi              <-> H_i
//     objective_psi_psi        <-> V_ij
//     score_psi_psi            <-> g_ij
//     hessian_psi_psi          <-> H_ij
//     D_beta H_psi[u]          <-> T_i[u].
//
// For coupled families this means any block-local psi path is wrong. Even when
// g_i is sparse or penalty-local, beta_i is defined by the full joint solve
//
//   beta_i = -H^{-1} g_i,
//
// so every exact outer derivative must be assembled in this joint flattened
// space.

/// Structured error type for the custom-family fitting path.
///
/// The `Display` text of each variant is generated by `thiserror` from its
/// `#[error(...)]` attribute. Many functions in this module return
/// `Result<_, String>`; they build one of these variants and convert it to a
/// `String` so the message format stays defined in one place.
#[derive(Debug, Clone, Error)]
pub enum CustomFamilyError {
    /// Invalid input detected at a named call site. Renders as
    /// `custom-family invalid input in {context}: {reason}`.
    #[error("custom-family invalid input in {context}: {reason}")]
    InvalidInput {
        context: &'static str,
        reason: String,
    },
    /// Optimization failure at a named call site. Renders as
    /// `custom-family optimization error in {context}: {reason}`.
    #[error("custom-family optimization error in {context}: {reason}")]
    Optimization {
        context: &'static str,
        reason: String,
    },
    /// A length or shape disagreement between related inputs (e.g. an offset
    /// whose length differs from the design row count). Renders as the bare
    /// `reason`.
    #[error("{reason}")]
    DimensionMismatch { reason: String },
    /// A numerical failure. Renders as the bare `reason`.
    #[error("{reason}")]
    NumericalFailure { reason: String },
    /// A structural rule on the inputs was broken (e.g. duplicate block
    /// names, a non-finite fixed log-precision, or an unpaired stacked
    /// design/offset). Renders as the bare `reason`.
    #[error("{reason}")]
    ConstraintViolation { reason: String },
    /// The requested configuration is not supported (e.g. a fit with no
    /// parameter blocks). Renders as the bare `reason`.
    #[error("{reason}")]
    UnsupportedConfiguration { reason: String },
    /// A basis decomposition failed. Renders as the bare `reason`.
    #[error("{reason}")]
    BasisDecompositionFailed { reason: String },
    /// Pre-fit cross-block identifiability audit refused the fit. The
    /// joint design across `ParameterBlockSpec`s carries a rank
    /// deficiency that the post-`joint_null_rotation` absorption did
    /// not resolve: two or more blocks contribute the same direction,
    /// or a structural >2-way alias was detected without per-pair
    /// attribution. The full `IdentifiabilityAudit` is held so
    /// consumers (logs, structured-error sinks, the seed driver's
    /// classifier) can extract the alias pairs and the summary string
    /// without reparsing.
    #[error("identifiability audit refused the fit: {}", audit.summary)]
    IdentifiabilityFailure {
        audit: crate::solver::identifiability_audit::IdentifiabilityAudit,
    },
    /// MAP estimate uniqueness condition `ker(J^T W J) ∩ ker(S) = {0}` is
    /// violated.  A null direction of `J^T W J` carries zero penalty
    /// curvature, so the posterior is flat along that direction and the
    /// MAP is non-unique.  The structured [`MapUniquenessError`] names the
    /// dominant block so the caller can add the missing penalty or remove
    /// the unpenalised direction.
    #[error("MAP estimate non-unique: {}", error)]
    MapUniquenessFailure {
        error: crate::solver::identifiability_audit::MapUniquenessError,
    },
}

/// Lifts a bare `String` message into a structured error.
///
/// The message becomes the `reason` of an `InvalidInput` variant tagged with
/// the fixed context `"custom-family string boundary"`.
impl From<String> for CustomFamilyError {
    fn from(reason: String) -> Self {
        let context = "custom-family string boundary";
        Self::InvalidInput { context, reason }
    }
}

/// Flattens a structured error into its `Display` text so it can cross
/// `Result<_, String>` boundaries via `.into()`.
impl From<CustomFamilyError> for String {
    fn from(err: CustomFamilyError) -> Self {
        // Rendering is defined by the `#[error(...)]` attribute of the variant.
        err.to_string()
    }
}

/// Fit-level validation of the parameter block specs.
///
/// `fit_custom_family` is a fit entry point and needs at least one parameter
/// block, since an empty model has nothing to estimate. That non-emptiness is
/// a *fit-level precondition* and is the only thing checked here directly;
/// the *consistency* of the specs themselves is delegated to
/// `validate_blockspec_consistency`.
///
/// # Returns
/// The per-block penalty counts produced by the consistency check.
///
/// # Errors
/// `UnsupportedConfiguration` (as a `String`) for an empty slice; otherwise
/// whatever the consistency check reports.
pub(crate) fn validate_blockspecs(specs: &[ParameterBlockSpec]) -> Result<Vec<usize>, String> {
    if !specs.is_empty() {
        return validate_blockspec_consistency(specs);
    }
    let empty_model = CustomFamilyError::UnsupportedConfiguration {
        reason: "fit_custom_family requires at least one parameter block".to_string(),
    };
    Err(empty_model.into())
}

/// Check that a slice of parameter block specs is *internally consistent*,
/// without imposing the fit-level "at least one block" precondition.
///
/// Checks run in this order, and the first failure is returned:
///
/// 1. block names are unique across the whole slice;
/// 2. per block, `offset` has one entry per design row;
/// 3. per block, `stacked_design` and `stacked_offset` are both present or
///    both absent; when present they agree on row count and the stacked
///    design has the same column count as `design`;
/// 4. per block, `initial_beta` (when supplied) has one entry per design
///    column;
/// 5. per block, there is one `initial_log_lambdas` entry per penalty;
/// 6. per block, every penalty is `p x p` with `p = design.ncols()`.
///
/// An empty slice is vacuously consistent and returns an empty penalty-count
/// vector. The non-empty fit precondition lives in [`validate_blockspecs`];
/// pure operator-materialization hooks (e.g. `batched_outer_hessian_terms`)
/// must use this consistency check instead, so they can be probed with an
/// empty, self-consistent argument set without tripping a fit precondition
/// that does not apply to them.
///
/// # Returns
/// The number of penalties of each block, in block order.
///
/// # Errors
/// A `String` rendered from `CustomFamilyError::ConstraintViolation`
/// (duplicate name, unpaired stacked design/offset) or
/// `CustomFamilyError::DimensionMismatch` (all other checks).
pub(crate) fn validate_blockspec_consistency(
    specs: &[ParameterBlockSpec],
) -> Result<Vec<usize>, String> {
    // Small constructors so each failed check is a single `return Err(..)`.
    let dimension_mismatch =
        |reason: String| -> String { CustomFamilyError::DimensionMismatch { reason }.into() };
    let constraint_violation =
        |reason: String| -> String { CustomFamilyError::ConstraintViolation { reason }.into() };

    // Pass 1: name uniqueness over the whole slice, reported before any
    // per-block dimension problem.
    let mut index_by_name = BTreeMap::<String, usize>::new();
    for (b, spec) in specs.iter().enumerate() {
        if let Some(prev) = index_by_name.insert(spec.name.clone(), b) {
            return Err(constraint_violation(format!(
                "duplicate parameter block name '{}' at indices {prev} and {b}: block names must be unique so coefficient labels resolved by name are unambiguous",
                spec.name
            )));
        }
    }

    // Pass 2: per-block dimension agreement.
    let mut penalty_counts = Vec::with_capacity(specs.len());
    for (b, spec) in specs.iter().enumerate() {
        let rows = spec.design.nrows();
        let p = spec.design.ncols();

        let offset_len = spec.offset.len();
        if offset_len != rows {
            return Err(dimension_mismatch(format!(
                "block {b} offset length mismatch: got {}, expected {}",
                offset_len, rows
            )));
        }

        // `solver_design()` / `solver_offset()` must always hand back a
        // matched pair, so the stacked overrides come together or not at all.
        match (&spec.stacked_design, &spec.stacked_offset) {
            (None, None) => {}
            (Some(stacked_design), Some(stacked_offset)) => {
                let stacked_rows = stacked_design.nrows();
                if stacked_rows != stacked_offset.len() {
                    return Err(dimension_mismatch(format!(
                        "block {b} stacked_design/stacked_offset row mismatch: \
                         stacked_design.nrows()={}, stacked_offset.len()={}",
                        stacked_rows,
                        stacked_offset.len(),
                    )));
                }
                let stacked_cols = stacked_design.ncols();
                if stacked_cols != p {
                    return Err(dimension_mismatch(format!(
                        "block {b} stacked_design column count {} disagrees with \
                         design column count {}",
                        stacked_cols, p,
                    )));
                }
            }
            _ => {
                return Err(constraint_violation(format!(
                    "block {b} stacked_design and stacked_offset must be Some together \
                     or both None"
                )));
            }
        }

        let beta0_len = spec.initial_beta.as_ref().map(|beta0| beta0.len());
        if let Some(got) = beta0_len.filter(|&got| got != p) {
            return Err(dimension_mismatch(format!(
                "block {b} initial_beta length mismatch: got {}, expected {p}",
                got
            )));
        }

        let penalty_count = spec.penalties.len();
        let lambda_count = spec.initial_log_lambdas.len();
        if lambda_count != penalty_count {
            return Err(dimension_mismatch(format!(
                "block {b} initial_log_lambdas length {} does not match penalties {}",
                lambda_count, penalty_count
            )));
        }

        for (k, penalty) in spec.penalties.iter().enumerate() {
            let (r, c) = penalty.shape();
            if (r, c) != (p, p) {
                return Err(dimension_mismatch(format!(
                    "block {b} penalty {k} must be {p}x{p}, got {r}x{c}"
                )));
            }
        }

        penalty_counts.push(penalty_count);
    }
    Ok(penalty_counts)
}

/// Run `f` against the design matrix and offset that are currently in force
/// for one parameter block.
///
/// When the family reports static geometry, `f` receives the spec's
/// `solver_design()` / `solver_offset()` directly. When the family reports
/// dynamic geometry, the design and offset are obtained from
/// `family.block_geometry(block_states, spec)` and validated before `f` is
/// called: the row count and the offset length must equal the row count of
/// `spec.solver_design()`, and the column count must equal
/// `spec.design.ncols()`.
///
/// `block_idx` is used only to label error messages.
///
/// # Errors
/// Propagates the error of `block_geometry`, returns a `DimensionMismatch`
/// (as a `String`) when the dynamic geometry has the wrong shape, and
/// otherwise returns whatever `f` returns.
fn with_block_geometry<F: CustomFamily + ?Sized, T>(
    family: &F,
    block_states: &[ParameterBlockState],
    spec: &ParameterBlockSpec,
    block_idx: usize,
    f: impl FnOnce(&DesignMatrix, &Array1<f64>) -> Result<T, String>,
) -> Result<T, String> {
    if !family.block_geometry_is_dynamic() {
        return f(spec.solver_design(), spec.solver_offset());
    }

    let (x_dyn, off_dyn) = family.block_geometry(block_states, spec)?;
    let expected_rows = spec.solver_design().nrows();
    let mismatch =
        |reason: String| -> String { CustomFamilyError::DimensionMismatch { reason }.into() };

    let dyn_rows = x_dyn.nrows();
    if dyn_rows != expected_rows {
        return Err(mismatch(format!(
            "block {block_idx} dynamic design row mismatch: got {}, expected {}",
            dyn_rows, expected_rows
        )));
    }

    let dyn_cols = x_dyn.ncols();
    let expected_cols = spec.design.ncols();
    if dyn_cols != expected_cols {
        return Err(mismatch(format!(
            "block {block_idx} dynamic design col mismatch: got {}, expected {}",
            dyn_cols, expected_cols
        )));
    }

    let dyn_offset_len = off_dyn.len();
    if dyn_offset_len != expected_rows {
        return Err(mismatch(format!(
            "block {block_idx} dynamic offset length mismatch: got {}, expected {}",
            dyn_offset_len, expected_rows
        )));
    }

    f(&x_dyn, &off_dyn)
}

/// Concatenate every block's `initial_log_lambdas` into one flat vector, in
/// block order.
///
/// Blocks without penalties contribute nothing; the result has length equal
/// to the total number of initial log-lambdas across all blocks.
fn flatten_log_lambdas(specs: &[ParameterBlockSpec]) -> Array1<f64> {
    let total: usize = specs
        .iter()
        .map(|spec| spec.initial_log_lambdas.len())
        .sum();
    let mut flat = Array1::<f64>::zeros(total);
    let mut start = 0usize;
    for spec in specs {
        let block = &spec.initial_log_lambdas;
        let end = start + block.len();
        // Skip the slice/assign entirely for penalty-free blocks.
        if end > start {
            flat.slice_mut(ndarray::s![start..end]).assign(block);
        }
        start = end;
    }
    flat
}

/// Mapping between *physical* penalty slots (one per penalty of every block,
/// in block-then-penalty order) and the *outer* log-precision coordinates
/// that are actually optimized.
///
/// Built by `penalty_label_layout`: penalties that share a precision label
/// map to the same outer coordinate, and penalties with a fixed log-precision
/// have no outer coordinate at all.
#[derive(Clone, Debug)]
struct PenaltyLabelLayout {
    /// Number of penalties per block, as passed to `penalty_label_layout`;
    /// used to split a flat physical vector back into per-block pieces.
    penalty_counts: Vec<usize>,
    /// Per physical slot: `Some(outer index)` for an estimated penalty,
    /// `None` for a penalty whose log-precision is fixed.
    physical_to_outer: Vec<Option<usize>>,
    /// Per physical slot: `Some(value)` for a fixed log-precision, `None`
    /// for an estimated penalty.
    fixed_log_lambdas: Vec<Option<f64>>,
    /// Initial value of each outer coordinate, taken from the first physical
    /// penalty that introduced the corresponding label.
    initial_rho: Array1<f64>,
}

impl PenaltyLabelLayout {
    /// Number of physical penalty slots, counting both estimated and fixed
    /// penalties.
    fn physical_count(&self) -> usize {
        let Self {
            physical_to_outer, ..
        } = self;
        physical_to_outer.len()
    }

    /// Whether the outer coordinate vector differs in length from the
    /// physical one.
    ///
    /// That is the case as soon as one label is shared by several physical
    /// penalties or at least one penalty has a fixed log-precision, i.e.
    /// whenever outer and physical `rho` are not interchangeable.
    fn has_tied_coordinates(&self) -> bool {
        self.physical_count() != self.initial_rho.len()
    }
}

/// Build the physical-to-outer coordinate layout for all penalties.
///
/// Penalties are visited in block order, then penalty order. For each one:
///
/// * if it reports a fixed log-precision, it gets no outer coordinate and
///   the fixed value is recorded (a non-finite fixed value is an error);
/// * otherwise its precision label (or a synthesized
///   `__block_{b}_penalty_{k}` label when it has none) selects the outer
///   coordinate. The first penalty with a given label creates the coordinate
///   and supplies its initial value; later penalties with the same label
///   reuse it and must agree on the initial value to within `1e-10` whenever
///   both values are finite.
///
/// `penalty_counts` is stored unchanged in the returned layout. The function
/// indexes `initial_log_lambdas` by penalty index, so it expects specs whose
/// `initial_log_lambdas` have one entry per penalty.
///
/// # Errors
/// A `ConstraintViolation` (as a `String`) for a non-finite fixed
/// log-precision or for inconsistent initial values under one label.
fn penalty_label_layout(
    specs: &[ParameterBlockSpec],
    penalty_counts: Vec<usize>,
) -> Result<PenaltyLabelLayout, String> {
    let mut outer_by_label = BTreeMap::<String, usize>::new();
    let mut physical_to_outer = Vec::<Option<usize>>::new();
    let mut fixed_log_lambdas = Vec::<Option<f64>>::new();
    let mut initial_rho = Vec::<f64>::new();

    for (block_idx, spec) in specs.iter().enumerate() {
        for (penalty_idx, penalty) in spec.penalties.iter().enumerate() {
            // Fixed penalties occupy a physical slot but no outer coordinate.
            if let Some(fixed) = penalty.fixed_log_lambda() {
                if !fixed.is_finite() {
                    return Err(CustomFamilyError::ConstraintViolation {
                        reason: format!(
                            "block {block_idx} penalty {penalty_idx} fixed log-precision is non-finite: {fixed}"
                        ),
                    }
                    .into());
                }
                physical_to_outer.push(None);
                fixed_log_lambdas.push(Some(fixed));
                continue;
            }

            let label = penalty
                .precision_label()
                .map(str::to_owned)
                .unwrap_or_else(|| format!("__block_{block_idx}_penalty_{penalty_idx}"));
            let rho0 = spec.initial_log_lambdas[penalty_idx];

            let outer = match outer_by_label.get(&label).copied() {
                Some(existing) => {
                    // Tied penalty: its initial value must match the one
                    // that created the coordinate (only checked when both
                    // are finite).
                    let first = initial_rho[existing];
                    let both_finite = first.is_finite() && rho0.is_finite();
                    if both_finite && (first - rho0).abs() > 1e-10 {
                        return Err(CustomFamilyError::ConstraintViolation { reason: format!(
                            "precision label '{label}' has inconsistent initial log-precisions: {first} and {rho0}"
                        ) }.into());
                    }
                    existing
                }
                None => {
                    let fresh = initial_rho.len();
                    outer_by_label.insert(label, fresh);
                    initial_rho.push(rho0);
                    fresh
                }
            };
            physical_to_outer.push(Some(outer));
            fixed_log_lambdas.push(None);
        }
    }

    Ok(PenaltyLabelLayout {
        penalty_counts,
        physical_to_outer,
        fixed_log_lambdas,
        initial_rho: Array1::from_vec(initial_rho),
    })
}

/// Expand an outer (labeled) log-lambda vector to one value per physical
/// penalty slot.
///
/// Estimated slots read their value from `rho` through the layout's
/// physical-to-outer map; fixed slots take the fixed log-precision stored in
/// the layout.
///
/// # Errors
/// `DimensionMismatch` (as a `String`) when `rho` does not have one entry per
/// outer coordinate, and `ConstraintViolation` when a slot has neither an
/// outer coordinate nor a fixed value.
fn expand_labeled_log_lambdas(
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array1<f64>, String> {
    let expected = layout.initial_rho.len();
    if rho.len() != expected {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "log-lambda label coordinate mismatch: got {}, expected {}",
                rho.len(),
                expected
            ),
        }
        .into());
    }

    let mut expanded = Array1::<f64>::zeros(layout.physical_count());
    for (physical, slot) in layout.physical_to_outer.iter().enumerate() {
        let value = if let Some(outer) = *slot {
            rho[outer]
        } else if let Some(fixed) = layout.fixed_log_lambdas[physical] {
            fixed
        } else {
            return Err(CustomFamilyError::ConstraintViolation {
                reason: format!("fixed penalty layout missing value at physical slot {physical}"),
            }
            .to_string());
        };
        expanded[physical] = value;
    }
    Ok(expanded)
}

/// Expand an outer (labeled) log-lambda vector to physical coordinates and
/// split it into one vector per block using the layout's penalty counts.
///
/// # Errors
/// Propagates the errors of `expand_labeled_log_lambdas` and
/// `split_log_lambdas`.
fn split_labeled_log_lambdas(
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Vec<Array1<f64>>, String> {
    expand_labeled_log_lambdas(rho, layout)
        .and_then(|physical| split_log_lambdas(&physical, &layout.penalty_counts))
}

/// Pull a gradient in physical penalty coordinates back to the outer
/// (labeled) coordinates.
///
/// Entries of physical slots that share an outer coordinate are summed;
/// entries of fixed slots (no outer coordinate) are dropped.
///
/// # Errors
/// `DimensionMismatch` (as a `String`) when `gradient` does not have one
/// entry per physical slot.
fn aggregate_labeled_gradient(
    gradient: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array1<f64>, String> {
    let physical_count = layout.physical_count();
    if gradient.len() != physical_count {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "physical gradient length mismatch: got {}, expected {}",
                gradient.len(),
                physical_count
            ),
        }
        .into());
    }

    let mut pulled_back = Array1::<f64>::zeros(layout.initial_rho.len());
    let estimated_slots = layout
        .physical_to_outer
        .iter()
        .copied()
        .enumerate()
        .filter_map(|(physical, slot)| slot.map(|outer| (physical, outer)));
    for (physical, outer) in estimated_slots {
        pulled_back[outer] += gradient[physical];
    }
    Ok(pulled_back)
}

/// Pull a dense Hessian in physical penalty coordinates back to the outer
/// (labeled) coordinates.
///
/// Entry `(i, j)` of the physical Hessian is added to entry
/// `(outer(i), outer(j))` of the result; rows and columns belonging to fixed
/// slots (no outer coordinate) are dropped.
///
/// # Errors
/// `DimensionMismatch` (as a `String`) when `hessian` is not square with one
/// row/column per physical slot.
fn aggregate_labeled_hessian(
    hessian: &Array2<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array2<f64>, String> {
    let physical_count = layout.physical_count();
    if hessian.nrows() != physical_count || hessian.ncols() != physical_count {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "physical Hessian shape mismatch: got {}x{}, expected {}x{}",
                hessian.nrows(),
                hessian.ncols(),
                physical_count,
                physical_count
            ),
        }
        .into());
    }

    // (physical index, outer index) for every estimated slot, in physical
    // order so the accumulation order matches a plain double loop.
    let estimated_slots: Vec<(usize, usize)> = layout
        .physical_to_outer
        .iter()
        .copied()
        .enumerate()
        .filter_map(|(physical, slot)| slot.map(|outer| (physical, outer)))
        .collect();

    let outer_count = layout.initial_rho.len();
    let mut pulled_back = Array2::<f64>::zeros((outer_count, outer_count));
    for &(i, oi) in &estimated_slots {
        for &(j, oj) in &estimated_slots {
            pulled_back[[oi, oj]] += hessian[[i, j]];
        }
    }
    Ok(pulled_back)
}

/// Adapter over the shared [`rho_prior_eval`](crate::solver::estimate::reml::rho_prior_eval)
/// engine using the custom-family invalid-prior policy (`HardError`).
///
/// The prior math is shared with the REML/LAML runtime; what this adapter
/// adds is the error translation: a malformed prior surfaces as a structured
/// [`CustomFamilyError`] (rendered to a `String`) rather than being folded
/// into the objective.
///
/// # Returns
/// `(cost, gradient, hessian)` of the prior at `rho`, where the Hessian is
/// optional as reported by the shared engine.
fn rho_prior_cost_gradient_hessian(
    prior: &crate::types::RhoPrior,
    rho: &Array1<f64>,
) -> Result<(f64, Array1<f64>, Option<Array2<f64>>), String> {
    use crate::solver::estimate::reml::rho_prior_eval::{InvalidPriorPolicy, RhoPriorError};
    let outcome = crate::solver::estimate::reml::rho_prior_eval::evaluate(
        prior,
        rho,
        InvalidPriorPolicy::HardError,
    );
    // Map each engine error onto the same-named custom-family variant.
    let eval = outcome.map_err(|err| -> String {
        match err {
            RhoPriorError::DimensionMismatch { reason } => {
                CustomFamilyError::DimensionMismatch { reason }.into()
            }
            RhoPriorError::ConstraintViolation { reason } => {
                CustomFamilyError::ConstraintViolation { reason }.into()
            }
        }
    })?;
    Ok((eval.cost, eval.gradient, eval.hessian))
}

/// Add the rho prior, expressed in outer (labeled) coordinates, to an
/// already pulled-back outer evaluation.
///
/// For tied physical penalties the likelihood/LAML part is evaluated in the
/// expanded physical coordinates and then pulled back to the labeled
/// coordinates. The configured prior lives on the labeled precision itself,
/// so it is added exactly once, after the pullback:
///
///   V_label(rho) = V_base(E rho) + pi(rho),
///   ∇V_label     = E' ∇V_base(E rho) + ∇pi(rho),
///   ∇²V_label    = E' ∇²V_base(E rho) E + ∇²pi(rho),
///
/// where E maps each physical penalty piece to its outer label. This is the
/// same change-of-variables identity used for overlapping/nested group
/// penalties; the prior is not repeated for each physical child component.
///
/// A `Flat` prior leaves `result` untouched. Otherwise the prior cost is
/// always added; the gradient is added unless `eval_mode` is `ValueOnly`;
/// the Hessian is added only for `ValueGradientHessian` and only when the
/// prior supplies one.
///
/// # Errors
/// Propagates prior-evaluation errors, reports a `DimensionMismatch` when
/// the prior gradient and the result gradient differ in length, and
/// propagates any failure to add the prior Hessian block.
fn add_labeled_rho_prior_to_outer_eval(
    mut result: OuterObjectiveEvalResult,
    rho: &Array1<f64>,
    rho_prior: &crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    if matches!(rho_prior, crate::types::RhoPrior::Flat) {
        return Ok(result);
    }

    let wants_gradient = eval_mode != EvalMode::ValueOnly;
    let wants_hessian = eval_mode == EvalMode::ValueGradientHessian;

    let (prior_cost, prior_gradient, prior_hessian) =
        rho_prior_cost_gradient_hessian(rho_prior, rho)?;
    result.objective += prior_cost;

    if wants_gradient {
        if result.gradient.len() != prior_gradient.len() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "rho prior gradient length mismatch: got {}, expected {}",
                    prior_gradient.len(),
                    result.gradient.len()
                ),
            }
            .into());
        }
        result.gradient += &prior_gradient;
    }

    if let Some(prior_hessian) = prior_hessian.filter(|_| wants_hessian) {
        result.outer_hessian.add_rho_block_dense(&prior_hessian)?;
    }
    Ok(result)
}

fn physical_warm_start_for_labeled(
    warm_start: Option<&ConstrainedWarmStart>,
    physical_rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Option<ConstrainedWarmStart> {
    if !layout.has_tied_coordinates() {
        return None;
    }
    warm_start.map(|seed| {
        let mut physical_seed = seed.clone();
        physical_seed.rho = physical_rho.clone();
        physical_seed
    })
}

/// Convert an outer evaluation computed in physical penalty coordinates into
/// one expressed in outer (labeled) coordinates, then add the rho prior.
///
/// * Gradient: aggregated onto the labeled coordinates, or replaced by a
///   zero vector of labeled length when `eval_mode` is `ValueOnly`.
/// * Hessian (only for `ValueGradientHessian`): a dense Hessian is
///   aggregated, an operator is wrapped in a `LabeledOuterHessianOperator`,
///   and an unavailable Hessian stays unavailable.
/// * The warm start's `rho` is overwritten with the labeled `rho`.
///
/// # Errors
/// Propagates aggregation and prior errors.
fn pullback_labeled_outer_eval(
    mut result: OuterObjectiveEvalResult,
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
    rho_prior: &crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    let wants_gradient = eval_mode != EvalMode::ValueOnly;
    result.gradient = if wants_gradient {
        aggregate_labeled_gradient(&result.gradient, layout)?
    } else {
        Array1::<f64>::zeros(layout.initial_rho.len())
    };

    if eval_mode == EvalMode::ValueGradientHessian {
        let labeled_hessian = match result.outer_hessian {
            crate::solver::outer_strategy::HessianResult::Unavailable => {
                crate::solver::outer_strategy::HessianResult::Unavailable
            }
            crate::solver::outer_strategy::HessianResult::Operator(physical_operator) => {
                let labeled_operator = LabeledOuterHessianOperator::new(physical_operator, layout);
                crate::solver::outer_strategy::HessianResult::Operator(Arc::new(labeled_operator))
            }
            crate::solver::outer_strategy::HessianResult::Analytic(physical_hessian) => {
                let labeled_dense = aggregate_labeled_hessian(&physical_hessian, layout)?;
                crate::solver::outer_strategy::HessianResult::Analytic(labeled_dense)
            }
        };
        result.outer_hessian = labeled_hessian;
    }

    // The caller works in labeled coordinates, so the returned warm start
    // must carry the labeled rho rather than the physical one.
    result.warm_start.rho = rho.clone();
    add_labeled_rho_prior_to_outer_eval(result, rho, rho_prior, eval_mode)
}

/// Evaluate the outer objective (and, depending on `eval_mode`, its gradient
/// and Hessian) at a point given in outer (labeled) coordinates.
///
/// The labeled `rho` is expanded to physical coordinates, the internal
/// evaluator is run there with a `Flat` prior, and the result is pulled back
/// to labeled coordinates where the configured `rho_prior` is added once.
///
/// The inner solve is seeded with the physical re-expression of
/// `warm_start` when the layout needs one, and with `warm_start` itself
/// otherwise.
///
/// # Errors
/// Propagates errors from the expansion, the internal evaluation, and the
/// pullback.
fn outerobjectivegradienthessian_labeled<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    layout: &PenaltyLabelLayout,
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: &crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    let physical_rho = expand_labeled_log_lambdas(rho, layout)?;
    let physical_seed = physical_warm_start_for_labeled(warm_start, &physical_rho, layout);
    let inner_seed = physical_seed.as_ref().or(warm_start);

    // The prior is deliberately withheld here (`Flat`) and added after the
    // pullback, in labeled coordinates.
    let physical_eval = outerobjectivegradienthessian_internal(
        family,
        specs,
        options,
        &layout.penalty_counts,
        &physical_rho,
        inner_seed,
        crate::types::RhoPrior::Flat,
        eval_mode,
    )?;
    pullback_labeled_outer_eval(physical_eval, rho, layout, rho_prior, eval_mode)
}

/// Cheap ranking score for a candidate seed given in outer (labeled)
/// coordinates.
///
/// Runs one inner blockwise fit at the expanded physical log-lambdas with
/// `seed_screening` switched on, refreshes every block's linear predictor,
/// and scores the result as the inner penalized objective plus the rho-prior
/// cost at the labeled `rho`.
///
/// # Returns
/// `(score, warm_start, inner_converged)`, where the warm start carries the
/// labeled `rho`, the fitted block coefficients, the active sets, and the
/// cached inner mode.
///
/// # Errors
/// Propagates errors from the expansion/split, the inner fit, the eta
/// refresh, the prior evaluation, and the objective evaluation.
fn custom_family_seed_screening_proxy_labeled<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    layout: &PenaltyLabelLayout,
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: &crate::types::RhoPrior,
) -> Result<(f64, ConstrainedWarmStart, bool), String> {
    let physical_rho = expand_labeled_log_lambdas(rho, layout)?;
    let per_block = split_log_lambdas(&physical_rho, &layout.penalty_counts)?;
    let physical_seed = physical_warm_start_for_labeled(warm_start, &physical_rho, layout);

    // Seed screening only RANKS candidate seeds by their penalized objective;
    // it is capped and never produces the final fit. Flagging the inner solve
    // as a screening solve lets it skip the O(p · per-axis-Hdot) full
    // Jeffreys/Firth curvature loop and keep only the cheap value-only
    // Jeffreys term in the score (gam#729/#808). For a K-block coupled family
    // (Dirichlet/multinomial) each per-axis directional derivative is
    // O(K²·n·p), so paying the full term for every cascade candidate over the
    // joint width is the wrong cost class and made the coupled fit
    // non-completing in screening alone. The real fit (after a seed is
    // selected) runs with `seed_screening = false`, so the load-bearing Firth
    // curvature is fully present where it matters.
    let mut screening_options = options.clone();
    screening_options.seed_screening = true;

    let inner_seed = physical_seed.as_ref().or(warm_start);
    let mut inner = inner_blockwise_fit(family, specs, &per_block, &screening_options, inner_seed)?;
    refresh_all_block_etas(family, specs, &mut inner.block_states)?;

    let (prior_cost, _, _) = rho_prior_cost_gradient_hessian(rho_prior, rho)?;
    let inner_objective = inner_penalized_objective(
        &inner,
        include_exact_newton_logdet_h(family, options),
        include_exact_newton_logdet_s(family, options),
        "custom-family labeled seed-screening proxy",
    )?;
    let score = inner_objective + prior_cost;

    let block_beta = inner
        .block_states
        .iter()
        .map(|state| state.beta.clone())
        .collect();
    let warm = ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta,
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(&inner)),
    };
    Ok((score, warm, inner.converged))
}

/// Split a flat log-lambda vector into consecutive per-block vectors.
///
/// `penalty_counts[b]` is the number of entries belonging to block `b`; the
/// pieces are returned in block order and blocks with a count of zero yield
/// an empty vector.
///
/// # Errors
/// `DimensionMismatch` (as a `String`) when `flat` does not have exactly
/// `penalty_counts.iter().sum()` entries.
fn split_log_lambdas(
    flat: &Array1<f64>,
    penalty_counts: &[usize],
) -> Result<Vec<Array1<f64>>, String> {
    let expected: usize = penalty_counts.iter().sum();
    if flat.len() != expected {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "log-lambda length mismatch: got {}, expected {expected}",
                flat.len()
            ),
        }
        .into());
    }

    let mut start = 0usize;
    let pieces: Vec<Array1<f64>> = penalty_counts
        .iter()
        .map(|&count| {
            let end = start + count;
            let piece = flat.slice(ndarray::s![start..end]).to_owned();
            start = end;
            piece
        })
        .collect();
    Ok(pieces)
}

/// Build the cold-start state (coefficients and linear predictor) of every
/// parameter block.
///
/// Pass 1 seeds each block with its `initial_beta` (or zeros when none is
/// given) and computes `eta = X beta + offset` under the block's current
/// geometry; blocks built earlier are visible to the geometry of later ones.
///
/// Pass 2 sends every coefficient vector through
/// `family.post_update_block_beta`, so the coefficients handed to the inner
/// solve are feasible from the first evaluation onward.
///
/// `eta` is NOT recomputed after the projection: the caller
/// (`inner_blockwise_fit`) calls `refresh_all_block_etas` immediately after
/// this returns, so the refresh is not duplicated here.
///
/// # Errors
/// Propagates geometry errors from pass 1 and projection errors from pass 2.
fn buildblock_states<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
) -> Result<Vec<ParameterBlockState>, String> {
    let mut states = Vec::with_capacity(specs.len());
    for (block_idx, spec) in specs.iter().enumerate() {
        let beta = match &spec.initial_beta {
            Some(beta0) => beta0.clone(),
            None => Array1::<f64>::zeros(spec.design.ncols()),
        };
        let eta = with_block_geometry(family, &states, spec, block_idx, |x, off| {
            Ok(x.matrixvectormultiply(&beta) + off)
        })?;
        states.push(ParameterBlockState { beta, eta });
    }

    // The projection is a separate second pass because some family overrides
    // (e.g. `SurvivalMarginalSlopeFamily::post_update_block_beta`) read
    // `block_states[block_idx]` while projecting, and that entry exists only
    // once the first pass has pushed every state.
    //
    // Without the projection, an infeasible `initial_beta` (or a zero vector
    // that violates the family's bounds when `initial_beta` is `None`) would
    // reach `exact_newton_joint_hessian` / `evaluate` before the first
    // line-search trial, silently corrupting the fit or tripping
    // `max_feasible_step_size` guards on iteration 1. The warm-start seed
    // path projects on entry for the same reason; this extends that
    // invariant to the cold-start path.
    for (block_idx, spec) in specs.iter().enumerate() {
        let unprojected = states[block_idx].beta.clone();
        let projected = family.post_update_block_beta(&states, block_idx, spec, unprojected)?;
        states[block_idx].beta.assign(&projected);
    }
    Ok(states)
}

/// Recompute `eta = X beta + offset` for every block from its current
/// coefficients.
///
/// With static geometry each block depends only on its own spec and
/// coefficients, so all predictors are computed in parallel and then written
/// back. With dynamic geometry the blocks are refreshed one after another,
/// in block order, through `refresh_single_block_eta`, so each refresh sees
/// the states as updated by the previous ones.
///
/// # Errors
/// Propagates errors from the per-block refresh on the dynamic path; the
/// static path does not produce errors of its own.
fn refresh_all_block_etas<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
) -> Result<(), String> {
    if !family.block_geometry_is_dynamic() {
        use rayon::iter::{IntoParallelIterator, ParallelIterator};

        let fresh_etas: Vec<Array1<f64>> = (0..specs.len())
            .into_par_iter()
            .map(|b| {
                let spec = &specs[b];
                spec.solver_design().matrixvectormultiply(&states[b].beta) + spec.solver_offset()
            })
            .collect();

        for (state, eta) in states.iter_mut().zip(fresh_etas) {
            state.eta = eta;
        }
        return Ok(());
    }

    for b in 0..specs.len() {
        refresh_single_block_eta(family, specs, states, b)?;
    }
    Ok(())
}

/// Recompute `eta = X beta + offset` for the block at `block_idx`, using the
/// geometry that `with_block_geometry` selects for the current states.
///
/// # Errors
/// Propagates geometry errors from `with_block_geometry`.
fn refresh_single_block_eta<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
    block_idx: usize,
) -> Result<(), String> {
    let spec = &specs[block_idx];
    // Own a copy of the coefficients: `states` is borrowed as a whole by the
    // geometry call below.
    let beta = states[block_idx].beta.clone();
    let refreshed = with_block_geometry(family, states, spec, block_idx, |x, off| {
        Ok(x.matrixvectormultiply(&beta) + off)
    })?;
    states[block_idx].eta = refreshed;
    Ok(())
}

/// Inner-cycle budget after applying the optional screening and outer caps.
///
/// Starts from `base_cycles` and takes the minimum with each cap that is
/// present and strictly positive; the result is never below 1.
///
/// A stored cap of `0` means "no cap". For the outer cap, `0` is the
/// `SEED_SCREENING_UNCAPPED` sentinel: "use the full
/// `pirls_config.max_iterations`". The outer bridges store it for the
/// line-search COST probe so the deciding cost is the true converged-inner
/// envelope objective the analytic gradient differentiates (gam#787/#808).
/// Treating that `0` as a real cap would collapse the probe to a single
/// inner cycle (because of the final `.max(1)`), guaranteeing a
/// non-converged inner solve and a spurious `∞` cost, re-introducing the
/// frozen-|g| outer stall the uncap was meant to remove.
#[inline]
fn capped_inner_max_cycles(options: &BlockwiseFitOptions, base_cycles: usize) -> usize {
    let screening_cap = options
        .screening_max_inner_iterations
        .as_ref()
        .map(|limit| limit.load(Ordering::Relaxed));
    let outer_cap = options
        .outer_inner_max_iterations
        .as_ref()
        .map(|limit| limit.load(Ordering::Relaxed));

    let capped = [screening_cap, outer_cap]
        .into_iter()
        .flatten()
        // `0` is "uncapped" for both sources, so it must not enter the min.
        .filter(|&limit| limit > 0)
        .fold(base_cycles, |cap, limit| cap.min(limit));
    capped.max(1)
}

/// Assemble the weighted normal-equation pieces for a design matrix.
///
/// Computes the weighted Gram matrix of `x` with weights `w` via
/// `xt_diag_x_signed_op` and, when a working response `y_star` is supplied,
/// the matching right-hand side via `compute_xtwy`.
///
/// # Returns
/// `(xtwx, xtwy)`, where `xtwy` is `None` exactly when `y_star` is `None`.
///
/// # Errors
/// `DimensionMismatch` (as a `String`) when `w` or `y_star` does not have one
/// entry per row of `x`; otherwise propagates errors from the two products.
fn weighted_normal_equations(
    x: &DesignMatrix,
    w: &Array1<f64>,
    y_star: Option<&Array1<f64>>,
) -> Result<(Array2<f64>, Option<Array1<f64>>), String> {
    let n = x.nrows();
    if w.len() != n {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: "weighted normal-equation dimension mismatch".to_string(),
        }
        .into());
    }
    if y_star.is_some_and(|y| y.len() != n) {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: "weighted RHS dimension mismatch".to_string(),
        }
        .into());
    }

    let xtwx = x.xt_diag_x_signed_op(SignedWeightsView::from_array(w))?;
    let xtwy = y_star.map(|y| x.compute_xtwy(w, y)).transpose()?;
    Ok((xtwx, xtwy))
}

/// Diagonal shift that makes the penalized joint Hessian Cholesky-factorable
/// (positive definite at the solver floor), or `None` when the matrix already
/// factors and needs no shift.
///
/// PERF (gam#729/#826): the stabilizing shift is recomputed every inner Newton
/// cycle. For a coupled K-block family (Dirichlet/multinomial) the joint Hessian
/// is structurally near-singular along the cross-block gauge / sum-to-zero null
/// space, so a shift fires on (almost) every cycle. A full dense self-adjoint
/// eigendecomposition just to read `min_eval` was the dominant per-cycle cost,
/// and a Cholesky ridge-escalation search can need many factorizations on a
/// hard near-singular block. This routine therefore does:
///
/// 1. one plain Cholesky probe — success means no shift (`None`);
/// 2. otherwise a single `O(p²)` Gershgorin lower bound `g ≤ λ_min` and the
///    shift `δ = floor − g`, which guarantees `λ_min(H + δI) ≥ floor > 0`.
///
/// The returned shift is a sufficient (conservative) one, not the smallest
/// possible: it may exceed the exact eigenvalue-based shift.
///
/// If any Gershgorin row bound is non-finite (NaN or ±∞ entries in the row),
/// a scale-based fallback shift `max(floor, 1e-6·max|H_ii|, 1e-6)` is returned
/// instead of a bound computed from the remaining rows.
fn exact_newton_stabilizing_shift(lhs_dense: &Array2<f64>, ridge_floor: f64) -> Option<f64> {
    let floor = effective_solverridge(ridge_floor);
    // Fast path: already PD at zero shift ⇒ no stabilization needed. One Cholesky
    // (O(p³/3)), the common case on a well-conditioned cycle.
    if lhs_dense.cholesky(Side::Lower).is_ok() {
        return None;
    }
    // Near-singular / indefinite: use the Gershgorin lower bound on `λ_min`.
    // Every eigenvalue lies in some disc `[H_ii − R_i, H_ii + R_i]` with
    // `R_i = Σ_{j≠i} |H_ij|`, so `λ_min ≥ min_i (H_ii − R_i) =: g`. Shifting by
    // `δ = floor − g` (when `g` is below the floor) gives
    // `λ_min(H + δI) = λ_min + δ ≥ floor > 0`. The downstream solve only
    // requires PD, not the tightest possible shift.
    let p = lhs_dense.nrows();
    let mut gershgorin_min = f64::INFINITY;
    for i in 0..p {
        let diag = lhs_dense[[i, i]];
        let mut radius = 0.0_f64;
        for j in 0..p {
            if j != i {
                radius += lhs_dense[[i, j]].abs();
            }
        }
        let row_bound = diag - radius;
        if row_bound.is_nan() {
            // `f64::min` silently discards a NaN operand, which would let a
            // poisoned row vanish from the bound. Force the non-finite
            // fallback below instead.
            gershgorin_min = f64::NAN;
            break;
        }
        gershgorin_min = gershgorin_min.min(row_bound);
    }
    if !gershgorin_min.is_finite() {
        // No usable bound: fall back to a shift scaled by the largest
        // (non-NaN) diagonal magnitude, never below the floor or 1e-6.
        let diag_max = (0..p)
            .map(|d| lhs_dense[[d, d]].abs())
            .fold(0.0_f64, f64::max);
        return Some(floor.max(diag_max * 1e-6).max(1e-6));
    }
    if gershgorin_min >= floor {
        // Gershgorin certifies PD-at-floor but the no-shift Cholesky failed
        // (round-off on a barely-PD matrix): a floor-sized shift suffices.
        return Some(floor);
    }
    Some(floor - gershgorin_min)
}

/// Adds the shift chosen by `exact_newton_stabilizing_shift` to the diagonal
/// of `lhs_dense`, in place.
///
/// Nothing is modified when the family requests the strict-SPD exact-Newton
/// path (`use_exact_newton_strict_spd`) or when no shift is needed.
fn stabilize_exact_newton_lhs_in_place<F: CustomFamily + ?Sized>(
    family: &F,
    lhs_dense: &mut Array2<f64>,
    ridge_floor: f64,
) {
    if use_exact_newton_strict_spd(family) {
        return;
    }
    let Some(shift) = exact_newton_stabilizing_shift(lhs_dense, ridge_floor) else {
        return;
    };
    let dim = lhs_dense.nrows();
    (0..dim).for_each(|d| lhs_dense[[d, d]] += shift);
}

/// Re-expresses a constraint system for a step taken from `beta`.
///
/// The matrix `A` is copied unchanged and the right-hand side becomes
/// `b − A·beta`, so a condition on `beta + δ` against `b` turns into the same
/// condition on `δ` against the shifted right-hand side.
///
/// # Errors
/// Fails when `A` does not have one column per coefficient or one row per
/// entry of `b`.
fn shift_linear_constraints_to_delta(
    constraints: &LinearInequalityConstraints,
    beta: &Array1<f64>,
) -> Result<LinearInequalityConstraints, String> {
    let width_matches = constraints.a.ncols() == beta.len();
    let rows_match = constraints.a.nrows() == constraints.b.len();
    if !(width_matches && rows_match) {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: "linear constraints: shape mismatch".to_string(),
        }
        .into());
    }
    let a_beta = constraints.a.dot(beta);
    let shifted_b = &constraints.b - &a_beta;
    Ok(LinearInequalityConstraints {
        a: constraints.a.clone(),
        b: shifted_b,
    })
}

/// Queries the family for the linear constraints of every block, in spec
/// order, stopping at the first block whose query fails.
fn collect_block_linear_constraints<F: CustomFamily + ?Sized>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Result<Vec<Option<LinearInequalityConstraints>>, String> {
    specs
        .iter()
        .enumerate()
        .map(
            |(block_idx, spec)| -> Result<Option<LinearInequalityConstraints>, String> {
                // `?` keeps the same error conversion into `String` as a plain loop would.
                let block = family.block_linear_constraints(states, block_idx, spec)?;
                Ok(block)
            },
        )
        .collect()
}

/// Guards against a post-update hook rewriting the coefficients of a
/// constrained block.
///
/// Blocks without constraints always pass. For a constrained block the
/// updated coefficients must have the same length as the raw ones, match the
/// constraint matrix width, and differ from the raw coefficients by at most
/// `1e-10 * (1 + max(‖raw‖∞, ‖updated‖∞))` in the max norm.
///
/// # Errors
/// Returns a dimension-mismatch message for the length/width checks and a
/// constraint-violation message when the hook changed the coefficients beyond
/// the tolerance.
fn reject_constrained_post_update_repair(
    block_idx: usize,
    spec: &ParameterBlockSpec,
    raw_beta: &Array1<f64>,
    updated_beta: &Array1<f64>,
    constraints: Option<&LinearInequalityConstraints>,
) -> Result<(), String> {
    let constraints = match constraints {
        Some(active) => active,
        None => return Ok(()),
    };

    let raw_len = raw_beta.len();
    let updated_len = updated_beta.len();
    if raw_len != updated_len {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "post-update beta length changed for constrained block '{}' (idx {block_idx}): raw={}, updated={}",
                spec.name, raw_len, updated_len,
            ),
        }
        .into());
    }

    let constraint_width = constraints.a.ncols();
    if raw_len != constraint_width {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "post-update constrained block '{}' (idx {block_idx}) width mismatch: beta={}, constraints={}",
                spec.name, raw_len, constraint_width,
            ),
        }
        .into());
    }

    // Max-norm helper; `f64::max` skips NaN operands exactly as the fold does.
    let inf_norm = |values: &Array1<f64>| values.iter().fold(0.0_f64, |acc, v| acc.max(v.abs()));

    let mut max_change = 0.0_f64;
    for (raw, updated) in raw_beta.iter().zip(updated_beta.iter()) {
        max_change = max_change.max((raw - updated).abs());
    }
    let raw_scale = inf_norm(raw_beta);
    let updated_scale = inf_norm(updated_beta);
    let tol = 1e-10 * (1.0 + raw_scale.max(updated_scale));

    if max_change <= tol {
        return Ok(());
    }
    Err(CustomFamilyError::ConstraintViolation {
        reason: format!(
            "post-update hook materially changed constrained block '{}' (idx {block_idx}): \
             max |β_post - β_qp|={max_change:.3e} > tol={tol:.3e}; \
             constraints must be represented analytically in block_linear_constraints, not repaired after the Newton/QP solve",
            spec.name,
        ),
    }
    .into())
}

/// Stacks the per-block constraint systems into one joint system over the
/// full coefficient vector.
///
/// Each block's `A` is written into the columns given by its entry in
/// `ranges`, with blocks occupying consecutive row bands in block order; all
/// other entries are zero. Returns `Ok(None)` when no block contributes any
/// rows.
///
/// # Errors
/// Fails when the number of blocks and ranges differ, or when a block's `A`
/// does not match its column range or the length of its `b`.
fn assemble_joint_linear_constraints(
    block_constraints: &[Option<LinearInequalityConstraints>],
    ranges: &[(usize, usize)],
    total_p: usize,
) -> Result<Option<LinearInequalityConstraints>, String> {
    let n_blocks = block_constraints.len();
    if n_blocks != ranges.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "joint linear constraint assembly mismatch: {} blocks but {} ranges",
                n_blocks,
                ranges.len()
            ),
        }
        .into());
    }

    let total_rows: usize = block_constraints
        .iter()
        .flatten()
        .map(|c| c.a.nrows())
        .sum();
    if total_rows == 0 {
        return Ok(None);
    }

    let mut a = Array2::<f64>::zeros((total_rows, total_p));
    let mut b = Array1::<f64>::zeros(total_rows);
    let mut next_row = 0usize;
    for (block_idx, (slot, &(start, end))) in
        block_constraints.iter().zip(ranges.iter()).enumerate()
    {
        let Some(constraints) = slot.as_ref() else {
            continue;
        };
        let block_p = end - start;
        let rows = constraints.a.nrows();
        let shape_ok = constraints.a.ncols() == block_p && rows == constraints.b.len();
        if !shape_ok {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "joint linear constraint assembly mismatch for block {block_idx}: A is {}x{}, b is {}, block width is {}",
                rows,
                constraints.a.ncols(),
                constraints.b.len(),
                block_p
            ) }.into());
        }
        let row_band = next_row..(next_row + rows);
        a.slice_mut(s![row_band.clone(), start..end])
            .assign(&constraints.a);
        b.slice_mut(s![row_band]).assign(&constraints.b);
        next_row += rows;
    }
    Ok(Some(LinearInequalityConstraints { a, b }))
}

/// Converts per-block active row indices into indices of the stacked joint
/// constraint system.
///
/// Each block's rows are offset by the number of constraint rows of all
/// preceding blocks. Indices outside a block's row count are dropped.
/// Returns `None` when the two slices have different lengths or when no
/// active index survives.
fn flatten_joint_active_set(
    block_active_sets: &[Option<Vec<usize>>],
    block_constraints: &[Option<LinearInequalityConstraints>],
) -> Option<Vec<usize>> {
    if block_active_sets.len() != block_constraints.len() {
        return None;
    }
    let mut joint = Vec::new();
    let mut row_base = 0usize;
    for (active_opt, constraints_opt) in block_active_sets.iter().zip(block_constraints) {
        let n_rows = match constraints_opt {
            Some(constraints) => constraints.a.nrows(),
            None => 0,
        };
        for &idx in active_opt.iter().flatten() {
            if idx < n_rows {
                joint.push(row_base + idx);
            }
        }
        row_base += n_rows;
    }
    (!joint.is_empty()).then_some(joint)
}

/// Splits joint active row indices back into per-block local indices.
///
/// Block `k` owns the joint rows that follow the rows of all earlier blocks.
/// Each returned entry is the sorted, de-duplicated list of local rows that
/// fall in that block's band, or `None` when the block has no constraint rows
/// or none of its rows are active.
fn scatter_joint_active_set(
    joint_active: &[usize],
    block_constraints: &[Option<LinearInequalityConstraints>],
) -> Vec<Option<Vec<usize>>> {
    let mut row_base = 0usize;
    block_constraints
        .iter()
        .map(|constraints_opt| {
            let n_rows = constraints_opt.as_ref().map_or(0, |c| c.a.nrows());
            if n_rows == 0 {
                return None;
            }
            let band = row_base..row_base + n_rows;
            row_base = band.end;
            let mut local: Vec<usize> = joint_active
                .iter()
                .filter(|idx| band.contains(*idx))
                .map(|&idx| idx - band.start)
                .collect();
            if local.is_empty() {
                return None;
            }
            local.sort_unstable();
            local.dedup();
            Some(local)
        })
        .collect()
}

/// Gathers the **active rows** of every block's inequality system into one
/// dense `(k_active × total_p)` matrix for the unified evaluator's
/// constraint-aware kernel.
///
/// Arguments:
/// * `block_constraints`: the dense per-block `LinearInequalityConstraints`
///   (as produced by `collect_block_linear_constraints`).
/// * `block_active_sets`: per-block indices of the rows currently active.
/// * `ranges`: per-block column ranges inside the joint β.
/// * `total_p`: width of the joint β.
///
/// Rows are emitted block by block, in the order listed in each active set,
/// with each block's coefficients placed in its own column range and zeros
/// elsewhere.
///
/// Returns `None` when the slice lengths disagree, when a block's constraint
/// width does not match its range, when an active index is out of bounds, or
/// when no block has any active row — in the last case the caller can skip
/// the constraint-aware kernel entirely.
fn assemble_active_constraint_block(
    block_constraints: &[Option<LinearInequalityConstraints>],
    block_active_sets: &[Option<Vec<usize>>],
    ranges: &[(usize, usize)],
    total_p: usize,
) -> Option<crate::solver::estimate::reml::unified::ActiveLinearConstraintBlock> {
    let n_blocks = ranges.len();
    if block_constraints.len() != n_blocks || block_active_sets.len() != n_blocks {
        return None;
    }

    // Pass 1: validate and remember the blocks that contribute rows.
    let mut contributing: Vec<(usize, &[usize], &LinearInequalityConstraints)> = Vec::new();
    let mut total_active = 0usize;
    for b in 0..n_blocks {
        let (Some(constraints), Some(active)) =
            (block_constraints[b].as_ref(), block_active_sets[b].as_ref())
        else {
            continue;
        };
        if active.is_empty() {
            continue;
        }
        let (start, end) = ranges[b];
        if constraints.a.ncols() != end - start {
            return None;
        }
        let n_rows = constraints.a.nrows();
        if active.iter().any(|&r| r >= n_rows) {
            return None;
        }
        total_active += active.len();
        contributing.push((start, active.as_slice(), constraints));
    }
    if total_active == 0 {
        return None;
    }

    // Pass 2: copy each active row into its block's column band.
    let mut a = ndarray::Array2::<f64>::zeros((total_active, total_p));
    let mut out_row = 0usize;
    for (start, active, constraints) in contributing {
        for &local_row in active {
            for (col, &value) in constraints.a.row(local_row).iter().enumerate() {
                a[[out_row, start + col]] = value;
            }
            out_row += 1;
        }
    }
    Some(crate::solver::estimate::reml::unified::ActiveLinearConstraintBlock { a })
}

/// Per-coefficient lower-bound view of a constraint system in which every row
/// involves exactly one coefficient with a positive multiplier (built by
/// `extract_simple_lower_bounds`).
struct SimpleLowerBounds {
    /// Tightest lower bound per coefficient; `f64::NEG_INFINITY` where no row
    /// bounds that coefficient.
    lower_bounds: Array1<f64>,
    /// For each constraint row, the index of the coefficient that row bounds.
    row_to_coeff: Vec<usize>,
    /// For each coefficient, the constraint row that supplied its tightest
    /// bound, or `None` when no row raised the bound above `-∞`.
    coeff_to_row: Vec<Option<usize>>,
}

/// Tries to read a constraint system as plain per-coefficient lower bounds.
///
/// A row qualifies when exactly one entry has magnitude above `1e-12` and
/// that entry is positive; the implied bound is `b[row] / entry`. When
/// several rows bound the same coefficient the largest bound wins (the
/// earliest row on ties). Returns `Ok(None)` as soon as any row does not
/// qualify, so the caller can fall back to a general solver.
///
/// # Errors
/// Fails when `A` does not have `p` columns or one row per entry of `b`.
fn extract_simple_lower_bounds(
    constraints: &LinearInequalityConstraints,
    p: usize,
) -> Result<Option<SimpleLowerBounds>, String> {
    let n_rows = constraints.a.nrows();
    if constraints.a.ncols() != p || n_rows != constraints.b.len() {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: "linear constraints: shape mismatch".to_string(),
        }
        .into());
    }

    let mut lower_bounds = Array1::from_elem(p, f64::NEG_INFINITY);
    let mut coeff_to_row = vec![None; p];
    let mut row_to_coeff = Vec::with_capacity(n_rows);
    for row in 0..n_rows {
        // Written as `!(… <= …)` on purpose: a NaN entry must count as
        // non-zero, which `… > …` would not do.
        let mut nonzero_cols =
            (0..p).filter(|&col| !(constraints.a[[row, col]].abs() <= 1e-12));
        // Exactly one non-negligible entry is required.
        let (Some(col), None) = (nonzero_cols.next(), nonzero_cols.next()) else {
            return Ok(None);
        };
        let coeff_value = constraints.a[[row, col]];
        if coeff_value <= 0.0 {
            return Ok(None);
        }
        let bound = constraints.b[row] / coeff_value;
        if bound > lower_bounds[col] {
            lower_bounds[col] = bound;
            coeff_to_row[col] = Some(row);
        }
        row_to_coeff.push(col);
    }

    Ok(Some(SimpleLowerBounds {
        lower_bounds,
        row_to_coeff,
        coeff_to_row,
    }))
}

/// Maps active constraint rows to the coefficients they bound.
///
/// Rows outside `bounds.row_to_coeff` are ignored. The result is sorted and
/// free of duplicates; it is empty when `active_rows` is `None`.
fn lower_bound_active_rows_to_coeffs(
    bounds: &SimpleLowerBounds,
    active_rows: Option<&[usize]>,
) -> Vec<usize> {
    let mut coeffs: Vec<usize> = active_rows
        .unwrap_or(&[])
        .iter()
        .filter_map(|&row| bounds.row_to_coeff.get(row).copied())
        .collect();
    coeffs.sort_unstable();
    coeffs.dedup();
    coeffs
}

/// Maps active coefficients back to the constraint rows recorded for them in
/// `bounds.coeff_to_row`.
///
/// Coefficients that are out of range or have no recorded row are skipped.
/// The result is sorted and free of duplicates.
fn lower_bound_active_coeffs_to_rows(
    bounds: &SimpleLowerBounds,
    active_coeffs: &[usize],
) -> Vec<usize> {
    let mut rows = Vec::with_capacity(active_coeffs.len());
    for &coeff in active_coeffs {
        if let Some(&Some(row)) = bounds.coeff_to_row.get(coeff) {
            rows.push(row);
        }
    }
    rows.sort_unstable();
    rows.dedup();
    rows
}

/// Lists, in increasing order, the coefficients of `beta` that sit on their
/// finite lower bound.
///
/// A coefficient counts as active when `beta[i] <= lower + tol` with
/// `tol = 1e-6 * max(|beta[i]|, |lower|, 1) + 1e-10`. Coefficients without a
/// finite bound are never active.
fn lower_bound_active_coeffs_from_solution(
    bounds: &SimpleLowerBounds,
    beta: &Array1<f64>,
) -> Vec<usize> {
    (0..beta.len())
        .filter(|&coeff| {
            let lower = bounds.lower_bounds[coeff];
            if !lower.is_finite() {
                return false;
            }
            let value = beta[coeff];
            let scale = value.abs().max(lower.abs()).max(1.0);
            let tol = 1e-6 * scale + 1e-10;
            value <= lower + tol
        })
        .collect()
}

/// Raises every entry of `beta` that lies below its finite lower bound up to
/// that bound; entries with a non-finite bound are left untouched.
fn project_to_lower_bounds(beta: &mut Array1<f64>, lower_bounds: &Array1<f64>) {
    for (i, value) in beta.iter_mut().enumerate() {
        let lower = lower_bounds[i];
        if lower.is_finite() && *value < lower {
            *value = lower;
        }
    }
}

/// Solves the bound-constrained quadratic defined by `lhs` and `rhs`,
/// starting from `beta_start`.
///
/// The gradient `lhs·beta_start − rhs` is handed to
/// `solve_newton_directionwith_lower_bounds` together with the coefficients
/// implied by `active_rows`. The resulting step is applied, the new
/// coefficients are clamped onto the bounds, and the active set is then
/// re-derived from the clamped solution (not taken from the solver) and
/// reported as constraint rows.
///
/// # Errors
/// Wraps a solver failure as `"lower-bound Newton solve failed: …"`.
fn solve_quadratic_with_simple_lower_bounds(
    lhs: &Array2<f64>,
    rhs: &Array1<f64>,
    beta_start: &Array1<f64>,
    bounds: &SimpleLowerBounds,
    active_rows: Option<&[usize]>,
) -> Result<(Array1<f64>, Vec<usize>), String> {
    let gradient = lhs.dot(beta_start) - rhs;
    let mut seeded_active = lower_bound_active_rows_to_coeffs(bounds, active_rows);
    let mut step = Array1::zeros(beta_start.len());
    solve_newton_directionwith_lower_bounds(
        lhs,
        &gradient,
        beta_start,
        &bounds.lower_bounds,
        &mut step,
        Some(&mut seeded_active),
    )
    .map_err(|e| format!("lower-bound Newton solve failed: {e}"))?;

    let mut beta_new = beta_start + &step;
    project_to_lower_bounds(&mut beta_new, &bounds.lower_bounds);

    let coeffs_at_bound = lower_bound_active_coeffs_from_solution(bounds, &beta_new);
    let rows_at_bound = lower_bound_active_coeffs_to_rows(bounds, &coeffs_at_bound);
    Ok((beta_new, rows_at_bound))
}

/// Canonicalizes an active set: sorted ascending, duplicates removed, and an
/// empty set represented as `None`.
fn normalize_active_set(mut active_set: Vec<usize>) -> Option<Vec<usize>> {
    active_set.sort_unstable();
    active_set.dedup();
    (!active_set.is_empty()).then_some(active_set)
}

fn normalize_active_sets(active_sets: Vec<Option<Vec<usize>>>) -> Vec<Option<Vec<usize>>> {
    active_sets
        .into_iter()
        .map(|active_set| active_set.and_then(normalize_active_set))
        .collect()
}

/// Borrowed inputs shared by every `ParameterBlockUpdater` when computing the
/// update for one parameter block.
struct BlockUpdateContext<'a> {
    /// Family being fitted; consulted for block geometry and for the
    /// strict-SPD exact-Newton switch.
    family: &'a dyn CustomFamily,
    /// Current state of every block; `states[block_idx].beta` holds the
    /// coefficients the update starts from.
    states: &'a [ParameterBlockState],
    /// Specification (design matrix, name) of the block being updated.
    spec: &'a ParameterBlockSpec,
    /// Index of the block being updated within `states`.
    block_idx: usize,
    /// Penalty matrix added to the block's left-hand side.
    s_lambda: &'a Array2<f64>,
    /// Fit options; the updaters read `minweight`, `ridge_floor` and
    /// `ridge_policy`.
    options: &'a BlockwiseFitOptions,
    /// Linear inequality constraints on this block, if any.
    linear_constraints: Option<&'a LinearInequalityConstraints>,
    /// Active constraint rows forwarded to the constrained solvers
    /// (presumably from a previous solve, as a warm start — confirm with caller).
    cached_active_set: Option<&'a [usize]>,
}

/// Outcome of one block update step.
struct BlockUpdateResult {
    /// Proposed new coefficients for the block, exactly as produced by the
    /// solve.
    beta_new_raw: Array1<f64>,
    /// Sorted, de-duplicated active constraint rows from a constrained solve;
    /// `None` for unconstrained solves or when no row is active.
    active_set: Option<Vec<usize>>,
}

/// Returns a copy of the working weights in which non-positive weights become
/// exactly `0.0` and every other weight is raised to at least `minweight`.
#[inline]
fn floor_positiveworking_weights(working_weights: &Array1<f64>, minweight: f64) -> Array1<f64> {
    // Per-element rule, kept as a plain function so it can run inside the
    // parallel zip below.
    fn floor_one(weight: f64, minweight: f64) -> f64 {
        if weight <= 0.0 {
            0.0
        } else {
            weight.max(minweight)
        }
    }

    let mut floored = Array1::<f64>::zeros(working_weights.len());
    ndarray::Zip::from(&mut floored)
        .and(working_weights)
        .par_for_each(|slot, &weight| *slot = floor_one(weight, minweight));
    floored
}

/// Strategy for computing the coefficient update of a single parameter block.
trait ParameterBlockUpdater {
    /// Computes the proposed new coefficients (and, for constrained blocks,
    /// the active constraint rows) for the block described by `ctx`.
    ///
    /// Errors are reported as human-readable strings.
    fn compute_update_step(
        &self,
        ctx: &BlockUpdateContext<'_>,
    ) -> Result<BlockUpdateResult, String>;
}

/// Updater for blocks described by a diagonal working set: a working response
/// and working weights with one entry per design row.
struct DiagonalBlockUpdater<'a> {
    /// Working response, one entry per design row.
    working_response: &'a Array1<f64>,
    /// Working weights, one entry per design row.
    working_weights: &'a Array1<f64>,
}

impl ParameterBlockUpdater for DiagonalBlockUpdater<'_> {
    /// Penalized weighted least-squares update for a diagonal working set.
    ///
    /// Weights are first floored with `floor_positiveworking_weights`. Without
    /// constraints the block is solved through `solve_systemwith_policy`. With
    /// constraints, the current coefficients must be feasible (tolerance
    /// `1e-8`); the normal equations plus penalty are then solved either by
    /// the simple lower-bound solver (when the constraints reduce to
    /// per-coefficient bounds) or by the general linear-constraint QP.
    ///
    /// # Errors
    /// Reports working-set size mismatches, an infeasible starting iterate,
    /// and any solver failure, each prefixed with the block index and name
    /// where the original message carries them.
    fn compute_update_step(
        &self,
        ctx: &BlockUpdateContext<'_>,
    ) -> Result<BlockUpdateResult, String> {
        let n_obs = ctx.spec.design.nrows();
        let lengths_agree =
            self.working_response.len() == n_obs && self.working_weights.len() == n_obs;
        if !lengths_agree {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "family diagonal working-set size mismatch on block {} ({})",
                    ctx.block_idx, ctx.spec.name
                ),
            }
            .into());
        }

        // Zero-weight observations are semantically excluded and must stay inactive.
        let weights = floor_positiveworking_weights(self.working_weights, ctx.options.minweight);

        let Some(constraints) = ctx.linear_constraints else {
            // Unconstrained block: direct penalized solve.
            return with_block_geometry(ctx.family, ctx.states, ctx.spec, ctx.block_idx, |x, off| {
                // The offset is subtracted while forming the weighted response,
                // so no copy of the working response is needed.
                let n = self.working_response.len();
                let weighted_response = Array1::from_shape_fn(n, |i| {
                    (self.working_response[i] - off[i]) * weights[i].max(0.0)
                });
                let xtwy = x.transpose_vector_multiply(&weighted_response);
                let solved = x.solve_systemwith_policy(
                    &weights,
                    &xtwy,
                    Some(ctx.s_lambda),
                    ctx.options.ridge_floor,
                    ctx.options.ridge_policy,
                );
                let beta =
                    solved.map_err(|_| "block solve failed after ridge retries".to_string())?;
                Ok(BlockUpdateResult {
                    beta_new_raw: beta,
                    active_set: None,
                })
            });
        };

        // Constrained block: the starting point must already be feasible.
        let current_beta = &ctx.states[ctx.block_idx].beta;
        check_linear_feasibility(current_beta, constraints, 1e-8).map_err(|e| {
            format!(
                "block {} ({}) constrained diagonal solve: {e}",
                ctx.block_idx, ctx.spec.name
            )
        })?;

        with_block_geometry(ctx.family, ctx.states, ctx.spec, ctx.block_idx, |x, off| {
            let mut y_star = self.working_response.clone();
            y_star -= off;
            let (mut lhs, rhs_opt) = weighted_normal_equations(x, &weights, Some(&y_star))?;
            let rhs = rhs_opt
                .ok_or_else(|| "missing weighted RHS in constrained diagonal solve".to_string())?;
            lhs += ctx.s_lambda;

            let lower_bounds = extract_simple_lower_bounds(constraints, lhs.ncols())?;
            let solved = match lower_bounds.as_ref() {
                Some(bounds) => solve_quadratic_with_simple_lower_bounds(
                    &lhs,
                    &rhs,
                    &ctx.states[ctx.block_idx].beta,
                    bounds,
                    ctx.cached_active_set,
                ),
                None => solve_quadratic_with_linear_constraints(
                    &lhs,
                    &rhs,
                    &ctx.states[ctx.block_idx].beta,
                    constraints,
                    ctx.cached_active_set,
                )
                .map_err(|e| e.to_string()),
            };
            let (beta_constrained, active_set) = solved.map_err(|e| {
                format!(
                    "block {} ({}) constrained diagonal solve failed: {e}",
                    ctx.block_idx, ctx.spec.name
                )
            })?;
            Ok(BlockUpdateResult {
                beta_new_raw: beta_constrained,
                active_set: normalize_active_set(active_set),
            })
        })
    }
}

/// Updater for blocks described by an exact-Newton working set: a gradient
/// vector and a symmetric Hessian, both sized to the block's coefficient count.
struct ExactNewtonBlockUpdater<'a> {
    /// Block gradient; the Newton right-hand side is `gradient − S·β`.
    gradient: &'a Array1<f64>,
    /// Block Hessian; the penalty matrix is added to it to form the
    /// left-hand side.
    hessian: &'a SymmetricMatrix,
}

impl ParameterBlockUpdater for ExactNewtonBlockUpdater<'_> {
    /// Exact-Newton update for one block.
    ///
    /// Forms `lhs = hessian + S` and `rhs_step = gradient − S·β`, stabilizes
    /// `lhs` (a no-op for strict-SPD families), and then:
    ///
    /// * with constraints — checks feasibility of the current β (tolerance
    ///   `1e-8`) and solves either the simple lower-bound problem in β-space
    ///   or the general linear-constraint QP in δ-space;
    /// * without constraints — solves for the Newton step δ with the LM
    ///   δ-ridge continuation. Strict-SPD families propagate a continuation
    ///   failure; other families fall back to a diagonal-scaled
    ///   steepest-descent step and log a warning.
    ///
    /// # Errors
    /// Reports gradient/Hessian shape mismatches, an infeasible starting
    /// iterate, constrained-solver failures, and (strict mode only) a failed
    /// LM continuation.
    fn compute_update_step(
        &self,
        ctx: &BlockUpdateContext<'_>,
    ) -> Result<BlockUpdateResult, String> {
        let p = ctx.spec.design.ncols();
        if self.gradient.len() != p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "block {} exact-newton gradient length mismatch: got {}, expected {p}",
                    ctx.block_idx,
                    self.gradient.len()
                ),
            }
            .into());
        }
        if self.hessian.nrows() != p || self.hessian.ncols() != p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "block {} exact-newton Hessian shape mismatch: got {}x{}, expected {}x{}",
                    ctx.block_idx,
                    self.hessian.nrows(),
                    self.hessian.ncols(),
                    p,
                    p
                ),
            }
            .into());
        }

        let lhs = self.hessian.add_dense(ctx.s_lambda)?;
        // Solve in delta-space for both constrained and unconstrained blocks.
        // That keeps the linear system consistent even when we add a
        // numerical ridge to stabilize an indefinite exact-Newton Hessian.
        let rhs_step = self.gradient - &ctx.s_lambda.dot(&ctx.states[ctx.block_idx].beta);
        let mut lhs_dense = lhs.to_dense();
        stabilize_exact_newton_lhs_in_place(ctx.family, &mut lhs_dense, ctx.options.ridge_floor);

        if let Some(constraints) = ctx.linear_constraints {
            check_linear_feasibility(&ctx.states[ctx.block_idx].beta, constraints, 1e-8).map_err(
                |e| {
                    format!(
                        "block {} ({}) constrained exact-newton solve: {e}",
                        ctx.block_idx, ctx.spec.name
                    )
                },
            )?;
            let lower_bounds = extract_simple_lower_bounds(constraints, p).map_err(|e| {
                format!(
                    "block {} ({}) constrained exact-newton solve: {e}",
                    ctx.block_idx, ctx.spec.name
                )
            })?;
            let (beta_new_raw, active_set) = if let Some(bounds) = lower_bounds.as_ref() {
                // The lower-bound solver works in β-space: it recomputes the
                // gradient as `lhs·β − rhs`, so feed it `rhs = lhs·β + rhs_step`
                // to recover `−rhs_step` with the (possibly ridged) `lhs`.
                let rhs_beta = &lhs_dense.dot(&ctx.states[ctx.block_idx].beta) + &rhs_step;
                solve_quadratic_with_simple_lower_bounds(
                    &lhs_dense,
                    &rhs_beta,
                    &ctx.states[ctx.block_idx].beta,
                    bounds,
                    ctx.cached_active_set,
                )
            } else {
                let delta_constraints =
                    shift_linear_constraints_to_delta(constraints, &ctx.states[ctx.block_idx].beta)
                        .map_err(|e| {
                            format!(
                                "block {} ({}) constrained exact-newton solve: {e}",
                                ctx.block_idx, ctx.spec.name
                            )
                        })?;
                let delta_start = Array1::zeros(p);
                let (delta, active_set) = solve_quadratic_with_linear_constraints(
                    &lhs_dense,
                    &rhs_step,
                    &delta_start,
                    &delta_constraints,
                    ctx.cached_active_set,
                )
                .map_err(|e| e.to_string())?;
                Ok((&ctx.states[ctx.block_idx].beta + &delta, active_set))
            }
            .map_err(|e| {
                format!(
                    "block {} ({}) constrained exact-newton solve failed: {e}",
                    ctx.block_idx, ctx.spec.name
                )
            })?;
            Ok(BlockUpdateResult {
                beta_new_raw,
                active_set: normalize_active_set(active_set),
            })
        } else {
            // Solve for the Newton step, not the next beta directly.
            //
            // For the penalized negative objective
            //
            //   Q(beta) = -log L(beta) + 0.5 beta^T S beta,
            //
            // the exact block gradient and Hessian are
            //
            //   grad_Q = S beta - gradient,
            //   hess_Q = hessian + S.
            //
            // The Newton step must therefore satisfy
            //
            //   hess_Q * delta = -grad_Q = gradient - S beta.
            //
            // This form stays correct even when the linear solver adds a
            // numerical ridge to the left-hand side to stabilize an indefinite
            // or nearly singular block. Solving directly for `beta_new` with a
            // ridged matrix would require an extra `ridge * beta` term on the
            // right-hand side; without it the step is distorted, which can trap
            // exact-Newton block updates on nonconvex blocks such as survival
            // `log_sigma`.
            let delta = if use_exact_newton_strict_spd(ctx.family) {
                // Strict-mode Newton step uses the LM δ-ridge continuation:
                // a single near-zero eigenvalue from numerical noise in
                // H_β should not bounce the entire seed evaluation. The
                // bare strict_solve_spd contract is preserved (still used
                // by other paths and the existing test
                // `pseudo_laplace_path_skips_eigendecomposition_avoiding_nan_crash`);
                // here we pay an O(p³) extra Cholesky attempt when needed
                // to keep adaptive optimization moving.
                let (step, lm_stats) =
                    strict_solve_spd_with_lm_continuation(&lhs_dense, &rhs_step)?;
                if lm_stats.escalations > 0 {
                    log::debug!(
                        "[strict-spd-lm] block={} ({}): δ-ridge continuation succeeded \
                         after {} escalation(s) at δ={:.3e}",
                        ctx.block_idx,
                        ctx.spec.name,
                        lm_stats.escalations,
                        lm_stats.delta_used,
                    );
                }
                step
            } else {
                // Non-strict (RidgedQuadraticReml) families share the strict
                // path's LM δ-ridge continuation. For a nonconvex block whose
                // likelihood Hessian H_β is INDEFINITE away from the optimum —
                // e.g. the squared-coefficient SCOP transformation-normal tensor
                // over a smooth covariate, where the I(y)⊗b(x) columns are
                // strongly collinear — the previous `solve_spd_systemwith_policy`
                // (ridge-retry + pinv-positive-part) returns a valid but
                // poorly-scaled descent step that crawls and hits the inner
                // cycle cap. The eigenvalue-floored LM continuation produces a
                // well-scaled Newton step on exactly those indefinite /
                // ill-conditioned systems. It is a STRICT SUPERSET of the plain
                // solve: when H_β + S is SPD and well-conditioned it reduces to
                // the same Cholesky step (zero escalations), only escalating the
                // floor when the system is genuinely indefinite — so
                // well-behaved families see no behaviour change. Internal to the
                // solve; β is recovered in the raw basis, so dimensionality /
                // identifiability are untouched.
                match strict_solve_spd_with_lm_continuation(&lhs_dense, &rhs_step) {
                    Ok((step, lm_stats)) => {
                        if lm_stats.escalations > 0 {
                            log::debug!(
                                "[joint-Newton/lm] block={} ({}): non-strict δ-ridge continuation \
                                 succeeded after {} escalation(s) at δ={:.3e}",
                                ctx.block_idx,
                                ctx.spec.name,
                                lm_stats.escalations,
                                lm_stats.delta_used,
                            );
                        }
                        step
                    }
                    // Final guard: only if the LM continuation itself fails to
                    // produce a finite step do we fall back to the diagonal-
                    // scaled steepest-descent direction (always finite when the
                    // gradient is finite). The failure is logged so the
                    // degraded step is not silent.
                    Err(err) => {
                        log::warn!(
                            "[joint-Newton/lm] block={} ({}): δ-ridge continuation failed ({err}); \
                             falling back to diagonal-scaled steepest descent",
                            ctx.block_idx,
                            ctx.spec.name,
                        );
                        (0..lhs_dense.nrows())
                            .map(|i| {
                                let d = lhs_dense[[i, i]].abs().max(1e-8);
                                rhs_step[i] / d
                            })
                            .collect()
                    }
                }
            };
            let beta = &ctx.states[ctx.block_idx].beta + &delta;
            Ok(BlockUpdateResult {
                beta_new_raw: beta,
                active_set: None,
            })
        }
    }
}

impl BlockWorkingSet {
    /// Builds the boxed `ParameterBlockUpdater` matching this working set's
    /// variant. The updater borrows the variant's fields, so it cannot outlive
    /// `self`.
    fn updater(&self) -> Box<dyn ParameterBlockUpdater + '_> {
        match self {
            Self::ExactNewton { gradient, hessian } => {
                let exact = ExactNewtonBlockUpdater { gradient, hessian };
                Box::new(exact)
            }
            Self::Diagonal {
                working_response,
                working_weights,
            } => {
                let diagonal = DiagonalBlockUpdater {
                    working_response,
                    working_weights,
                };
                Box::new(diagonal)
            }
        }
    }
}

/// Checks that `beta` satisfies the linear inequality system `A β ≥ b` to
/// within `tol`.
///
/// The slack is `A β − b`; a row is violated by `max(−slack, 0)`. The largest
/// violation is compared against `tol`.
///
/// # Errors
///
/// Returns a `ConstraintViolation` message when
/// * the shapes of `A`, `b` and `beta` disagree,
/// * any slack entry is NaN (the iterate or the constraint data is not a
///   number, so feasibility cannot be established), or
/// * the largest violation exceeds `tol`.
fn check_linear_feasibility(
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
    tol: f64,
) -> Result<(), String> {
    if constraints.a.ncols() != beta.len() || constraints.a.nrows() != constraints.b.len() {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: "linear constraints: shape mismatch".to_string(),
        }
        .into());
    }
    let slack = constraints.a.dot(beta) - &constraints.b;

    // `f64::max` ignores a NaN operand, so `(-NaN).max(0.0)` is 0.0 and a NaN
    // slack would otherwise be counted as "no violation". Reject it explicitly.
    // Infinite slack needs no special case: +inf is feasible and -inf already
    // yields an infinite violation below.
    if let Some(nan_row) = slack.iter().position(|s| s.is_nan()) {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: format!(
                "infeasible iterate: constraint slack is NaN at constraint row {nan_row}"
            ),
        }
        .into());
    }

    // Track the first row attaining the strictly largest violation.
    let mut worst = 0.0_f64;
    let mut worst_idx = 0usize;
    for (i, &s) in slack.iter().enumerate() {
        let v = (-s).max(0.0);
        if v > worst {
            worst = v;
            worst_idx = i;
        }
    }
    if worst > tol {
        return Err(CustomFamilyError::ConstraintViolation {
            reason: format!(
                "infeasible iterate: max(Aβ-b violation)={worst:.3e} at constraint row {worst_idx}"
            ),
        }
        .into());
    }
    Ok(())
}

/// Ridge actually used by the solver: `ridge_floor`, but never below `1e-15`.
///
/// Any input that is not strictly greater than the minimum (including
/// negative values and NaN) yields the minimum itself.
#[inline]
fn effective_solverridge(ridge_floor: f64) -> f64 {
    const MIN_SOLVER_RIDGE: f64 = 1e-15;
    if ridge_floor > MIN_SOLVER_RIDGE {
        ridge_floor
    } else {
        MIN_SOLVER_RIDGE
    }
}

/// Quadratic penalty of one block: `½ βᵀ S_λ β`, plus `½ · ridge · βᵀβ` when
/// `ridge_policy.include_quadratic_penalty` is set.
fn block_quadratic_penalty(
    beta: &Array1<f64>,
    s_lambda: &Array2<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> f64 {
    let smoothing_term = 0.5 * beta.dot(&s_lambda.dot(beta));
    if !ridge_policy.include_quadratic_penalty {
        return smoothing_term;
    }
    smoothing_term + 0.5 * ridge * beta.dot(beta)
}

/// Applies the block's penalized Hessian to `direction`.
///
/// The likelihood part is `hessian · direction` for an exact-Newton working
/// set, and `Xᵀ (w ⊙ (X · direction))` with `X = spec.solver_design()` for a
/// diagonal working set. `s_lambda · direction` is then added, followed by
/// `ridge · direction` when the policy includes the ridge in the quadratic
/// penalty and `ridge > 0`.
fn block_penalized_hessian_vector(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    direction: &Array1<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Array1<f64> {
    let likelihood_term = match work {
        BlockWorkingSet::Diagonal {
            working_weights, ..
        } => {
            let design = spec.solver_design();
            let projected = design.matrixvectormultiply(direction);
            let weighted = &projected * working_weights;
            design.transpose_vector_multiply(&weighted)
        }
        BlockWorkingSet::ExactNewton { hessian, .. } => hessian.dot(direction),
    };

    let penalty_term = s_lambda.dot(direction);
    let mut product = likelihood_term;
    product += &penalty_term;

    let ridge_active = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    if ridge_active {
        product.scaled_add(ridge, direction);
    }
    product
}

/// Extracts the main diagonal of a `SymmetricMatrix` as an owned vector.
///
/// For the sparse variant the stored entries of each column are scanned and
/// every entry whose row index equals the column index is accumulated, so
/// repeated diagonal entries are summed.
fn symmetric_matrix_diagonal(matrix: &SymmetricMatrix) -> Array1<f64> {
    let sparse = match matrix {
        SymmetricMatrix::Dense(dense) => return dense.diag().to_owned(),
        SymmetricMatrix::Sparse(sparse) => sparse,
    };

    let n_cols = sparse.ncols();
    let mut diagonal = Array1::<f64>::zeros(n_cols);
    let (symbolic, values) = sparse.parts();
    let col_ptr = symbolic.col_ptr();
    let row_idx = symbolic.row_idx();
    for col in 0..n_cols {
        let (first, past_last) = (col_ptr[col], col_ptr[col + 1]);
        for idx in first..past_last {
            if row_idx[idx] != col {
                continue;
            }
            diagonal[col] += values[idx];
        }
    }
    diagonal
}

/// Diagonal of the block's penalized metric.
///
/// Starts from the diagonal of the exact-Newton Hessian, or from
/// `spec.design.diag_gram(working_weights)` for a diagonal working set, adds
/// the diagonal of `s_lambda`, adds `ridge` when the policy includes it in the
/// quadratic penalty and `ridge > 0`, and finally passes each entry through
/// `positive_joint_diagonal_entry`.
///
/// # Errors
///
/// Fails when the diagonal length differs from the order of `s_lambda`, when
/// `s_lambda` is not square, or when `diag_gram` fails.
fn block_penalized_metric_diagonal(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<Array1<f64>, String> {
    let mut diagonal = match work {
        BlockWorkingSet::Diagonal {
            working_weights, ..
        } => spec.design.diag_gram(working_weights)?,
        BlockWorkingSet::ExactNewton { hessian, .. } => symmetric_matrix_diagonal(hessian),
    };

    let (s_rows, s_cols) = (s_lambda.nrows(), s_lambda.ncols());
    if diagonal.len() != s_rows || s_rows != s_cols {
        return Err(format!(
            "block penalized metric diagonal shape mismatch: diag={}, S={}x{}",
            diagonal.len(),
            s_rows,
            s_cols
        ));
    }

    // The ridge decision does not depend on the entry, so make it once.
    let add_ridge = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let penalty_diag = s_lambda.diag();
    for (entry, &penalty) in diagonal.iter_mut().zip(penalty_diag.iter()) {
        *entry += penalty;
        if add_ridge {
            *entry += ridge;
        }
        *entry = positive_joint_diagonal_entry(*entry);
    }
    Ok(diagonal)
}

/// Norm of `direction` in the block's diagonal penalized metric, computed by
/// `joint_trust_region_metric_step_norm` from the diagonal returned by
/// `block_penalized_metric_diagonal`.
///
/// # Errors
///
/// Propagates failures from building the metric diagonal and rejects a
/// `direction` whose length differs from that diagonal.
fn block_penalized_metric_norm(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    direction: &Array1<f64>,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<f64, String> {
    let metric_diag = block_penalized_metric_diagonal(spec, work, s_lambda, ridge, ridge_policy)?;
    if metric_diag.len() == direction.len() {
        Ok(joint_trust_region_metric_step_norm(direction, &metric_diag))
    } else {
        Err(format!(
            "block penalized metric direction length mismatch: direction={}, diag={}",
            direction.len(),
            metric_diag.len()
        ))
    }
}

/// Scales `delta` back onto the trust-region boundary when its metric norm
/// exceeds `radius`.
///
/// Returns the (possibly rescaled) step together with its metric norm. The
/// step is returned untouched, paired with the computed norm, when that norm
/// is not finite, does not exceed `radius`, or when `radius` is not positive.
fn truncate_block_step_to_metric_radius(
    spec: &ParameterBlockSpec,
    work: &BlockWorkingSet,
    s_lambda: &Array2<f64>,
    delta: Array1<f64>,
    radius: f64,
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<(Array1<f64>, f64), String> {
    let step_norm =
        block_penalized_metric_norm(spec, work, s_lambda, &delta, ridge, ridge_policy)?;
    let exceeds_radius = step_norm.is_finite() && step_norm > radius && radius > 0.0;
    if !exceeds_radius {
        return Ok((delta, step_norm));
    }
    let shrink = radius / step_norm;
    Ok((&delta * shrink, radius))
}

// Fewest paired blocks for which the per-block penalties are split across
// Rayon workers.
const TOTAL_QUADRATIC_PENALTY_PAR_MIN_BLOCKS: usize = 4;
// Estimated dense mat-vec work in βᵀSβ (summed over blocks) that must be
// reached before parallelising; below it Rayon's overhead is not worth paying
// for a handful of tiny blocks.
const TOTAL_QUADRATIC_PENALTY_PAR_MIN_DENSE_WORK: usize = 16_384;

/// Decides whether the per-block quadratic penalties should be evaluated in
/// parallel.
///
/// Returns `false` when there are too few paired blocks or Rayon reports at
/// most one thread. Otherwise accumulates `min(|β|, cols(S)) · rows(S)` per
/// block with saturating arithmetic and returns `true` as soon as the running
/// total reaches the dense-work threshold.
fn total_quadratic_penalty_parallel_worthwhile(
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
) -> bool {
    let paired_blocks = states.len().min(s_lambdas.len());
    let too_few_blocks = paired_blocks < TOTAL_QUADRATIC_PENALTY_PAR_MIN_BLOCKS;
    if too_few_blocks || rayon::current_num_threads() <= 1 {
        return false;
    }

    let mut dense_work = 0usize;
    for (state, s_lambda) in states.iter().zip(s_lambdas.iter()) {
        let cols = state.beta.len().min(s_lambda.ncols());
        let block_work = cols.saturating_mul(s_lambda.nrows());
        dense_work = dense_work.saturating_add(block_work);
        if dense_work >= TOTAL_QUADRATIC_PENALTY_PAR_MIN_DENSE_WORK {
            return true;
        }
    }
    false
}

/// Total quadratic penalty across all blocks.
///
/// Sums `block_quadratic_penalty` over the paired `(state, s_lambda)` entries,
/// in parallel when `total_quadratic_penalty_parallel_worthwhile` says so, and
/// adds the joint full-width penalty `bundle.quadratic(β_flat)` when both a
/// non-empty bundle and the block specs are supplied.
fn total_quadratic_penalty(
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
    specs: Option<&[ParameterBlockSpec]>,
) -> f64 {
    // Shared by the parallel and the sequential path.
    let block_term = |state: &ParameterBlockState, s_lambda: &Array2<f64>| -> f64 {
        block_quadratic_penalty(&state.beta, s_lambda, ridge, ridge_policy)
    };

    let run_parallel = total_quadratic_penalty_parallel_worthwhile(states, s_lambdas);
    let per_block: f64 = if run_parallel {
        use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};

        states
            .par_iter()
            .zip(s_lambdas.par_iter())
            .map(|(state, s_lambda)| block_term(state, s_lambda))
            .reduce(|| 0.0, |left, right| left + right)
    } else {
        states
            .iter()
            .zip(s_lambdas.iter())
            .map(|(state, s_lambda)| block_term(state, s_lambda))
            .sum()
    };

    // The joint term needs both the bundle and the specs; an empty bundle
    // contributes nothing and skips flattening β altogether.
    let joint = joint_full_width
        .zip(specs)
        .filter(|(bundle, _)| !bundle.is_empty())
        .map_or(0.0, |(bundle, specs)| {
            let beta_flat = flatten_state_betas(states, specs);
            bundle.quadratic(beta_flat.view())
        });

    per_block + joint
}

/// Reports the first non-finite entry of a Hessian using the canonical
/// "smooth-regularized logdet" error wording.
///
/// Every site that refuses to factor or iterate on a non-finite Hessian (the
/// logdet computation itself, and the inner-fit entry where exact-Newton
/// block Hessians arrive from the family) shares this one message, so callers
/// and tests can recognise the event regardless of where it was caught. When
/// `block` is given, the block index is appended to the message.
///
/// Returns `Ok(())` when every entry is finite.
fn smooth_regularized_logdet_hessian_finite_check(
    matrix: &Array2<f64>,
    block: Option<usize>,
) -> Result<(), String> {
    for ((row, col), &value) in matrix.indexed_iter() {
        if value.is_finite() {
            continue;
        }
        let block_context = block
            .map(|b| format!(" for block {b}"))
            .unwrap_or_default();
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!(
                "smooth-regularized logdet Hessian contains non-finite entry at ({row}, {col}): {value}{block_context}"
            ),
        }
        .into());
    }
    Ok(())
}

/// Ensures every exact-Newton block Hessian in a family evaluation is finite.
///
/// Diagonal working sets are skipped. Dense Hessians are checked through
/// `smooth_regularized_logdet_hessian_finite_check`; sparse Hessians are
/// checked over their stored values. The first non-finite entry produces the
/// canonical smooth-regularized logdet message with the offending block index
/// appended.
///
/// Exact-Newton Hessians are the family's analytic second derivative of the
/// log-likelihood, so a non-finite entry means that derivative is invalid.
/// Rejecting it at the family-evaluation boundary stops the inner solver from
/// iterating on a poisoned Hessian instead of appearing to converge.
fn validate_block_hessians_finite(eval: &FamilyEvaluation) -> Result<(), String> {
    for (b, working_set) in eval.blockworking_sets.iter().enumerate() {
        match working_set {
            BlockWorkingSet::Diagonal { .. } => continue,
            BlockWorkingSet::ExactNewton {
                hessian: SymmetricMatrix::Dense(dense),
                ..
            } => {
                smooth_regularized_logdet_hessian_finite_check(dense, Some(b))?;
            }
            BlockWorkingSet::ExactNewton {
                hessian: SymmetricMatrix::Sparse(sparse),
                ..
            } => {
                let (symbolic, values) = sparse.parts();
                let col_ptr = symbolic.col_ptr();
                let row_idx = symbolic.row_idx();
                for col in 0..sparse.ncols() {
                    for idx in col_ptr[col]..col_ptr[col + 1] {
                        let (row, value) = (row_idx[idx], values[idx]);
                        if value.is_finite() {
                            continue;
                        }
                        return Err(CustomFamilyError::NumericalFailure {
                            reason: format!(
                                "smooth-regularized logdet Hessian contains non-finite entry at ({row}, {col}): {value} for block {b}"
                            ),
                        }
                        .into());
                    }
                }
            }
        }
    }
    Ok(())
}

/// Log-determinant of a symmetrized, ridge-shifted copy of `matrix`, computed
/// according to the resolved ridge determinant mode.
///
/// The ridge added to the diagonal is `effective_solverridge(ridge_floor)`
/// when `ridge_policy.include_penalty_logdet` is set, and `0.0` otherwise.
///
/// * `Full`: `2 Σ ln(diag(L))` from a lower Cholesky factor; a failed
///   factorization is an error.
/// * `PositivePart`: the smooth-regularized spectral sum described in the
///   body, computed after rejecting non-finite entries.
/// * `Auto`: reported as an internal error, because
///   `resolved_ridge_determinant_mode` is expected to return a concrete mode.
fn stable_logdet_with_ridge_policy(
    matrix: &Array2<f64>,
    ridge_floor: f64,
    ridge_policy: RidgePolicy,
) -> Result<f64, String> {
    let mut shifted = matrix.clone();
    symmetrize_dense_in_place(&mut shifted);
    let dim = shifted.nrows();
    let ridge = if !ridge_policy.include_penalty_logdet {
        0.0
    } else {
        effective_solverridge(ridge_floor)
    };
    for i in 0..dim {
        shifted[[i, i]] += ridge;
    }

    // Dispose of the non-spectral modes first; only `PositivePart` falls through.
    match resolved_ridge_determinant_mode(ridge_policy, dim) {
        RidgeDeterminantMode::Auto => {
            return Err(
                "internal: resolved_ridge_determinant_mode must resolve Auto to a concrete mode"
                    .to_string(),
            );
        }
        RidgeDeterminantMode::Full => {
            let chol = shifted.cholesky(Side::Lower).map_err(|_| {
                "cholesky failed while computing full ridge-aware logdet".to_string()
            })?;
            return Ok(2.0 * chol.diag().mapv(f64::ln).sum());
        }
        RidgeDeterminantMode::PositivePart => {}
    }

    smooth_regularized_logdet_hessian_finite_check(&shifted, None)?;

    // Smooth-regularized logdet, kept aligned with the gradient operator
    // (`DenseSpectralOperator` in `Smooth` mode):
    //
    //   log |A|_reg = Σ_j log r_ε(σ_j),   r_ε(σ) = ½(σ + √(σ² + 4ε²))
    //
    // All eigenvalues take part in the sum. r_ε is smooth and strictly
    // positive for every real σ, and agrees numerically with log σ once
    // σ ≫ ε. A negative eigenvalue contributes ≈ log(ε²/|σ|), so an indefinite
    // Hessian yields a finite, differentiable cost instead of a discontinuous
    // positive-part pseudo-determinant. That is the quantity whose analytic
    // gradient the downstream `trace_logdet_gradient = Σ φ'(σ) u^T (dH/dρ) u`
    // evaluates, so cost and gradient stay consistent for the outer line
    // search.
    let evals = match crate::faer_ndarray::FaerEigh::eigh(&shifted, Side::Lower) {
        Ok((evals, _)) => evals,
        Err(eigh_err) => {
            return Err(CustomFamilyError::BasisDecompositionFailed {
                reason: format!(
                    "smooth-regularized logdet eigendecomposition failed: {eigh_err}"
                ),
            }
            .into());
        }
    };
    let spectrum: Vec<f64> = evals.iter().copied().collect();
    let eps =
        spectral_epsilon(&spectrum).max(ridge.max(CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR));

    // Diagnostic only: the regularizer already accounts for indefiniteness.
    let n_negative = spectrum.iter().filter(|&&sigma| sigma < -eps).count();
    if n_negative > 0 {
        log::debug!(
            "[SmoothRegularizedLogdet] Hessian has {n_negative} \
             eigenvalue(s) below -eps={eps:.2e}; r_ε damps them \
             smoothly instead of dropping them."
        );
    }

    let logdet: f64 = spectrum
        .iter()
        .map(|&sigma| spectral_regularize(sigma, eps).ln())
        .sum();
    Ok(logdet)
}

/// Attempts a Cholesky factorization under a geometrically growing diagonal
/// ridge.
///
/// Attempt `k` (zero-indexed) factors `matrix + initial_boost * growth^k * I`.
/// When a factorization succeeds, `on_success(factor, attempt, boost)` decides
/// whether to accept it: `Some(r)` ends the search with
/// `Some((r, boost, attempt))`, while `None` (like a failed factorization)
/// moves on to the next, larger ridge. `None` is returned once `max_attempts`
/// attempts have been used up.
///
/// The helper never probes the un-ridged matrix on its own: the first attempt
/// always uses `initial_boost`, which is only zero if the caller passes `0.0`.
/// Callers wanting a no-ridge probe must issue it explicitly.
fn try_cholesky_with_escalating_ridge<R>(
    matrix: &Array2<f64>,
    initial_boost: f64,
    max_attempts: usize,
    growth: f64,
    mut on_success: impl FnMut(&crate::faer_ndarray::FaerCholeskyFactor, usize, f64) -> Option<R>,
) -> Option<(R, f64, usize)> {
    let dim = matrix.nrows();
    let mut ridge = initial_boost;
    for attempt in 0..max_attempts {
        let mut shifted = matrix.clone();
        if ridge != 0.0 {
            for i in 0..dim {
                shifted[[i, i]] += ridge;
            }
        }
        // A failed factorization and a rejected factor are handled alike.
        let accepted = shifted
            .cholesky(Side::Lower)
            .ok()
            .and_then(|factor| on_success(&factor, attempt, ridge));
        if let Some(result) = accepted {
            return Some((result, ridge, attempt));
        }
        ridge *= growth;
    }
    None
}

/// Fallback for penalty pseudo-logdet when eigendecomposition fails.
///
/// Penalty matrices are PSD by construction (weighted sum of PSD penalties),
/// so the ridged matrix should be SPD.  Uses escalating-ridge Cholesky via
/// the shared `try_cholesky_with_escalating_ridge` helper.
fn penalty_logdet_cholesky_fallback(
    s_ridged: &Array2<f64>,
    existing_ridge: f64,
    block: usize,
    p: usize,
    eigh_err: &str,
) -> Result<f64, String> {
    let diag_scale = s_ridged
        .diag()
        .iter()
        .copied()
        .map(f64::abs)
        .fold(0.0_f64, f64::max)
        .max(1.0);

    const MAX_ATTEMPTS: usize = 6;
    let initial_boost = diag_scale * 1e-8;

    let outcome = try_cholesky_with_escalating_ridge(
        s_ridged,
        initial_boost,
        MAX_ATTEMPTS,
        10.0,
        |chol, attempt, boost| {
            let logdet = 2.0 * chol.diag().mapv(f64::ln).sum();
            if logdet.is_finite() {
                log::warn!(
                    "[PenaltyLogdetFallback] eigendecomposition failed for block {block} \
                     ({eigh_err}); using Cholesky with boosted ridge={:.2e} \
                     (attempt {}/{MAX_ATTEMPTS}, existing_ridge={:.2e}, p={p})",
                    boost + existing_ridge,
                    attempt + 1,
                    existing_ridge,
                );
                Some(logdet)
            } else {
                None
            }
        },
    );

    if let Some((logdet, _, _)) = outcome {
        return Ok(logdet);
    }

    // Mirror the original message: report the ridge that *would* have been
    // applied on the (MAX_ATTEMPTS+1)-th attempt, i.e. initial_boost * 10^MAX_ATTEMPTS.
    let final_boost = initial_boost * 10.0_f64.powi(MAX_ATTEMPTS as i32);
    Err(CustomFamilyError::BasisDecompositionFailed {
        reason: format!(
            "penalty logdet eigendecomposition failed for block {block} ({eigh_err}) and \
         Cholesky fallback also failed after {MAX_ATTEMPTS} attempts \
         (final ridge={:.2e}, p={p})",
            final_boost + existing_ridge,
        ),
    }
    .into())
}

/// Resolves the policy's determinant mode to a concrete one: `Auto` becomes
/// `Full`, every other mode is returned unchanged.
///
/// # Panics
///
/// Panics when `dim + 1` would overflow `usize`.
fn resolved_ridge_determinant_mode(ridge_policy: RidgePolicy, dim: usize) -> RidgeDeterminantMode {
    assert!(dim < usize::MAX, "ridge determinant dimension overflow");
    if matches!(ridge_policy.determinant_mode, RidgeDeterminantMode::Auto) {
        RidgeDeterminantMode::Full
    } else {
        ridge_policy.determinant_mode
    }
}

/// Inverts a (symmetrized copy of a) matrix through Cholesky, retrying with a
/// growing ridge on failure.
///
/// The first attempt adds no ridge. If it fails and `max_retry > 0`, up to
/// `max_retry` further attempts use ridges `baseridge`, `10·baseridge`,
/// `100·baseridge`, … The returned inverse is symmetrized.
///
/// # Errors
///
/// Returns `BasisDecompositionFailed` when every attempt fails.
fn inverse_spdwith_retry(
    matrix: &Array2<f64>,
    baseridge: f64,
    max_retry: usize,
) -> Result<Array2<f64>, String> {
    let mut sym = matrix.clone();
    symmetrize_dense_in_place(&mut sym);
    let dim = sym.nrows();

    // Solve against the identity to obtain the inverse from a factor.
    let invert =
        |chol: &crate::faer_ndarray::FaerCholeskyFactor, _attempt: usize, _boost: f64| {
            let mut inverse = Array2::<f64>::eye(dim);
            chol.solve_mat_in_place(&mut inverse);
            symmetrize_dense_in_place(&mut inverse);
            Some(inverse)
        };

    // Un-ridged probe: a single attempt with a zero boost.
    let unridged = try_cholesky_with_escalating_ridge(&sym, 0.0, 1, 1.0, invert);
    let solved = match unridged {
        Some(hit) => Some(hit),
        // Ridge baseridge * 10^(k-1) for k = 1..=max_retry.
        None if max_retry > 0 => {
            try_cholesky_with_escalating_ridge(&sym, baseridge, max_retry, 10.0, invert)
        }
        None => None,
    };

    match solved {
        Some((inverse, _, _)) => Ok(inverse),
        None => Err(CustomFamilyError::BasisDecompositionFailed {
            reason: "failed to invert SPD system after Cholesky ridge retries".to_string(),
        }
        .into()),
    }
}

/// Symmetrizes a dense matrix in place by delegating to
/// `crate::linalg::matrix::symmetrize_in_place`.
pub(crate) fn symmetrize_dense_in_place(matrix: &mut Array2<f64>) {
    use crate::linalg::matrix::symmetrize_in_place;

    symmetrize_in_place(matrix);
}

/// Verifies that a flattened direction vector has exactly `expected` entries.
///
/// # Errors
///
/// Returns a `DimensionMismatch` message prefixed with `context` when the
/// lengths differ.
fn validate_flat_direction_length(
    direction: &Array1<f64>,
    expected: usize,
    context: &str,
) -> Result<(), String> {
    let actual = direction.len();
    if actual == expected {
        return Ok(());
    }
    Err(CustomFamilyError::DimensionMismatch {
        reason: format!(
            "{context}: direction length mismatch: got {actual}, expected {expected}"
        ),
    }
    .into())
}

/// Does a joint Hessian carry genuine cross-block (off-diagonal) coupling?
///
/// The trait's default `exact_newton_joint_hessian` assembles a strictly
/// block-diagonal matrix (per-block `Xᵀ W X` on the diagonal, zeros off-block).
/// A family that overrides it with the true coupled curvature of a multi-block
/// likelihood (GAMLSS μ-σ, Beta-logit `α`/`β`, Dirichlet K-block via the shared
/// concentration sum, …) necessarily fills in nonzero off-diagonal blocks. This
/// is the only structural signal — independent of any hand-set marker — that
/// distinguishes a trusted coupled joint Hessian from the block-diagonal
/// default. The block boundaries come from the per-block β widths.
fn joint_hessian_has_cross_block_coupling(
    hessian: &Array2<f64>,
    block_states: &[ParameterBlockState],
) -> bool {
    let total = block_states
        .iter()
        .map(|state| state.beta.len())
        .sum::<usize>();
    if hessian.nrows() != total || hessian.ncols() != total {
        // Shape disagreement is handled (loudly) by the symmetrizer/consumers;
        // here we only answer the coupling question and must not claim coupling
        // for a malformed matrix.
        return false;
    }
    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_states.len());
    let mut start = 0usize;
    for state in block_states {
        let end = start + state.beta.len();
        ranges.push((start, end));
        start = end;
    }
    for (a, (ra_start, ra_end)) in ranges.iter().copied().enumerate() {
        for (rb_start, rb_end) in ranges.iter().copied().skip(a + 1) {
            for i in ra_start..ra_end {
                for j in rb_start..rb_end {
                    if hessian[[i, j]] != 0.0 || hessian[[j, i]] != 0.0 {
                        return true;
                    }
                }
            }
        }
    }
    false
}

/// Assembles a block-diagonal joint Hessian from the per-block exact-Newton
/// Hessians of a fresh `family.evaluate(block_states)` call.
///
/// Returns `Ok(None)` when any block's working set is not `ExactNewton`.
///
/// # Errors
///
/// Fails when evaluation fails, when the number of working sets differs from
/// the number of blocks, or when a block Hessian is not `p × p` for that
/// block's β length `p`.
fn exact_newton_joint_hessian_from_exact_blocks<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
) -> Result<Option<Array2<f64>>, String> {
    let evaluation = family.evaluate(block_states)?;
    let working_sets = &evaluation.blockworking_sets;
    if working_sets.len() != block_states.len() {
        return Err(format!(
            "exact_newton_joint_hessian default: working-set count {} != block count {}",
            working_sets.len(),
            block_states.len()
        ));
    }
    let all_exact = working_sets
        .iter()
        .all(|working_set| matches!(working_set, BlockWorkingSet::ExactNewton { .. }));
    if !all_exact {
        return Ok(None);
    }

    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    let mut joint = Array2::<f64>::zeros((total, total));
    let mut offset = 0usize;
    for (block_idx, (state, working_set)) in
        block_states.iter().zip(working_sets.iter()).enumerate()
    {
        let width = state.beta.len();
        let stop = offset + width;
        let hessian = match working_set {
            BlockWorkingSet::ExactNewton { hessian, .. } => hessian,
            // Excluded by the `all_exact` check above; kept as a defensive error.
            BlockWorkingSet::Diagonal { .. } => {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "exact_newton_joint_hessian default: block {block_idx} working set is not ExactNewton after filter"
                    ),
                }
                .into());
            }
        };
        let dense = hessian.to_dense();
        if dense.nrows() != width || dense.ncols() != width {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "exact_newton_joint_hessian default: block {block_idx} Hessian shape {}x{} != expected {width}x{width}",
                    dense.nrows(),
                    dense.ncols()
                ),
            }
            .into());
        }
        joint
            .slice_mut(s![offset..stop, offset..stop])
            .assign(&dense);
        offset = stop;
    }
    Ok(Some(joint))
}

/// Assembles a block-diagonal joint Hessian from the working sets of a fresh
/// `family.evaluate(block_states)` call, using the block specs for widths.
///
/// Exact-Newton blocks contribute their Hessian converted to dense form;
/// diagonal blocks contribute
/// `spec.design.xt_diag_x_signed_op(working_weights)`. This function always
/// returns `Some(..)` on success.
///
/// # Errors
///
/// Fails when state, spec and working-set counts disagree, when a block's β
/// length differs from its design's column count, when evaluation or the
/// weighted Gram product fails, or when a block Hessian has the wrong shape.
fn exact_newton_joint_hessian_from_working_sets<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Result<Option<Array2<f64>>, String> {
    if block_states.len() != specs.len() {
        return Err(format!(
            "exact_newton_joint_hessian_with_specs default: block state count {} != spec count {}",
            block_states.len(),
            specs.len()
        ));
    }
    let evaluation = family.evaluate(block_states)?;
    let working_sets = &evaluation.blockworking_sets;
    if working_sets.len() != block_states.len() {
        return Err(format!(
            "exact_newton_joint_hessian_with_specs default: working-set count {} != block count {}",
            working_sets.len(),
            block_states.len()
        ));
    }

    let total: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    let mut joint = Array2::<f64>::zeros((total, total));
    let mut offset = 0usize;
    let blocks = block_states.iter().zip(specs.iter()).zip(working_sets.iter());
    for (block_idx, ((state, spec), working_set)) in blocks.enumerate() {
        let width = spec.design.ncols();
        if state.beta.len() != width {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "exact_newton_joint_hessian_with_specs default: block {block_idx} beta length {} != design cols {width}",
                    state.beta.len()
                ),
            }
            .into());
        }
        let stop = offset + width;
        let dense = match working_set {
            BlockWorkingSet::Diagonal {
                working_weights, ..
            } => spec
                .design
                .xt_diag_x_signed_op(SignedWeightsView::from_array(working_weights))?,
            BlockWorkingSet::ExactNewton { hessian, .. } => hessian.to_dense(),
        };
        if dense.nrows() != width || dense.ncols() != width {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "exact_newton_joint_hessian_with_specs default: block {block_idx} Hessian shape {}x{} != expected {width}x{width}",
                    dense.nrows(),
                    dense.ncols()
                ),
            }
            .into());
        }
        joint
            .slice_mut(s![offset..stop, offset..stop])
            .assign(&dense);
        offset = stop;
    }
    Ok(Some(joint))
}

/// Block-diagonal aggregator for the joint Hessian's directional derivative.
///
/// For a family whose joint Hessian does not depend on β the result is a zero
/// matrix. Otherwise each block is asked for its own derivative along its
/// slice of `d_beta_flat` through
/// `exact_newton_hessian_directional_derivative`, and the results are placed
/// on the joint diagonal. `Ok(None)` is returned as soon as any block reports
/// `None`.
///
/// # Errors
///
/// Fails when `d_beta_flat` does not match the total β width, when a per-block
/// call fails, or when a per-block derivative has the wrong shape.
fn exact_newton_joint_hessian_directional_derivative_from_blocks<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    validate_flat_direction_length(
        d_beta_flat,
        total,
        "exact_newton_joint_hessian_directional_derivative default",
    )?;
    if !family.exact_newton_joint_hessian_beta_dependent() {
        return Ok(Some(Array2::zeros((total, total))));
    }

    let mut joint = Array2::<f64>::zeros((total, total));
    let mut offset = 0usize;
    for (block_idx, state) in block_states.iter().enumerate() {
        let width = state.beta.len();
        let stop = offset + width;
        let direction_block = d_beta_flat.slice(s![offset..stop]).to_owned();
        let local = match family.exact_newton_hessian_directional_derivative(
            block_states,
            block_idx,
            &direction_block,
        )? {
            Some(local) => local,
            None => return Ok(None),
        };
        if local.nrows() != width || local.ncols() != width {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "exact_newton_joint_hessian_directional_derivative default: block {block_idx} dH shape {}x{} != expected {width}x{width}",
                    local.nrows(),
                    local.ncols()
                ),
            }
            .into());
        }
        joint
            .slice_mut(s![offset..stop, offset..stop])
            .assign(&local);
        offset = stop;
    }
    Ok(Some(joint))
}

/// Block-diagonal aggregator for the joint Hessian's second directional
/// derivative.
///
/// This is the second-order counterpart of
/// `exact_newton_joint_hessian_directional_derivative_from_blocks`. For a
/// β-independent joint Hessian the answer is identically zero. Otherwise each
/// block is asked for `D²H_b[u_b, v_b]` through
/// `exact_newton_hessian_second_directional_derivative`, and the per-block
/// results are placed on the joint diagonal. `Ok(None)` is returned as soon as
/// any block reports `None`.
///
/// An earlier default returned `None` for every β-dependent family, which
/// silently dropped the per-block `d²H` overrides that families such as
/// `OneBlockQuarticExactFamily` provide for the outer Hessian's drift
/// contribution. Aggregating here, as the first-derivative path does, lets
/// outer REML receive the curvature term whenever the per-block method is
/// implemented.
///
/// # Errors
///
/// Fails when either direction does not match the total β width, when a
/// per-block call fails, or when a per-block result has the wrong shape.
fn exact_newton_joint_hessiansecond_directional_derivative_from_blocks<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    d_beta_u_flat: &Array1<f64>,
    d_betav_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    let total: usize = block_states.iter().map(|state| state.beta.len()).sum();
    validate_flat_direction_length(d_beta_u_flat, total, "joint exact-newton d2H u")?;
    validate_flat_direction_length(d_betav_flat, total, "joint exact-newton d2H v")?;
    if !family.exact_newton_joint_hessian_beta_dependent() {
        return Ok(Some(Array2::zeros((total, total))));
    }

    let mut joint = Array2::<f64>::zeros((total, total));
    let mut offset = 0usize;
    for (block_idx, state) in block_states.iter().enumerate() {
        let width = state.beta.len();
        let stop = offset + width;
        let u_slice = d_beta_u_flat.slice(s![offset..stop]).to_owned();
        let v_slice = d_betav_flat.slice(s![offset..stop]).to_owned();
        let local = match family.exact_newton_hessian_second_directional_derivative(
            block_states,
            block_idx,
            &u_slice,
            &v_slice,
        )? {
            Some(local) => local,
            None => return Ok(None),
        };
        if local.nrows() != width || local.ncols() != width {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "exact_newton_joint_hessiansecond_directional_derivative default: block {block_idx} d2H shape {}x{} != expected {width}x{width}",
                    local.nrows(),
                    local.ncols()
                ),
            }
            .into());
        }
        joint
            .slice_mut(s![offset..stop, offset..stop])
            .assign(&local);
        offset = stop;
    }
    Ok(Some(joint))
}

/// Assembles the block-diagonal joint directional derivative `dH[d_beta]` from
/// each block's working set.
///
/// `d_beta_flat` is the joint direction, laid out block after block with
/// `spec.design.ncols()` entries per block. For every block the local
/// derivative is obtained according to the working-set variant returned by
/// `family.evaluate`:
///
/// * `ExactNewton` blocks delegate to
///   `exact_newton_hessian_directional_derivative`.
/// * `Diagonal` blocks form the predictor drift
///   `d_eta = X d_beta (+ d_offset + dX beta)`, ask the family for the
///   working-weight derivative `dw` along `d_eta`, and combine
///   `solver_design.xt_diag_x_signed_op(dw)` with the moving-design correction
///   `dXᵀ W X + Xᵀ W dX` when the family reports a design derivative.
///
/// Returns `Ok(None)` as soon as any block cannot supply its derivative, and
/// an all-zero matrix when the family reports a beta-independent joint
/// Hessian.
///
/// # Errors
///
/// Returns an error when state/spec/working-set counts disagree, when the
/// flat direction has the wrong length, or when any per-block quantity
/// (geometry offset derivative, design derivative, coefficient vector,
/// working weights, resulting local matrix) has an unexpected shape. Errors
/// from the family callbacks are propagated.
fn exact_newton_joint_hessian_directional_derivative_from_working_sets<F: CustomFamily + ?Sized>(
    family: &F,
    block_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    d_beta_flat: &Array1<f64>,
) -> Result<Option<Array2<f64>>, String> {
    if block_states.len() != specs.len() {
        return Err(format!(
            "exact_newton_joint_hessian_directional_derivative_with_specs default: block state count {} != spec count {}",
            block_states.len(),
            specs.len()
        ));
    }
    let total = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
    validate_flat_direction_length(
        d_beta_flat,
        total,
        "exact_newton_joint_hessian_directional_derivative_with_specs default",
    )?;
    if !family.exact_newton_joint_hessian_beta_dependent() {
        return Ok(Some(Array2::zeros((total, total))));
    }

    let evaluation = family.evaluate(block_states)?;
    if evaluation.blockworking_sets.len() != block_states.len() {
        return Err(format!(
            "exact_newton_joint_hessian_directional_derivative_with_specs default: working-set count {} != block count {}",
            evaluation.blockworking_sets.len(),
            block_states.len()
        ));
    }

    let mut joint = Array2::<f64>::zeros((total, total));
    let mut start = 0usize;
    for (block_idx, ((state, spec), working_set)) in block_states
        .iter()
        .zip(specs.iter())
        .zip(evaluation.blockworking_sets.iter())
        .enumerate()
    {
        let p_block = spec.design.ncols();
        let end = start + p_block;
        let d_beta_block = d_beta_flat.slice(s![start..end]).to_owned();
        let local = match working_set {
            BlockWorkingSet::ExactNewton { .. } => family
                .exact_newton_hessian_directional_derivative(
                    block_states,
                    block_idx,
                    &d_beta_block,
                )?,
            BlockWorkingSet::Diagonal {
                working_weights, ..
            } => {
                let solver_design = spec.solver_design();
                // Predictor drift from the coefficient direction alone; the
                // geometry terms below are added when the family reports them.
                let mut d_eta = solver_design.apply(&d_beta_block);
                let mut geometry_correction = Array2::<f64>::zeros((p_block, p_block));
                if let Some(geometry) = family.block_geometry_directional_derivative(
                    block_states,
                    block_idx,
                    spec,
                    &d_beta_block,
                )? {
                    if geometry.d_offset.len() != d_eta.len() {
                        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                            "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} geometry offset derivative length {} != eta length {}",
                            geometry.d_offset.len(),
                            d_eta.len()
                        ) }.into());
                    }
                    d_eta += &geometry.d_offset;
                    if let Some(d_design) = geometry.d_design {
                        if d_design.nrows() != solver_design.nrows() || d_design.ncols() != p_block
                        {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} d_design shape {}x{} != expected {}x{}",
                                d_design.nrows(),
                                d_design.ncols(),
                                solver_design.nrows(),
                                p_block
                            ) }.into());
                        }
                        // `dot` panics on an inner-dimension mismatch; report
                        // it as an error like every other shape problem here.
                        if state.beta.len() != p_block {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} beta length {} != design column count {p_block}",
                                state.beta.len()
                            ) }.into());
                        }
                        // The row-wise `Zip` below panics unless the weights
                        // have exactly one entry per design row.
                        if working_weights.len() != solver_design.nrows() {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} working weight length {} != design row count {}",
                                working_weights.len(),
                                solver_design.nrows()
                            ) }.into());
                        }
                        // Moving design: eta also drifts through dX · beta.
                        d_eta += &d_design.dot(&state.beta);

                        // Row-scale X and dX by the working weights so that
                        // the two products below give dXᵀ W X and Xᵀ W dX.
                        let x_dense = solver_design.to_dense();
                        let mut weighted_x = x_dense.clone();
                        let mut weighted_dx = d_design.clone();
                        ndarray::Zip::from(weighted_x.rows_mut())
                            .and(weighted_dx.rows_mut())
                            .and(working_weights.view())
                            .for_each(|mut wx_row, mut wdx_row, &wi| {
                                wx_row.mapv_inplace(|value| value * wi);
                                wdx_row.mapv_inplace(|value| value * wi);
                            });
                        geometry_correction += &fast_atb(&d_design, &weighted_x);
                        geometry_correction += &fast_atb(&x_dense, &weighted_dx);
                    }
                }
                // `None` from the family means the weight derivative is
                // unavailable; `transpose` keeps that as `Ok(None)` while
                // still surfacing errors from the signed-weight product.
                family
                    .diagonalworking_weights_directional_derivative(
                        block_states,
                        block_idx,
                        &d_eta,
                    )?
                    .map(|dw| {
                        let mut local = solver_design
                            .xt_diag_x_signed_op(SignedWeightsView::from_array(&dw))?;
                        local += &geometry_correction;
                        Ok::<Array2<f64>, String>(local)
                    })
                    .transpose()?
            }
        };
        let Some(local) = local else {
            return Ok(None);
        };
        if local.nrows() != p_block || local.ncols() != p_block {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact_newton_joint_hessian_directional_derivative_with_specs default: block {block_idx} dH shape {}x{} != expected {p_block}x{p_block}",
                local.nrows(),
                local.ncols()
            ) }.into());
        }
        // Only the diagonal block is populated; cross-block entries stay zero.
        joint.slice_mut(s![start..end, start..end]).assign(&local);
        start = end;
    }
    Ok(Some(joint))
}

/// Fetches the family's joint exact-Newton Hessian, checks that it is
/// `total x total`, and symmetrizes it in place.
///
/// Returns `Ok(None)` when the family provides no joint Hessian. `context`
/// prefixes the shape-mismatch error message.
fn exact_newton_joint_hessian_symmetrized<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    total: usize,
    context: &str,
) -> Result<Option<Array2<f64>>, String> {
    match family.exact_newton_joint_hessian_with_specs(states, specs)? {
        None => Ok(None),
        Some(mut hessian) => {
            let (rows, cols) = (hessian.nrows(), hessian.ncols());
            if !(rows == total && cols == total) {
                return Err(format!(
                    "{context}: got {}x{}, expected {}x{}",
                    rows, cols, total, total
                ));
            }
            symmetrize_dense_in_place(&mut hessian);
            Ok(Some(hessian))
        }
    }
}

/// Scale-aware exact joint curvature payload for the outer REML evaluator.
///
/// `build_joint_hessian_closures` consumes this when the family's
/// `exact_newton_outer_curvature` returns `Some`, in which case it takes
/// precedence over the workspace and surrogate Hessian sources.
pub struct ExactNewtonOuterCurvature {
    /// Joint Hessian. The consumer requires it to be square with the joint
    /// coefficient dimension and free of non-finite entries, and symmetrizes
    /// it before use.
    pub hessian: Array2<f64>,
    /// Copied unchanged into the bundle's `rho_curvature_scale`; the other
    /// Hessian sources use `1.0` there.
    /// NOTE(review): presumably the factor relating `hessian` to the
    /// unscaled curvature — confirm against the producing family.
    pub rho_curvature_scale: f64,
    /// Copied unchanged into the bundle's `hessian_logdet_correction`; the
    /// other Hessian sources use `0.0` there.
    /// NOTE(review): presumably an additive log-determinant adjustment
    /// paired with the rescaling — confirm against the outer evaluator.
    pub hessian_logdet_correction: f64,
}

/// Joint Hessian handed to the outer evaluator, either as an explicit dense
/// matrix or as a set of matrix-free callbacks.
///
/// `materialize_joint_hessian_source` converts either variant into a dense
/// matrix when one is required.
enum JointHessianSource {
    /// Fully materialized joint Hessian.
    Dense(Array2<f64>),
    /// Matrix-free representation backed by workspace callbacks.
    Operator {
        /// Allocating matvec `v -> H · v`. Wired from
        /// `workspace.hessian_matvec`.
        apply: Arc<dyn Fn(&Array1<f64>) -> Result<Array1<f64>, String> + Send + Sync>,
        /// Write-into matvec used by the inner-Newton PCG hot path so the
        /// matvec result no longer allocates an `Array1<f64>` per CG iter.
        /// At large scale (~6400 inner CG iters per outer iter, p~200) this
        /// removes thousands of small Vec<f64> allocations from the tightest
        /// loop. Wired from `workspace.hessian_matvec_into`.
        apply_into: Arc<dyn Fn(&Array1<f64>, &mut Array1<f64>) -> Result<(), String> + Send + Sync>,
        /// Batched multi-RHS apply: `out = H · V` for `(total, n_rhs)` `V`.
        /// Wired from `workspace.hessian_apply_mat`, which for the BMS tiled
        /// row-primary Hessian sweeps each row tile once and applies its `Hᵢ`
        /// to every column. Column-basis dense reconstruction below uses this
        /// to materialise the operator in one batched sweep (`H = H · I`)
        /// rather than `total` single-vector HVPs, each of which re-reads every
        /// row tile. Numerically identical to looping `apply_into`.
        apply_mat: Arc<dyn Fn(&Array2<f64>, &mut Array2<f64>) -> Result<(), String> + Send + Sync>,
        /// Hessian diagonal as reported by `workspace.hessian_diagonal`; it
        /// is length-checked and finiteness-checked when the source is built.
        diagonal: Array1<f64>,
        /// Forced dense materialization that bypasses the workspace's
        /// `hessian_dense` amortization gate. Returns `Some` when the
        /// workspace can build dense via a structural direct path (e.g.
        /// CTN's `scop_gradient_and_negative_hessian`), `None` when the
        /// caller should fall back to column-basis HVP through `apply`.
        dense_forced: Arc<dyn Fn() -> Result<Option<Array2<f64>>, String> + Send + Sync>,
    },
}

/// Largest dense joint Hessian, in bytes, that may be materialized (512 MiB).
const EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES: usize = 512 * 1024 * 1024;

/// Byte size of a dense `total x total` matrix of `f64`.
///
/// # Errors
///
/// Returns an error if the byte count does not fit in `usize`.
fn exact_joint_hessian_dense_bytes(total: usize) -> Result<usize, String> {
    let entry_bytes = std::mem::size_of::<f64>();
    let byte_count = total
        .checked_mul(total)
        .and_then(|entries| entries.checked_mul(entry_bytes));
    match byte_count {
        Some(bytes) => Ok(bytes),
        None => Err(format!(
            "joint Hessian dense byte count overflow for dim={total}"
        )),
    }
}

/// Rejects dense materialization of a `total x total` joint Hessian whose
/// byte size exceeds `EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES`.
///
/// `context` prefixes the error message.
///
/// # Errors
///
/// Returns an error when the byte count overflows `usize` or exceeds the cap.
fn ensure_exact_joint_hessian_dense_budget(total: usize, context: &str) -> Result<(), String> {
    const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;

    let required = exact_joint_hessian_dense_bytes(total)?;
    if required <= EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES {
        return Ok(());
    }

    let reason = format!(
        "{context}: exact dense joint Hessian requires {:.2} GiB for dim={total}, \
     exceeding the {:.2} GiB cap; refusing approximate determinant algebra",
        required as f64 / BYTES_PER_GIB,
        EXACT_JOINT_HESSIAN_DENSE_MAX_BYTES as f64 / BYTES_PER_GIB,
    );
    Err(CustomFamilyError::UnsupportedConfiguration { reason }.into())
}

/// Everything `build_joint_hessian_closures` hands back for one outer
/// evaluation: the joint Hessian source, the flattened coefficients, and the
/// callbacks that produce its directional derivatives.
///
/// The borrowed callbacks are tied to `'a`; the `owned_*` twins hold their
/// own captures behind `Arc`.
struct JointHessianBundle<'a> {
    /// Joint Hessian, dense or matrix-free.
    source: JointHessianSource,
    /// Block coefficients flattened by `flatten_state_betas`.
    beta_flat: Array1<f64>,
    /// First directional derivative `dH[v]`.
    compute_dh: Box<DriftDerivFn<'a>>,
    /// Optional batched form of `compute_dh` over several directions.
    compute_dh_many: Option<Box<DriftDerivManyFn<'a>>>,
    /// Second directional derivative `d²H[u, v]`.
    compute_d2h: Box<DriftSecondDerivFn<'a>>,
    /// Optional batched second-derivative callback. The unified evaluator's
    /// outer-Hessian ρ-ρ pair loop forwards the K(K+1)/2 (v_k, v_l) pairs
    /// here in one call when set, so families that fuse the per-row D²H walk
    /// (e.g. survival marginal-slope scanning n rows once per outer eval)
    /// amortise the row-walk across all pairs instead of paying it per pair.
    compute_d2h_many: Option<Box<DriftSecondDerivManyFn<'a>>>,
    /// Owned twin of `compute_dh`.
    owned_compute_dh:
        Option<Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>>,
    /// Owned twin of `compute_dh_many`.
    owned_compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    /// Owned twin of `compute_d2h`.
    owned_compute_d2h: Option<
        Arc<
            dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
                + Send
                + Sync,
        >,
    >,
    /// Owned twin of `compute_d2h_many`. Threaded through to
    /// `OwnedJointDerivProvider` so the unified evaluator can share the
    /// callback across rayon worker threads when the outer Hessian routes
    /// through the parallel pair dispatch.
    owned_compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    /// `1.0` unless the family supplied an `ExactNewtonOuterCurvature`, in
    /// which case its `rho_curvature_scale` is forwarded.
    rho_curvature_scale: f64,
    /// `0.0` unless the family supplied an `ExactNewtonOuterCurvature`, in
    /// which case its `hessian_logdet_correction` is forwarded.
    hessian_logdet_correction: f64,
}

/// Borrowed callback computing the first directional derivative `dH[v]` of
/// the joint Hessian for one direction.
type DriftDerivFn<'a> =
    dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync + 'a;
/// Batched form of [`DriftDerivFn`]: takes a slice of directions and returns
/// a vector of optional results.
type DriftDerivManyFn<'a> =
    dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync + 'a;
/// Borrowed callback computing the second directional derivative `d²H[u, v]`
/// of the joint Hessian for one pair of directions.
type DriftSecondDerivFn<'a> = dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
    + Send
    + Sync
    + 'a;
/// Batched form of [`DriftSecondDerivFn`]: takes a slice of `(u, v)` pairs
/// and returns a vector of optional results.
type DriftSecondDerivManyFn<'a> = dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
    + Send
    + Sync
    + 'a;

/// Produces a dense `total x total` copy of the joint Hessian held by
/// `source`.
///
/// A `Dense` source is cloned as-is. An `Operator` source is first checked
/// against the dense byte budget, then materialized through `dense_forced`
/// when that yields a matrix, and otherwise by applying the batched operator
/// to the identity. Both operator paths reject non-finite entries and
/// symmetrize the result. `context` prefixes every error message.
fn materialize_joint_hessian_source(
    source: &JointHessianSource,
    total: usize,
    context: &str,
) -> Result<Array2<f64>, String> {
    let (apply_mat, dense_forced) = match source {
        JointHessianSource::Dense(matrix) => return Ok(matrix.clone()),
        JointHessianSource::Operator {
            apply_mat,
            dense_forced,
            ..
        } => (apply_mat, dense_forced),
    };
    ensure_exact_joint_hessian_dense_budget(total, context)?;

    // Preferred route: a structural direct-dense build exposed by the
    // workspace (e.g. SCOP's `scop_gradient_and_negative_hessian`). It is
    // `Θ(n·p²)` just like column-basis HVP, but sweeps the rows once and uses
    // BLAS-3 for the chain-rule pullback instead of re-walking all `n` rows
    // for every column.
    if let Some(mut dense) = dense_forced()? {
        let (rows, cols) = (dense.nrows(), dense.ncols());
        if !(rows == total && cols == total) {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "{context}: dense_forced shape mismatch: got {}x{}, expected {total}x{total}",
                rows,
                cols
            ) }.into());
        }
        if !dense.iter().all(|value| value.is_finite()) {
            return Err(CustomFamilyError::NumericalFailure {
                reason: format!("{context}: dense_forced returned non-finite values"),
            }
            .into());
        }
        symmetrize_dense_in_place(&mut dense);
        return Ok(dense);
    }

    // Fallback: column-basis reconstruction `H = H · I` through the batched
    // multi-RHS apply, so a tiled or streamed operator visits each row tile
    // once for all `total` columns rather than once per column. Column for
    // column this matches applying the operator to each unit basis vector.
    let basis = Array2::<f64>::eye(total);
    let mut dense = Array2::<f64>::zeros((total, total));
    apply_mat(&basis, &mut dense)?;
    if !dense.iter().all(|value| value.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!("{context}: operator matvec returned non-finite values"),
        }
        .into());
    }
    symmetrize_dense_in_place(&mut dense);
    Ok(dense)
}

/// Picks the joint Hessian representation a workspace should contribute for
/// the given materialization `intent`.
///
/// When the workspace prefers the operator form, the operator source is built
/// directly. Otherwise a dense Hessian is used if the workspace provides one
/// (after shape and finiteness validation and symmetrization), and the
/// operator source is the fallback. `context` prefixes error messages.
fn exact_newton_joint_hessian_source_from_workspace(
    workspace: &Arc<dyn ExactNewtonJointHessianWorkspace>,
    total: usize,
    intent: MaterializationIntent,
    context: &str,
) -> Result<Option<JointHessianSource>, String> {
    let operator_preferred = workspace.hessian_source_preference_for_intent(intent)
        == JointHessianSourcePreference::Operator;

    if !operator_preferred {
        if let Some(mut dense) = workspace.hessian_dense()? {
            let (rows, cols) = (dense.nrows(), dense.ncols());
            if !(rows == total && cols == total) {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "{context}: dense Hessian shape mismatch: got {}x{}, expected {total}x{total}",
                        rows,
                        cols
                    ),
                }
                .into());
            }
            if !dense.iter().all(|value| value.is_finite()) {
                return Err(CustomFamilyError::NumericalFailure {
                    reason: format!("{context}: dense Hessian contains non-finite values"),
                }
                .into());
            }
            symmetrize_dense_in_place(&mut dense);
            return Ok(Some(JointHessianSource::Dense(dense)));
        }
    }

    // Either the operator form is preferred or no dense Hessian is on offer.
    exact_newton_joint_hessian_operator_source_from_workspace(workspace, total, intent, context)
}

/// Wraps a Hessian workspace as a matrix-free `JointHessianSource::Operator`.
///
/// The workspace must expose a diagonal and Hessian-vector products. When
/// either is missing the result is `Ok(None)`, unless the workspace declared
/// an operator preference for `intent`, in which case the gap is an error.
///
/// Every callback in the returned source validates its argument shapes
/// against `total` and rejects non-finite output; `dense_forced`
/// additionally symmetrizes the matrix it returns. `context` prefixes the
/// error messages that carry one.
fn exact_newton_joint_hessian_operator_source_from_workspace(
    workspace: &Arc<dyn ExactNewtonJointHessianWorkspace>,
    total: usize,
    intent: MaterializationIntent,
    context: &str,
) -> Result<Option<JointHessianSource>, String> {
    // Consulted lazily, only once a capability turns out to be missing.
    let operator_required = || {
        workspace.hessian_source_preference_for_intent(intent)
            == JointHessianSourcePreference::Operator
    };

    let diagonal = match workspace.hessian_diagonal()? {
        Some(diagonal) => diagonal,
        None => {
            if operator_required() {
                return Err(CustomFamilyError::UnsupportedConfiguration {
                    reason: format!(
                        "{context}: operator-preferred Hessian workspace did not provide a diagonal"
                    ),
                }
                .into());
            }
            return Ok(None);
        }
    };
    if diagonal.len() != total {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "{context}: operator diagonal length mismatch: got {}, expected {}",
                diagonal.len(),
                total
            ),
        }
        .into());
    }
    if !diagonal.iter().all(|value| value.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!("{context}: operator diagonal contains non-finite values"),
        }
        .into());
    }

    if !workspace.hessian_matvec_available() {
        if operator_required() {
            return Err(CustomFamilyError::UnsupportedConfiguration {
                reason: format!(
                    "{context}: operator-preferred Hessian workspace did not provide HVPs"
                ),
            }
            .into());
        }
        return Ok(None);
    }

    // Each callback owns its own handle to the workspace and to the context
    // string so the closures can outlive this call.
    let shared_context: Arc<str> = Arc::from(context);

    let apply: Arc<dyn Fn(&Array1<f64>) -> Result<Array1<f64>, String> + Send + Sync> = {
        let ws = Arc::clone(workspace);
        let ctx = Arc::clone(&shared_context);
        Arc::new(move |v: &Array1<f64>| -> Result<Array1<f64>, String> {
            if v.len() != total {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "{}: operator input length mismatch: got {}, expected {total}",
                        &*ctx,
                        v.len()
                    ),
                }
                .into());
            }
            let product = match ws.hessian_matvec(v)? {
                Some(product) => product,
                None => {
                    return Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: "joint exact-newton operator matvec unavailable".to_string(),
                    }
                    .into());
                }
            };
            if product.len() != total {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "{}: operator matvec length mismatch: got {}, expected {total}",
                        &*ctx,
                        product.len()
                    ),
                }
                .into());
            }
            if !product.iter().all(|value| value.is_finite()) {
                return Err(CustomFamilyError::NumericalFailure {
                    reason: format!(
                        "{}: operator matvec returned non-finite values",
                        &*ctx
                    ),
                }
                .into());
            }
            Ok(product)
        })
    };

    let apply_into: Arc<
        dyn Fn(&Array1<f64>, &mut Array1<f64>) -> Result<(), String> + Send + Sync,
    > = {
        let ws = Arc::clone(workspace);
        let ctx = Arc::clone(&shared_context);
        Arc::new(
            move |v: &Array1<f64>, out: &mut Array1<f64>| -> Result<(), String> {
                if !(v.len() == total && out.len() == total) {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "{}: operator input/output length mismatch: v={} out={} expected={total}",
                            &*ctx,
                            v.len(),
                            out.len()
                        ),
                    }
                    .into());
                }
                let written = ws.hessian_matvec_into(v, out)?;
                if !written {
                    return Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: "joint exact-newton operator matvec unavailable".to_string(),
                    }
                    .into());
                }
                if !out.iter().all(|value| value.is_finite()) {
                    return Err(CustomFamilyError::NumericalFailure {
                        reason: format!(
                            "{}: operator matvec returned non-finite values",
                            &*ctx
                        ),
                    }
                    .into());
                }
                Ok(())
            },
        )
    };

    let apply_mat: Arc<
        dyn Fn(&Array2<f64>, &mut Array2<f64>) -> Result<(), String> + Send + Sync,
    > = {
        let ws = Arc::clone(workspace);
        let ctx = Arc::clone(&shared_context);
        Arc::new(
            move |v_cols: &Array2<f64>, out: &mut Array2<f64>| -> Result<(), String> {
                if !(v_cols.nrows() == total && out.nrows() == total) {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "{}: operator batched apply row mismatch: v_cols={}x{} out={}x{} expected rows={total}",
                            &*ctx,
                            v_cols.nrows(),
                            v_cols.ncols(),
                            out.nrows(),
                            out.ncols()
                        ),
                    }
                    .into());
                }
                if v_cols.ncols() != out.ncols() {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "{}: operator batched apply column mismatch: v_cols has {} columns, out has {}",
                            &*ctx,
                            v_cols.ncols(),
                            out.ncols()
                        ),
                    }
                    .into());
                }
                let written = ws.hessian_apply_mat(v_cols, out)?;
                if !written {
                    return Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: "joint exact-newton operator batched apply unavailable"
                            .to_string(),
                    }
                    .into());
                }
                if !out.iter().all(|value| value.is_finite()) {
                    return Err(CustomFamilyError::NumericalFailure {
                        reason: format!(
                            "{}: operator batched apply returned non-finite values",
                            &*ctx
                        ),
                    }
                    .into());
                }
                Ok(())
            },
        )
    };

    let dense_forced: Arc<dyn Fn() -> Result<Option<Array2<f64>>, String> + Send + Sync> = {
        let ws = Arc::clone(workspace);
        let ctx = Arc::clone(&shared_context);
        Arc::new(move || -> Result<Option<Array2<f64>>, String> {
            let Some(mut matrix) = ws.hessian_dense_forced()? else {
                return Ok(None);
            };
            if !(matrix.nrows() == total && matrix.ncols() == total) {
                return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                    "{}: hessian_dense_forced shape mismatch: got {}x{}, expected {total}x{total}",
                    &*ctx,
                    matrix.nrows(),
                    matrix.ncols()
                ) }.into());
            }
            if !matrix.iter().all(|value| value.is_finite()) {
                return Err(CustomFamilyError::NumericalFailure {
                    reason: format!(
                        "{}: hessian_dense_forced returned non-finite values",
                        &*ctx
                    ),
                }
                .into());
            }
            symmetrize_dense_in_place(&mut matrix);
            Ok(Some(matrix))
        })
    };

    Ok(Some(JointHessianSource::Operator {
        apply,
        apply_into,
        apply_mat,
        diagonal,
        dense_forced,
    }))
}

/// Validates that `matrix` is `expected x expected` and entirely finite, then
/// returns it symmetrized.
///
/// `context` prefixes both error messages. The shape check runs before the
/// finiteness check.
fn symmetrized_square_matrix(
    mut matrix: Array2<f64>,
    expected: usize,
    context: &str,
) -> Result<Array2<f64>, String> {
    let (rows, cols) = matrix.dim();
    if !(rows == expected && cols == expected) {
        return Err(format!(
            "{context}: got {}x{}, expected {}x{}",
            rows, cols, expected, expected
        ));
    }
    if !matrix.iter().all(|entry| entry.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!("{context}: matrix contains non-finite values"),
        }
        .into());
    }
    symmetrize_dense_in_place(&mut matrix);
    Ok(matrix)
}

/// Try exact Newton joint Hessian first, then surrogate. Returns `None` if
/// neither path provides a joint Hessian. When successful, returns the joint
/// Hessian source, flat beta, and boxed closures for computing directional
/// derivatives dH[v] and d²H[u,v].
///
/// This eliminates the previously duplicated exact-Newton and surrogate
/// code blocks in `outerobjectivegradienthessian_internal`.
fn build_joint_hessian_closures<'a, F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &'a F,
    block_states: &'a [ParameterBlockState],
    specs: &'a [ParameterBlockSpec],
    total: usize,
    options: &BlockwiseFitOptions,
    preferred_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<Option<JointHessianBundle<'a>>, String> {
    // Path 1: exact Newton joint Hessian (preferred).
    let beta_flat = flatten_state_betas(block_states, specs);
    let synced = Arc::new(synchronized_states_from_flat_beta(
        family,
        specs,
        block_states,
        &beta_flat,
    )?);
    let hessian_workspace = match preferred_workspace {
        Some(workspace) => Some(workspace),
        None => family.exact_newton_joint_hessian_workspace_with_options(
            synced.as_ref(),
            specs,
            options,
        )?,
    };
    // Outer-eval entry: prime any per-row jet caches the workspace will hand
    // to the directional-derivative path. Runs at top-level rayon (we are
    // outside the ext-coord `par_iter` here), so the cache build's own
    // `par_iter` enjoys full thread-pool parallelism. PIRLS-side workspace
    // construction skips this priming because PIRLS never invokes
    // `directional_derivative_operator`.
    if let Some(workspace) = hessian_workspace.as_ref() {
        workspace.warm_up_outer_caches()?;
    }
    if let Some(curvature) = family.exact_newton_outer_curvature(block_states)? {
        let h_joint_unpen = JointHessianSource::Dense(symmetrized_square_matrix(
            curvature.hessian,
            total,
            "joint exact-newton Hessian shape mismatch in outer gradient (rescaled)",
        )?);
        let compute_dh = Box::new(exact_newton_dh_closure(
            family,
            Arc::clone(&synced),
            specs,
            total,
            true,
            1.0,
            hessian_workspace.clone(),
        ));
        let compute_dh_many = None;
        let compute_d2h = Box::new(exact_newton_d2h_closure(
            family,
            Arc::clone(&synced),
            specs,
            total,
            true,
            1.0,
            hessian_workspace.clone(),
        ));
        let owned_compute_dh = exact_newton_dh_closure_owned(
            family.clone(),
            Arc::clone(&synced),
            specs.to_vec(),
            total,
            true,
            1.0,
            hessian_workspace.clone(),
        );
        let owned_compute_dh_many = None;
        let owned_compute_d2h = exact_newton_d2h_closure_owned(
            family.clone(),
            Arc::clone(&synced),
            specs.to_vec(),
            total,
            true,
            1.0,
            hessian_workspace.clone(),
        );
        return Ok(Some(JointHessianBundle {
            source: h_joint_unpen,
            beta_flat,
            compute_dh,
            compute_dh_many,
            compute_d2h,
            compute_d2h_many: None,
            owned_compute_dh: Some(owned_compute_dh),
            owned_compute_dh_many,
            owned_compute_d2h: Some(owned_compute_d2h),
            owned_compute_d2h_many: None,
            rho_curvature_scale: curvature.rho_curvature_scale,
            hessian_logdet_correction: curvature.hessian_logdet_correction,
        }));
    }
    let exact_joint_source = if let Some(workspace) = hessian_workspace.as_ref() {
        exact_newton_joint_hessian_source_from_workspace(
            workspace,
            total,
            MaterializationIntent::OuterGradient,
            "joint exact-newton operator mismatch in outer gradient",
        )?
    } else {
        None
    };
    let exact_joint_source = match exact_joint_source {
        Some(source) => Some(source),
        None => exact_newton_joint_hessian_symmetrized(
            family,
            block_states,
            specs,
            total,
            "joint exact-newton Hessian shape mismatch in outer gradient",
        )
        .map(|source| source.map(JointHessianSource::Dense))?,
    };
    if let Some(h_joint_unpen) = exact_joint_source {
        let compute_dh = Box::new(exact_newton_dh_closure(
            family,
            Arc::clone(&synced),
            specs,
            total,
            false,
            1.0,
            hessian_workspace.clone(),
        ));
        let compute_dh_many = exact_newton_dh_many_closure(1.0, hessian_workspace.clone());
        let compute_d2h = Box::new(exact_newton_d2h_closure(
            family,
            Arc::clone(&synced),
            specs,
            total,
            false,
            1.0,
            hessian_workspace.clone(),
        ));
        let owned_compute_dh = exact_newton_dh_closure_owned(
            family.clone(),
            Arc::clone(&synced),
            specs.to_vec(),
            total,
            false,
            1.0,
            hessian_workspace.clone(),
        );
        let owned_compute_dh_many =
            exact_newton_dh_many_closure_owned(1.0, hessian_workspace.clone());
        let owned_compute_d2h = exact_newton_d2h_closure_owned(
            family.clone(),
            Arc::clone(&synced),
            specs.to_vec(),
            total,
            false,
            1.0,
            hessian_workspace.clone(),
        );
        let compute_d2h_many = exact_newton_d2h_many_closure(1.0, hessian_workspace.clone());
        let owned_compute_d2h_many =
            exact_newton_d2h_many_closure_owned(1.0, hessian_workspace.clone());
        return Ok(Some(JointHessianBundle {
            source: h_joint_unpen,
            beta_flat,
            compute_dh,
            compute_dh_many,
            compute_d2h,
            compute_d2h_many,
            owned_compute_dh: Some(owned_compute_dh),
            owned_compute_dh_many,
            owned_compute_d2h: Some(owned_compute_d2h),
            owned_compute_d2h_many,
            rho_curvature_scale: 1.0,
            hessian_logdet_correction: 0.0,
        }));
    }

    // Path 2: surrogate joint Hessian (fallback).
    if let Some(h_joint_unpen) = family
        .joint_outer_hyper_surrogate_hessian_with_specs(block_states, specs)?
        .map(|h| {
            symmetrized_square_matrix(
                h,
                total,
                "joint outer-hyper surrogate Hessian shape mismatch",
            )
        })
        .transpose()?
    {
        let beta_flat = flatten_state_betas(block_states, specs);

        let compute_dh = Box::new(
            move |v_k: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
                let h_rho = family
                    .joint_outer_hyper_surrogate_hessian_directional_derivative_with_specs(
                        block_states,
                        specs,
                        v_k,
                    )?;
                match h_rho {
                    Some(h) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                        h,
                        total,
                        "joint surrogate dH shape mismatch",
                    )?))),
                    None => Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: "joint surrogate dH unavailable for analytic outer gradient"
                            .to_string(),
                    }
                    .into()),
                }
            },
        );
        let compute_d2h = Box::new(
            move |u: &Array1<f64>, v: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
                match family
                    .joint_outer_hyper_surrogate_hessian_second_directional_derivative_with_specs(
                        block_states,
                        specs,
                        u,
                        v,
                    )? {
                    Some(m) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                        m,
                        total,
                        "joint surrogate d2H shape mismatch",
                    )?))),
                    None => Ok(None),
                }
            },
        );
        let family_owned = family.clone();
        let states_owned = block_states.to_vec();
        let specs_owned = specs.to_vec();
        let owned_compute_dh = Arc::new(
            move |v_k: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
                match family_owned
                    .joint_outer_hyper_surrogate_hessian_directional_derivative_with_specs(
                        &states_owned,
                        &specs_owned,
                        v_k,
                    )? {
                    Some(h) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                        h,
                        total,
                        "joint surrogate dH shape mismatch",
                    )?))),
                    None => Err(CustomFamilyError::UnsupportedConfiguration {
                        reason: "joint surrogate dH unavailable for analytic outer gradient"
                            .to_string(),
                    }
                    .into()),
                }
            },
        );
        let family_owned = family.clone();
        let states_owned = block_states.to_vec();
        let specs_owned = specs.to_vec();
        let owned_compute_d2h = Arc::new(
            move |u: &Array1<f64>, v: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
                match family_owned
                    .joint_outer_hyper_surrogate_hessian_second_directional_derivative_with_specs(
                        &states_owned,
                        &specs_owned,
                        u,
                        v,
                    )? {
                    Some(m) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                        m,
                        total,
                        "joint surrogate d2H shape mismatch",
                    )?))),
                    None => Ok(None),
                }
            },
        );
        return Ok(Some(JointHessianBundle {
            source: JointHessianSource::Dense(h_joint_unpen),
            beta_flat,
            compute_dh,
            compute_dh_many: None,
            compute_d2h,
            compute_d2h_many: None,
            owned_compute_dh: Some(owned_compute_dh),
            owned_compute_dh_many: None,
            owned_compute_d2h: Some(owned_compute_d2h),
            owned_compute_d2h_many: None,
            rho_curvature_scale: 1.0,
            hessian_logdet_correction: 0.0,
        }));
    }

    Ok(None)
}

/// Post-process a dense directional Hessian derivative `dH[v]` and wrap it as
/// a [`DriftDerivResult::Dense`].
///
/// Steps, in order:
/// 1. when `check_finite` is set, reject any non-finite entry of `h` with a
///    `CustomFamilyError::NumericalFailure`;
/// 2. symmetrize `h` via `symmetrized_square_matrix`, validating it against
///    the expected dimension `total` (a shape mismatch propagates as `Err`);
/// 3. multiply the symmetrized matrix by `scale` unless `scale` is exactly
///    `1.0`.
///
/// The borrowed factory (`exact_newton_dh_closure`) guards against non-finite
/// output (`check_finite = true`); the owned factory
/// (`exact_newton_dh_closure_owned`) historically does not (`check_finite =
/// false`).  Routing both through this helper keeps that behavioral
/// distinction explicit rather than silently divergent.
///
/// Always returns `Ok(Some(..))` on success; the `Option` only exists so the
/// result type lines up with the closures that call this helper.
fn finalize_dh_dense(
    h: Array2<f64>,
    total: usize,
    scale: f64,
    check_finite: bool,
) -> Result<Option<DriftDerivResult>, String> {
    if check_finite {
        let all_finite = h.iter().all(|entry| entry.is_finite());
        if !all_finite {
            return Err(CustomFamilyError::NumericalFailure {
                reason: "joint exact-newton dH returned non-finite values".to_string(),
            }
            .into());
        }
    }

    let mut symmetric =
        symmetrized_square_matrix(h, total, "joint exact-newton dH shape mismatch")?;
    // Skip the multiply entirely for the common unit-scale case.
    let needs_scaling = scale != 1.0;
    if needs_scaling {
        symmetric *= scale;
    }
    Ok(Some(DriftDerivResult::Dense(symmetric)))
}

/// Single source of truth for the dH[v] three-way dispatch shared by the
/// borrowed (`exact_newton_dh_closure`) and owned
/// (`exact_newton_dh_closure_owned`) closure factories.
///
/// Dispatch order:
/// 1. `use_outer_curvature_derivatives == true`: ask the family for the
///    outer-curvature directional derivative. The workspace is not consulted
///    on this path.
/// 2. otherwise, when a `workspace` is supplied and it produces an operator
///    for the direction, return that operator (scaled by `scale`) without
///    forming a dense matrix.
/// 3. otherwise fall back to the family's joint-Hessian directional
///    derivative.
///
/// Dense answers from (1) and (3) are finalized by `finalize_dh_dense`
/// (optional finiteness check, symmetrization against `total`, scaling). A
/// `None` dense answer is an `UnsupportedConfiguration` error because the
/// analytic outer gradient cannot proceed without dH.
///
/// The `check_finite` flag preserves the lone behavioral difference between
/// the two factories (the borrowed variant rejects non-finite dense output,
/// the owned variant does not).
///
/// # Sign convention
///
/// `v_k` is ALREADY the perturbation direction `δβ` the caller wants the
/// directional Hessian derivative evaluated along. The
/// `HessianDerivativeProvider`s
/// (`BorrowedJointDerivProvider`/`OwnedJointDerivProvider`) own the implicit-
/// function-theorem sign `δβ = −H⁻¹(A_k β̂)` and negate before calling this
/// closure (matching `exact_newton_d2h_apply` and the owned `_many` closure,
/// which also pass the direction straight through). Re-negating here would
/// double-negate `D_β H[δβ]`, flipping the mode-response drift in the outer
/// LAML trace `½ tr(K · (B_i + D_β H[δβ_i]))` and desynchronising the analytic
/// outer gradient from its objective for every β-dependent-Hessian exact
/// family (spatial-adaptive, survival/bernoulli marginal-slope). The direction
/// is therefore forwarded by reference, untouched and without copying.
fn exact_newton_dh_apply<F: CustomFamily + Sync>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<&Arc<dyn ExactNewtonJointHessianWorkspace>>,
    check_finite: bool,
    v_k: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
    let dense = if use_outer_curvature_derivatives {
        family.exact_newton_outer_curvature_directional_derivative_with_specs(
            synced_states,
            specs,
            v_k,
        )?
    } else {
        // Workspace-operator fast path: only taken when the workspace actually
        // yields an operator; otherwise fall through to the dense fallback.
        if let Some(workspace) = workspace
            && let Some(operator) = workspace.directional_derivative_operator(v_k)?
        {
            return Ok(Some(scale_drift_deriv_result(
                DriftDerivResult::Operator(operator),
                scale,
            )));
        }
        family.exact_newton_joint_hessian_directional_derivative_with_specs(
            synced_states,
            specs,
            v_k,
        )?
    };

    match dense {
        Some(h) => finalize_dh_dense(h, total, scale, check_finite),
        None => Err(CustomFamilyError::UnsupportedConfiguration {
            reason: "joint exact-newton dH unavailable for analytic outer gradient".to_string(),
        }
        .into()),
    }
}

/// Build the borrowed closure computing dH[v] using exact Newton derivatives
/// on synced states.
///
/// The returned closure borrows `family` and `specs` for `'a`, owns the shared
/// `synced_states` / `workspace` handles, and forwards every call to
/// `exact_newton_dh_apply`. Non-finite dense derivative output is treated as a
/// hard error on this borrowed path (`check_finite = true`).
fn exact_newton_dh_closure<'a, F: CustomFamily + Sync>(
    family: &'a F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: &'a [ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> impl Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync + 'a {
    move |v_k: &Array1<f64>| {
        exact_newton_dh_apply(
            family,
            synced_states.as_ref(),
            specs,
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            // check_finite: the borrowed variant rejects non-finite dense output.
            true,
            v_k,
        )
    }
}

/// Build the borrowed batched dH closure backed by a joint-Hessian workspace.
///
/// Returns `None` when no `workspace` is supplied. Otherwise the closure asks
/// the workspace for one directional-derivative operator per direction and
/// wraps each available operator as a `DriftDerivResult::Operator` scaled by
/// `scale`; entries the workspace reports as `None` stay `None`.
fn exact_newton_dh_many_closure<'a>(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<Box<DriftDerivManyFn<'a>>> {
    let workspace = workspace?;
    Some(Box::new(move |directions: &[Array1<f64>]| {
        // `directions` are already the perturbation directions `δβ`; the provider
        // owns the IFT sign and pre-negates (see `exact_newton_dh_apply`). The
        // owned `_many` counterpart passes them straight through, so this borrowed
        // path must too — re-negating here double-flips the mode-response drift.
        let operators = workspace.directional_derivative_operators(directions)?;
        Ok(operators
            .into_iter()
            .map(|slot| {
                slot.map(|operator| {
                    scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
                })
            })
            .collect())
    }))
}

/// Single source of truth for the d²H[u,v] three-way dispatch shared by the
/// borrowed (`exact_newton_d2h_closure`) and owned
/// (`exact_newton_d2h_closure_owned`) closure factories.  Takes references for
/// `family`/`specs` so both ownership flavors can call it; the only difference
/// between the two factories is borrow-vs-own plumbing, which lives in the
/// wrappers, not here.
///
/// Dispatch order:
/// 1. `use_outer_curvature_derivatives == true`: the family's outer-curvature
///    second directional derivative (the workspace is not consulted);
/// 2. otherwise, a workspace operator for `(u, v)` when a workspace is present
///    and yields one, returned scaled by `scale`;
/// 3. otherwise the family's joint-Hessian second directional derivative.
///
/// Dense answers are symmetrized against `total` and multiplied by `scale`
/// unless it is exactly `1.0`. Unlike the first-order dispatch, a missing
/// dense answer is not an error here: it is reported as `Ok(None)`.
fn exact_newton_d2h_apply<F: CustomFamily + Sync>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<&Arc<dyn ExactNewtonJointHessianWorkspace>>,
    u: &Array1<f64>,
    v: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
    let dense = if use_outer_curvature_derivatives {
        family.exact_newton_outer_curvature_second_directional_derivative_with_specs(
            synced_states,
            specs,
            u,
            v,
        )?
    } else {
        // Operator fast path; falls through to the dense fallback when the
        // workspace is absent or declines to provide an operator.
        if let Some(workspace) = workspace
            && let Some(operator) = workspace.second_directional_derivative_operator(u, v)?
        {
            return Ok(Some(scale_drift_deriv_result(
                DriftDerivResult::Operator(operator),
                scale,
            )));
        }
        family.exact_newton_joint_hessian_second_directional_derivative_with_specs(
            synced_states,
            specs,
            u,
            v,
        )?
    };

    let Some(raw) = dense else {
        return Ok(None);
    };
    // Shared dense finalization for both family-provided paths.
    let mut symmetric =
        symmetrized_square_matrix(raw, total, "joint exact-newton d2H shape mismatch")?;
    if scale != 1.0 {
        symmetric *= scale;
    }
    Ok(Some(DriftDerivResult::Dense(symmetric)))
}

/// Build the borrowed closure computing d²H[u,v] using exact Newton
/// derivatives on synced states.
///
/// The returned closure borrows `family` and `specs` for `'a`, owns the shared
/// `synced_states` / `workspace` handles, and forwards every `(u, v)` pair
/// unchanged to `exact_newton_d2h_apply`, which holds the dispatch logic.
fn exact_newton_d2h_closure<'a, F: CustomFamily + Sync>(
    family: &'a F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: &'a [ParameterBlockSpec],
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> impl Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync + 'a
{
    move |u: &Array1<f64>, v: &Array1<f64>| {
        exact_newton_d2h_apply(
            family,
            synced_states.as_ref(),
            specs,
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            u,
            v,
        )
    }
}

/// Build the borrowed batched d²H closure backed by a joint-Hessian workspace.
///
/// Returns `None` when no `workspace` is supplied. Otherwise the closure asks
/// the workspace for one second-directional-derivative operator per `(u, v)`
/// pair and wraps each available operator as a `DriftDerivResult::Operator`
/// scaled by `scale`; entries the workspace reports as `None` stay `None`.
fn exact_newton_d2h_many_closure<'a>(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<Box<DriftSecondDerivManyFn<'a>>> {
    let workspace = workspace?;
    Some(Box::new(move |pairs: &[(Array1<f64>, Array1<f64>)]| {
        let operators = workspace.second_directional_derivative_operators(pairs)?;
        Ok(operators
            .into_iter()
            .map(|slot| {
                slot.map(|operator| {
                    scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
                })
            })
            .collect())
    }))
}

/// Owned counterpart of `exact_newton_dh_closure`: builds an `Arc`'d dH[v]
/// closure that owns `family` and `specs` instead of borrowing them.
///
/// Every call is forwarded to `exact_newton_dh_apply`. Unlike the borrowed
/// factory, this one passes `check_finite = false`, so non-finite dense output
/// is not rejected on this path.
fn exact_newton_dh_closure_owned<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: Vec<ParameterBlockSpec>,
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync> {
    Arc::new(move |v_k: &Array1<f64>| {
        exact_newton_dh_apply(
            &family,
            synced_states.as_ref(),
            &specs,
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            // check_finite: the owned variant does not reject non-finite output.
            false,
            v_k,
        )
    })
}

/// Owned (`Arc`'d) batched dH closure backed by a joint-Hessian workspace.
///
/// Returns `None` when no `workspace` is supplied. The directions are passed
/// straight through to the workspace; each available operator is wrapped as a
/// `DriftDerivResult::Operator` scaled by `scale`, and entries the workspace
/// reports as `None` stay `None`.
fn exact_newton_dh_many_closure_owned(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<
    Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
> {
    let workspace = workspace?;
    Some(Arc::new(move |directions: &[Array1<f64>]| {
        let operators = workspace.directional_derivative_operators(directions)?;
        Ok(operators
            .into_iter()
            .map(|slot| {
                slot.map(|operator| {
                    scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
                })
            })
            .collect())
    }))
}

/// Owned counterpart of `exact_newton_d2h_closure`: builds an `Arc`'d
/// d²H[u,v] closure that owns `family` and `specs` instead of borrowing them.
///
/// Every `(u, v)` pair is forwarded unchanged to `exact_newton_d2h_apply`,
/// which holds the dispatch logic shared with the borrowed factory.
fn exact_newton_d2h_closure_owned<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: F,
    synced_states: Arc<Vec<ParameterBlockState>>,
    specs: Vec<ParameterBlockSpec>,
    total: usize,
    use_outer_curvature_derivatives: bool,
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Arc<dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>
{
    Arc::new(move |u: &Array1<f64>, v: &Array1<f64>| {
        exact_newton_d2h_apply(
            &family,
            synced_states.as_ref(),
            &specs,
            total,
            use_outer_curvature_derivatives,
            scale,
            workspace.as_ref(),
            u,
            v,
        )
    })
}

/// Owned (`Arc`'d) batched d²H closure backed by a joint-Hessian workspace.
///
/// Returns `None` when no `workspace` is supplied. Each `(u, v)` pair is handed
/// to the workspace; every available operator is wrapped as a
/// `DriftDerivResult::Operator` scaled by `scale`, and entries the workspace
/// reports as `None` stay `None`.
fn exact_newton_d2h_many_closure_owned(
    scale: f64,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Option<
    Arc<
        dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
            + Send
            + Sync,
    >,
> {
    let workspace = workspace?;
    Some(Arc::new(move |pairs: &[(Array1<f64>, Array1<f64>)]| {
        let operators = workspace.second_directional_derivative_operators(pairs)?;
        Ok(operators
            .into_iter()
            .map(|slot| {
                slot.map(|operator| {
                    scale_drift_deriv_result(DriftDerivResult::Operator(operator), scale)
                })
            })
            .collect())
    }))
}

/// Strict SPD solve of `matrix · x = rhs` with no ridge and no fallback.
///
/// The input is copied and symmetrized in place, then factored with a
/// lower-triangular Cholesky. A failed factorization is reported as `Err`
/// (the underlying factorization error is discarded in favor of a fixed
/// message); on success the factor is used to solve for `rhs`.
fn strict_solve_spd(matrix: &Array2<f64>, rhs: &Array1<f64>) -> Result<Array1<f64>, String> {
    let mut symmetric = matrix.clone();
    symmetrize_dense_in_place(&mut symmetric);
    match symmetric.cholesky(Side::Lower) {
        Ok(factor) => Ok(factor.solvevec(rhs)),
        Err(_) => Err("strict pseudo-laplace SPD solve failed".to_string()),
    }
}

/// Statistics about a Levenberg-Marquardt-style δ-ridge SPD continuation.
/// Populated by `strict_spd_lm_engine` and returned from
/// `strict_solve_spd_with_lm_continuation`; surfaced for diagnostics — a
/// recurring need for nontrivial ridges signals fragile curvature that the
/// controller may need to escalate.
///
/// The `Default` value (`delta_used = 0.0`, `escalations = 0`) is what the
/// engine reports when the bare strict path succeeds or the matrix is empty.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct StrictSpdLmStats {
    /// δ value finally used (0.0 means the bare strict solve succeeded).
    /// On the engine's eigen-floor fallback this instead holds the ridge the
    /// schedule had advanced to after its last failed attempt; the fallback
    /// itself adds no ridge to the matrix.
    pub(crate) delta_used: f64,
    /// Number of escalations performed before Cholesky succeeded (1-based
    /// index of the successful ridged attempt). The engine reports
    /// `STRICT_SPD_LM_MAX_ESCALATIONS + 1` when every ridged attempt failed
    /// and the eigen-floor fallback produced the result.
    pub(crate) escalations: usize,
}

/// Maximum number of ridged Cholesky attempts made by `strict_spd_lm_engine`.
///
/// Background: the strict-mode LM continuation solves `(H + δI) x = b` with δ
/// escalated geometrically until the Cholesky succeeds.  The bare
/// `strict_solve_spd` is unchanged — callers that need strict semantics keep
/// them.  Callers that want fail-soft Newton on a fragile geometry (e.g.
/// spatial-adaptive seed evaluation) use the continuation wrappers to avoid
/// bouncing the entire seed on a numerically-indefinite block.
///
/// Schedule as implemented by the engine:
/// δ₀ = max(ε · max(mean|diag H|, 1), `CUSTOM_FAMILY_RIDGE_FLOOR`); growth
/// ×`STRICT_SPD_LM_RIDGE_GROWTH` after each failed attempt; at most this many
/// attempts.  The cap prevents runaway curvature from producing arbitrary
/// ridges; once it is hit the engine switches to its eigen-floor fallback, and
/// an error is returned only if that eigendecomposition also fails.
///
/// Shared escalation/ridge-growth schedule used by the
/// `strict_*_spd_with_lm_continuation` helpers. Hoisted here so a single
/// change updates the solve / inverse / logdet paths in lockstep.
const STRICT_SPD_LM_MAX_ESCALATIONS: usize = 16;
/// Multiplicative growth applied to the ridge δ after each failed ridged
/// Cholesky attempt in `strict_spd_lm_engine`.
const STRICT_SPD_LM_RIDGE_GROWTH: f64 = 10.0;

/// Floor applied to IRLS working weights so downstream divisions cannot hit
/// exact zero. Used as the default `minweight` in `CustomFamilyOptions` and
/// mirrored in tests that override it.
///
/// Sourced from the canonical PIRLS positive-weight floor
/// ([`crate::solver::pirls::MIN_WEIGHT`] = `1e-12`) so every floored family
/// shares one definition; this alias keeps the descriptive local name at the
/// `minweight` defaults.
const CUSTOM_FAMILY_WEIGHT_FLOOR: f64 = crate::solver::pirls::MIN_WEIGHT;

/// Default initial ridge δ for the explicit-stabilization Cholesky escalation
/// schedule. Enters the quadratic term, the Laplace Hessian, and the penalty
/// log-determinant via the active `RidgePolicy`. Also the lower bound on the
/// first ridge δ₀ tried by `strict_spd_lm_engine`.
const CUSTOM_FAMILY_RIDGE_FLOOR: f64 = 1e-12;

/// Relative eigenvalue floor used wherever an eigendecomposition needs to
/// distinguish "real" curvature from noise: `eps_floor = EVAL_FLOOR · max|λ|`.
/// Applied uniformly in the strict-SPD LM eigen fallback, positive-part
/// pseudo-inverse, and penalty-direction projection.
const CUSTOM_FAMILY_EVAL_FLOOR: f64 = 1e-12;

/// Absolute relative-condition guard used to prevent the eigen / spectral
/// floors from collapsing to zero when `max|λ|` is itself tiny. Combined with
/// `CUSTOM_FAMILY_EVAL_FLOOR · max|λ|` via `.max(...)`.
const CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR: f64 = 1e-14;

/// Shared three-stage engine behind the `strict_*_spd_with_lm_continuation`
/// helpers. Each caller (solve / inverse / logdet) supplies the
/// operation-specific closures; the staging lives here so the siblings cannot
/// drift apart.
///
/// Stages:
/// 1. **Bare strict path** — `bare_path(matrix)`. On success its value is
///    returned with default stats; its error is discarded and the engine
///    continues.
/// 2. **LM δ-ridge Cholesky** — the symmetrized matrix gets `δ` added to its
///    diagonal and is factored. `δ` starts at
///    `max(ε · max(mean|diag|, 1), CUSTOM_FAMILY_RIDGE_FLOOR)` and is multiplied
///    by `STRICT_SPD_LM_RIDGE_GROWTH` after every failure, for at most
///    `STRICT_SPD_LM_MAX_ESCALATIONS` attempts. The first successful factor is
///    handed to `process_chol`.
/// 3. **Eigen-floor fallback** — the symmetrized (un-ridged) matrix is
///    eigendecomposed and `process_eigen` receives the eigenvalues, the
///    eigenvectors and `eps_floor = max(CUSTOM_FAMILY_EVAL_FLOOR · max|λ|, 1e-300)`.
///    How the floor is applied is up to the closure.
///
/// An empty matrix (`nrows == 0`) whose bare path failed yields `empty` with
/// default stats. `Err` is returned only when the stage-3 eigendecomposition
/// itself fails; `op_label` prefixes that message.
fn strict_spd_lm_engine<R>(
    matrix: &Array2<f64>,
    op_label: &'static str,
    empty: R,
    bare_path: impl FnOnce(&Array2<f64>) -> Result<R, String>,
    process_chol: impl FnOnce(&crate::faer_ndarray::FaerCholeskyFactor) -> R,
    process_eigen: impl FnOnce(&Array1<f64>, &Array2<f64>, f64) -> R,
) -> Result<(R, StrictSpdLmStats), String> {
    // Stage 1: un-ridged strict attempt.
    if let Ok(value) = bare_path(matrix) {
        return Ok((value, StrictSpdLmStats::default()));
    }

    let dim = matrix.nrows();
    if dim == 0 {
        return Ok((empty, StrictSpdLmStats::default()));
    }

    let mut symmetric = matrix.clone();
    symmetrize_dense_in_place(&mut symmetric);
    // Mean absolute diagonal entry sets the scale of the first ridge.
    let trace_scale = (0..dim).map(|i| symmetric[[i, i]].abs()).sum::<f64>() / (dim as f64);
    let mut delta = (f64::EPSILON * trace_scale.max(1.0)).max(CUSTOM_FAMILY_RIDGE_FLOOR);

    // Stage 2: geometric δ-ridge escalation.
    for attempt in 1..=STRICT_SPD_LM_MAX_ESCALATIONS {
        let mut shifted = symmetric.clone();
        for i in 0..dim {
            shifted[[i, i]] += delta;
        }
        if let Ok(factor) = shifted.cholesky(Side::Lower) {
            let stats = StrictSpdLmStats {
                delta_used: delta,
                escalations: attempt,
            };
            return Ok((process_chol(&factor), stats));
        }
        delta *= STRICT_SPD_LM_RIDGE_GROWTH;
    }

    // Stage 3: δ-ridge schedule exhausted; fall back to rank-aware eigen-floor
    // handling. The floor `eps_floor = 1e-12 · max|λ|` lets well-conditioned
    // modes be resolved exactly while rank-deficient directions get controlled
    // curvature, preventing the spatial-adaptive pilot from collapsing to a
    // cold full-data run.
    let (evals, evecs) = match FaerEigh::eigh(&symmetric, Side::Lower) {
        Ok(decomposition) => decomposition,
        Err(e) => {
            let max_esc = STRICT_SPD_LM_MAX_ESCALATIONS;
            return Err(format!(
                "{op_label} failed even with LM δ-ridge continuation \
                 (escalated {max_esc} times to δ={delta:.3e}, trace_scale={trace_scale:.3e}); \
                 eigen-floor fallback also failed: {e}"
            ));
        }
    };
    let spectral_radius = evals.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
    let eps_floor = (CUSTOM_FAMILY_EVAL_FLOOR * spectral_radius).max(1e-300);
    let stats = StrictSpdLmStats {
        delta_used: delta,
        escalations: STRICT_SPD_LM_MAX_ESCALATIONS + 1,
    };
    Ok((process_eigen(&evals, &evecs, eps_floor), stats))
}

/// Fail-soft SPD solve of `matrix · x = rhs` layered on `strict_spd_lm_engine`.
///
/// The three engine stages are instantiated as:
/// - bare path: `strict_solve_spd` (no ridge);
/// - ridged Cholesky: solve with the factor of the ridged matrix;
/// - eigen fallback: spectral solve `x = Q · diag(1/λ̃) · Qᵀ · rhs` with every
///   eigenvalue clamped from below, `λ̃ₖ = max(λₖ, eps_floor)`.
///
/// Returns the solution together with the continuation statistics; an empty
/// vector is the result for an empty matrix whose bare path failed.
pub(crate) fn strict_solve_spd_with_lm_continuation(
    matrix: &Array2<f64>,
    rhs: &Array1<f64>,
) -> Result<(Array1<f64>, StrictSpdLmStats), String> {
    let dim = matrix.nrows();
    strict_spd_lm_engine(
        matrix,
        "strict pseudo-laplace SPD solve",
        Array1::<f64>::zeros(0),
        |m| strict_solve_spd(m, rhs),
        |chol| chol.solvevec(rhs),
        |evals, evecs, eps_floor| {
            // Project rhs onto each eigenvector and divide by the floored
            // eigenvalue: (Qᵀ rhs)ₖ / max(λₖ, eps_floor).
            let mut scaled_projection = Array1::<f64>::zeros(dim);
            for (k, slot) in scaled_projection.iter_mut().enumerate() {
                let projection =
                    (0..dim).fold(0.0_f64, |acc, i| acc + evecs[[i, k]] * rhs[i]);
                *slot = projection / evals[k].max(eps_floor);
            }
            // Rotate back into the original basis: x = Q · scaled_projection.
            let mut solution = Array1::<f64>::zeros(dim);
            for (i, slot) in solution.iter_mut().enumerate() {
                *slot =
                    (0..dim).fold(0.0_f64, |acc, k| acc + evecs[[i, k]] * scaled_projection[k]);
            }
            solution
        },
    )
}

/// Exact pseudo-Laplace log-determinant `log|H + S_λ|` of the REML/LAML
/// objective, computed from the eigenspectrum with **no δ-ridge** so the value
/// stays on the same objective as the analytic gradient `tr((H+S_λ)⁻¹ ·)`
/// (gam#748).
///
/// The earlier strict path returned `log|H + S_λ + δI|` with `δ = δ(ρ)`
/// escalated geometrically until factorization succeeded. That makes `V(ρ)`
/// carry a ρ-dependent, discontinuous `δ(ρ)` the analytic derivatives ignore —
/// exactly the objective/derivative mismatch the
/// operator-dense path's own comment forbids ("mixing an approximate
/// determinant with exact traces gives ARC a Hessian for a different
/// objective"). The strict path now computes one honest quantity:
///
/// - eigendecompose the symmetrised `H + S_λ`;
/// - **reject** (return `Err`) when any eigenvalue is non-finite. NaN/∞
///   eigenvalues compare false against both cutoffs below, so without this
///   guard they would be silently dropped and a finite logdet returned for a
///   non-finite Hessian;
/// - **reject** (return `Err`) when any eigenvalue is genuinely negative
///   (`λ < −tol`). An indefinite joint coefficient Hessian is a real defect
///   (a non-stationary inner β or a mis-signed curvature block); rejecting it
///   tells the outer optimizer to step back, instead of masking it with a
///   biased finite number;
/// - sum `Σ_{λ > tol} log λ` — the exact pseudo-logdet on the positive
///   eigenspace, which is `C∞` in ρ because the positive eigenspace of a PSD
///   `S(ρ)=Σ e^{ρ_k} S_k` is structurally fixed. A near-zero band `[−tol, tol]`
///   (a structural null space) is simply not in `range` and contributes no
///   term, matching the projected `tr` derivative; a near-singular-but-positive
///   curvature is accepted exactly as the historical Cholesky strict path did.
///
/// `accumulation_depth` is the `n` in the roundoff bound that sizes the
/// indefiniteness-rejection band.
///
/// # Errors
///
/// Returns `Err` when the eigendecomposition fails, when the spectrum contains
/// a non-finite value, or when an eigenvalue lies below `−neg_tol`.
fn strict_exact_pseudo_logdet(
    matrix: &Array2<f64>,
    accumulation_depth: usize,
) -> Result<f64, String> {
    let mut sym = matrix.clone();
    symmetrize_dense_in_place(&mut sym);
    let (evals, _) = FaerEigh::eigh(&sym, Side::Lower)
        .map_err(|e| format!("strict pseudo-laplace eigendecomposition failed: {e}"))?;

    // A non-finite eigenvalue fails both `ev < -neg_tol` and `ev > pos_tol`,
    // so it would otherwise vanish from the sum and the caller would receive a
    // finite-looking logdet for a non-finite Hessian. Reject it explicitly,
    // in the same "reject, do not mask" spirit as the indefiniteness check.
    if let Some(bad) = evals.iter().copied().find(|ev| !ev.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!(
                "strict pseudo-laplace logdet: non-finite eigenvalue ({bad}) in the joint \
                 coefficient Hessian spectrum; rejected instead of being silently dropped"
            ),
        }
        .into());
    }

    let p = sym.nrows();
    let max_abs_eval = evals.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
    // Bauer-Fike: |δσ| ≤ p·‖δH‖_∞; n-term fma roundoff gives ‖δH‖_∞ ≤ ε·n·‖H‖,
    // so σ_noise ≤ ε·n·p·‖H‖₂. Tenfold slack absorbs sign cancellations,
    // and a 100·ε floor handles the ‖H‖→0 limit. This `neg_tol` is the
    // INDEFINITENESS-rejection band only: an eigenvalue below `−neg_tol` is a
    // genuine negative curvature (non-stationary β / mis-signed block) and is
    // rejected, not masked (gam#748).
    let eps = f64::EPSILON;
    let eps_np = eps * (accumulation_depth as f64) * (p as f64);
    let neg_tol = (10.0 * eps_np * max_abs_eval).max(100.0 * eps);
    // POSITIVE-eigenspace inclusion cutoff for the pseudo-logdet sum. This MUST
    // be byte-identical to the cutoff the analytic REML gradient's trace kernel
    // uses (`positive_eigenvalue_threshold`, the `range(H+Sλ)` Moore–Penrose
    // pinv drop in `joint_penalty_subspace_trace_parts`), or the LAML VALUE
    // `½ log|H+Sλ|₊` and its analytic GRADIENT `½ tr((H+Sλ)⁺ ∂Sλ)` are evaluated
    // over DIFFERENT subspaces and describe DIFFERENT objectives — the "mixing
    // an approximate determinant with exact traces gives ARC a Hessian for a
    // different objective" trap (gam#748).
    //
    // Historically this sum used the Bauer–Fike `neg_tol = 10·ε·n·p·‖H‖`, a
    // factor of ~n/10 LARGER than the kernel's `100·ε·p·‖H‖`. At an oversmoothed
    // marginal-slope ρ probe a penalty-null trend eigenvalue lands in the band
    // `(100·ε·p·‖H‖, 10·ε·n·p·‖H‖)`: DROPPED from the value logdet but KEPT in
    // the gradient kernel, so the analytic outer gradient is the derivative of a
    // different objective than the value. ARC's predicted descent then never
    // matches the actual objective change and the outer optimizer freezes
    // (constant ‖g‖, stuck cost — gam#808). Sharing the kernel's threshold here
    // removes the desync at the source; both are `C∞` in ρ (the positive
    // eigenspace of a PSD-shifted Hessian is structurally fixed).
    let eval_slice = evals.as_slice().ok_or_else(|| {
        "strict pseudo-laplace logdet: eigenvalue vector is not contiguous".to_string()
    })?;
    let pos_tol = positive_eigenvalue_threshold(eval_slice);
    if evals.iter().any(|&ev| ev < -neg_tol) {
        let min_eval = evals.iter().copied().fold(f64::INFINITY, f64::min);
        let below = evals.iter().filter(|&&ev| ev < -neg_tol).count();
        return Err(CustomFamilyError::NumericalFailure {
            reason: format!(
                "strict pseudo-laplace logdet: {below} eigenvalue(s) below -neg_tol \
             (min(λ)={min_eval:.6e}, max|λ|={max_abs_eval:.6e}, neg_tol={neg_tol:.6e}, εnp={eps_np:.6e}); \
             indefinite joint coefficient Hessian rejected (no δ-ridge masking, gam#748)"
            ),
        }
        .into());
    }
    Ok(evals
        .iter()
        .copied()
        .filter(|&ev| ev > pos_tol)
        .map(f64::ln)
        .sum())
}

/// Pseudo-inverse of the positive part of a symmetric matrix.
///
/// The input is copied, symmetrized and eigendecomposed. Eigenpairs whose
/// eigenvalue is at or below
/// `tol = max(CUSTOM_FAMILY_EVAL_FLOOR · max|λ|, ridge_floor, CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR)`
/// are skipped; every remaining pair contributes `(1/λ) · q qᵀ`. The
/// accumulated result is symmetrized once more before being returned.
///
/// # Errors
///
/// Returns `Err` when the eigendecomposition fails.
fn pinv_positive_part(matrix: &Array2<f64>, ridge_floor: f64) -> Result<Array2<f64>, String> {
    let mut symmetric = matrix.clone();
    symmetrize_dense_in_place(&mut symmetric);
    let (spectrum, basis) = match symmetric.eigh(Side::Lower) {
        Ok(decomposition) => decomposition,
        Err(e) => {
            return Err(format!(
                "positive-part covariance eigendecomposition failed: {e}"
            ));
        }
    };

    let spectral_radius = spectrum.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
    let absolute_floor = ridge_floor.max(CUSTOM_FAMILY_CONDITION_RELATIVE_FLOOR);
    let cutoff = (spectral_radius * CUSTOM_FAMILY_EVAL_FLOOR).max(absolute_floor);

    let dim = matrix.nrows();
    let mut pinv = Array2::<f64>::zeros((dim, dim));
    for (k, &ev) in spectrum.iter().enumerate() {
        // Skip the non-positive / noise part of the spectrum.
        if ev <= cutoff {
            continue;
        }
        let inv_ev = 1.0 / ev;
        // Rank-one update pinv += (1/λₖ) · qₖ qₖᵀ.
        for i in 0..dim {
            let row_weight = inv_ev * basis[[i, k]];
            for j in 0..dim {
                pinv[[i, j]] += row_weight * basis[[j, k]];
            }
        }
    }
    symmetrize_dense_in_place(&mut pinv);
    Ok(pinv)
}

/// Whether the outer objective includes the Hessian log-determinant term.
///
/// True only when the REML objective is enabled in `options` and the family's
/// exact-Newton outer objective is `RidgedQuadraticReml` or
/// `StrictPseudoLaplace`. The family is not queried when the REML objective is
/// disabled.
fn include_exact_newton_logdet_h<F: CustomFamily + ?Sized>(
    family: &F,
    options: &BlockwiseFitOptions,
) -> bool {
    if !options.use_remlobjective {
        return false;
    }
    let objective = family.exact_newton_outerobjective();
    matches!(
        objective,
        ExactNewtonOuterObjective::RidgedQuadraticReml
            | ExactNewtonOuterObjective::StrictPseudoLaplace
    )
}

/// Declare which outer derivatives a custom family offers to the outer
/// planner, as a `(gradient, hessian)` pair.
///
/// - gradient: `Analytic` when the family's derivative policy reports gradient
///   capability, otherwise `Unavailable`;
/// - hessian: `Either` when the outer Hessian is enabled in `options`, the
///   objective includes the Hessian log-determinant term, and the policy
///   reports Hessian capability; otherwise `Unavailable`.
pub(crate) fn custom_family_outer_derivatives<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> (
    crate::solver::outer_strategy::Derivative,
    crate::solver::outer_strategy::DeclaredHessianForm,
) {
    use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

    // Capability vs. policy: capability says *what the family can compute*,
    // policy says *what we should ask for at this size*.
    //
    // Only `specs` and `options` are available here (no resolved psi_dim), so
    // the policy is queried at psi_dim = 0 — the forms returned are the
    // pre-psi declarations consumed by the outer planner ladder. The per-iter
    // clamp in `optimize_spatial_length_scale_exact_joint` consults
    // `outer_derivative_policy` again with the realized psi_dim for the κ
    // optimizer.
    let policy = family.outer_derivative_policy(specs, 0, options);

    let gradient = if policy.capability.has_gradient() {
        Derivative::Analytic
    } else {
        Derivative::Unavailable
    };

    // The analytic outer Hessian is routed to ARC whenever the realized family
    // exposes second-order calculus. Matrix-free Hessian support is a
    // representation capability used by the evaluator; it must not be hidden
    // from the outer optimizer by a cost-based first-order policy.
    let hessian_declared = options.use_outer_hessian
        && include_exact_newton_logdet_h(family, options)
        && policy.capability.has_hessian();
    let hessian = if hessian_declared {
        DeclaredHessianForm::Either
    } else {
        DeclaredHessianForm::Unavailable
    };

    (gradient, hessian)
}

/// Whether the outer objective includes the penalty log-determinant term.
///
/// True only when the family's exact-Newton outer objective is
/// `RidgedQuadraticReml` and the REML objective is enabled in `options`.
fn include_exact_newton_logdet_s<F: CustomFamily + ?Sized>(
    family: &F,
    options: &BlockwiseFitOptions,
) -> bool {
    let objective = family.exact_newton_outerobjective();
    objective == ExactNewtonOuterObjective::RidgedQuadraticReml && options.use_remlobjective
}

/// Whether the family's exact-Newton outer objective is `StrictPseudoLaplace`.
fn use_exact_newton_strict_spd<F: CustomFamily + ?Sized>(family: &F) -> bool {
    let objective = family.exact_newton_outerobjective();
    objective == ExactNewtonOuterObjective::StrictPseudoLaplace
}

/// Convenience wrapper around `blockwise_logdet_terms_with_workspace` that
/// supplies no preferred joint-Hessian workspace (`None`).
///
/// All arguments are forwarded unchanged and the callee's result is returned
/// as-is. NOTE(review): the returned pair is presumably the (Hessian logdet,
/// penalty logdet) contributions — confirm against the callee.
fn blockwise_logdet_terms<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
    block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
) -> Result<(f64, f64), String> {
    blockwise_logdet_terms_with_workspace(family, specs, states, block_log_lambdas, options, None)
}

/// Compute the two log-determinant terms of the outer objective:
/// `(logdet_h, logdet_s)`, i.e. the logdet of the penalized Hessian and the
/// summed per-block penalty pseudo-logdet.
///
/// Outline (in evaluation order):
///   1. If neither term is requested (`include_exact_newton_logdet_h` /
///      `include_exact_newton_logdet_s`), return `(0.0, 0.0)` immediately.
///   2. Refresh every block's eta, then optionally build the Jeffreys
///      curvature `H_Φ` (skipped during seed screening, when the family does
///      not require it, or when the cheap matvec pre-check certifies it can be
///      skipped).
///   3. For each block `b`, assemble `S_λ = Σ_k exp(block_log_lambdas[b][k])·S_k`
///      and, when logdet-S is requested, its pseudo-logdet. Blocks may be
///      processed in parallel; totals are summed sequentially in block order.
///   4. If logdet-H is not requested, return `(0.0, logdet_s)`.
///   5. Otherwise evaluate `logdet_h` from the first available Hessian source:
///      the family's rescaled outer curvature, an operator-backed joint
///      Hessian (from `preferred_workspace` or a family-built workspace)
///      materialized densely, the plain symmetrized joint Hessian, and finally
///      the sum of per-block logdets from the family's block working sets.
///
/// `preferred_workspace`, when `Some`, is used both for the matvec pre-check
/// and as the operator-backed Hessian source.
///
/// # Errors
/// Returns `Err(String)` when any family callback, materialization, or logdet
/// helper fails, when the family returns the wrong number of block working
/// sets, or when an exact-Newton block Hessian has the wrong shape.
fn blockwise_logdet_terms_with_workspace<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &mut [ParameterBlockState],
    block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
    preferred_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<(f64, f64), String> {
    let include_logdet_h = include_exact_newton_logdet_h(family, options);
    let include_logdet_s = include_exact_newton_logdet_s(family, options);
    if !include_logdet_h && !include_logdet_s {
        return Ok((0.0, 0.0));
    }
    let strict_spd = use_exact_newton_strict_spd(family);
    refresh_all_block_etas(family, specs, states)?;
    let ranges = block_param_ranges(specs);
    // Joint coefficient count: the end of the last block range (0 with no blocks).
    let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
    // Universal full-span robustness: the outer REML logdet of the
    // penalized Hessian must use the SAME Jeffreys-augmented Hessian
    // `H + S_λ + H_Φ` the inner Newton converged on, or the LAML score and its
    // analytic derivatives describe a different objective. Compute `H_Φ` once
    // over the full-span basis `Z_J` and add it into whichever
    // logdet path runs below. `None` ⇒ no logdet-H contribution (logdet-S only).
    // Cheap matrix-free conditioning pre-check for the OUTER logdet H_Φ. When a
    // matrix-free workspace exposes the Hessian-vector product, bound the joint
    // information's spectrum from a few matvecs (no dense H, no O(p³) eigh): if it
    // certifies well-conditioned the exact gate is certain to return H_Φ = 0, so
    // we skip the whole dense formation and use `None` (no logdet-H Jeffreys
    // contribution), byte-identical to the gated-off path. This keeps the outer
    // LAML logdet consistent with the inner solve (which also gated the term off
    // on the same well-conditioned geometry) while preserving the matrix-free path
    // at outer-eval scale. Returns `false`/unsure ⇒ exact formation below.
    let outer_precheck_eligible = include_logdet_h
        && total >= crate::estimate::reml::jeffreys_subspace::CHEAP_CONDITIONING_PRECHECK_MIN_DIM;
    let outer_jeffreys_precheck_skips = match preferred_workspace.as_ref() {
        Some(ws) if outer_precheck_eligible && ws.hessian_matvec_available() => {
            let hv = |v: &Array1<f64>| -> Result<Array1<f64>, String> {
                match ws.hessian_matvec(v)? {
                    Some(out) if out.len() == total => Ok(out),
                    // Workspace declined this matvec ⇒ cannot certify ⇒ do not skip.
                    // Return a non-finite sentinel so the cheap estimator bails to
                    // the conservative `false` (never skip on an unresolved apply).
                    _ => Ok(Array1::from_elem(total, f64::NAN)),
                }
            };
            // An error from the estimator is treated the same as "unsure": do not skip.
            crate::estimate::reml::jeffreys_subspace::jeffreys_term_skippable_via_matvec(hv, total)
                .unwrap_or(false)
        }
        _ => false,
    };
    let logdet_jeffreys_hphi: Option<Array2<f64>> = if include_logdet_h
        && !outer_jeffreys_precheck_skips
        && !options.seed_screening
        && family.joint_jeffreys_term_required()
    {
        // Skipped during seed screening: this per-axis Jeffreys curvature
        // (O(p · per-axis-Hdot)) augments the outer LAML logdet `½ log|H+Sλ+H_Φ|`,
        // a refinement the screening SCORE does not need. Screening ranks seeds by
        // the un-augmented `½ log|H+Sλ|` plus the value-only Firth penalty already
        // in `penalty_value`; the load-bearing H_Φ is restored for the real fit
        // (gam#729/#808).
        match build_joint_jeffreys_subspace(specs, &ranges)? {
            Some(z_joint) => {
                // Only the curvature component of the returned triple is used here.
                custom_family_joint_jeffreys_term(family, states, specs, &ranges, &z_joint)?
                    .map(|(_phi, _grad, hphi)| hphi)
            }
            None => None,
        }
    } else {
        None
    };
    // Per-block worker: returns the dense `S_λ` for block `b` together with its
    // penalty pseudo-logdet (0.0 when logdet-S is not requested).
    let compute_block_logdet_term = |b: usize| -> Result<(Array2<f64>, f64), String> {
        let spec = &specs[b];
        let (start, end) = ranges[b];
        let p = end - start;
        let lambdas = block_log_lambdas[b].mapv(f64::exp);
        let mut s_lambda = Array2::<f64>::zeros((p, p));
        for (k, s) in spec.penalties.iter().enumerate() {
            s.add_scaled_to(lambdas[k], &mut s_lambda);
        }
        let block_logdet = if include_logdet_s {
            // Pseudo-logdet of S_λ on the positive eigenspace.
            //
            // CONSISTENCY REQUIREMENT (gam#752/#748/#808 class): this VALUE is
            // the `log|S_λ|₊` term of the outer REML/LAML objective, and its
            // ρ-gradient is supplied separately by
            // `compute_block_penalty_logdet_derivs`, which differentiates the
            // canonical `PenaltyPseudologdet`. If the value used a *different*
            // positive/null eigenspace split (e.g. structural-count `skip(m0)`
            // by COUNT, or a ridge-blind `positive_eigenvalue_threshold`) than
            // the gradient's by-magnitude `> ridge + noise_band` rule, the
            // outer optimizer would see an objective and a gradient that
            // describe different functions near the ridge boundary (a barely-
            // active mode `λ_k σ_k → 0` whose ridged eigenvalue dips below
            // `ridge + noise_band` is kept by the count rule but dropped by the
            // magnitude rule). To guarantee value↔gradient agree by
            // construction, compute the value from the SAME canonical
            // `PenaltyPseudologdet` the gradient differentiates, with the same
            // dense penalty components, the same λ, and the same ridge.
            let ridge = if options.ridge_policy.include_penalty_logdet {
                effective_solverridge(options.ridge_floor)
            } else {
                0.0
            };
            let penalties_dense: Vec<Array2<f64>> =
                spec.penalties.iter().map(|pen| pen.to_dense()).collect();
            let lambdas_vec: Vec<f64> = lambdas.to_vec();
            match crate::estimate::reml::penalty_logdet::PenaltyPseudologdet::from_components(
                &penalties_dense,
                &lambdas_vec,
                ridge,
            ) {
                Ok(pld) => pld.value(),
                Err(eigh_err_msg) => {
                    // `from_components` only fails when the single internal
                    // eigendecomposition fails, which for PSD penalties is
                    // purely numerical. Fall back to Cholesky on the ridged
                    // matrix (which should be SPD). The Cholesky logdet
                    // includes null-space contributions (~m₀ × ln(ridge)),
                    // a smooth bias that does not corrupt the REML gradient.
                    let mut s_for_logdet = s_lambda.clone();
                    if ridge > 0.0 {
                        for i in 0..p {
                            s_for_logdet[[i, i]] += ridge;
                        }
                    }
                    penalty_logdet_cholesky_fallback(&s_for_logdet, ridge, b, p, &eigh_err_msg)?
                }
            }
        } else {
            0.0
        };
        Ok((s_lambda, block_logdet))
    };

    // Per-block penalty assembly and eigendecomposition are independent.
    // Use rayon only from non-rayon callers so inner operator/eigendecomp work
    // does not nest under an existing worker. Collecting an indexed range into
    // a Vec preserves block order; totals are accumulated sequentially below
    // to keep floating-point summation deterministic.
    let block_terms: Vec<Result<(Array2<f64>, f64), String>> =
        if specs.len() > 1 && rayon::current_thread_index().is_none() {
            use rayon::iter::{IntoParallelIterator, ParallelIterator};
            (0..specs.len())
                .into_par_iter()
                .map(compute_block_logdet_term)
                .collect()
        } else {
            (0..specs.len()).map(compute_block_logdet_term).collect()
        };
    let mut s_lambdas = Vec::with_capacity(block_terms.len());
    let mut penalty_logdet_s_total = 0.0;
    for block_term in block_terms {
        // The first failing block (in block order) aborts the whole evaluation.
        let (s_lambda, block_logdet) = block_term?;
        s_lambdas.push(s_lambda);
        penalty_logdet_s_total += block_logdet;
    }
    if !include_logdet_h {
        return Ok((0.0, penalty_logdet_s_total));
    }
    // Try the shared scale-aware exact curvature path first.
    if let Some(curvature) = family.exact_newton_outer_curvature(states)? {
        let mut h_joint = symmetrized_square_matrix(
            curvature.hessian,
            total,
            "joint exact-newton Hessian validation in logdet terms (rescaled)",
        )?;
        // On this path the penalty, the Jeffreys curvature, and the ridge floor
        // are all multiplied by `rho_curvature_scale` before the logdet, and
        // `hessian_logdet_correction` is added afterwards.
        for (b, s_lambda) in s_lambdas.iter().enumerate() {
            let (start, end) = ranges[b];
            h_joint
                .slice_mut(ndarray::s![start..end, start..end])
                .scaled_add(curvature.rho_curvature_scale, s_lambda);
        }
        if let Some(hphi) = logdet_jeffreys_hphi.as_ref() {
            h_joint.scaled_add(curvature.rho_curvature_scale, hphi);
        }
        let logdet_h_scaled = if strict_spd {
            strict_exact_pseudo_logdet(&h_joint, joint_observation_count(states))?
        } else {
            stable_logdet_with_ridge_policy(
                &h_joint,
                options.ridge_floor * curvature.rho_curvature_scale,
                options.ridge_policy,
            )?
        };
        let logdet_h_total = logdet_h_scaled + curvature.hessian_logdet_correction;
        return Ok((logdet_h_total, penalty_logdet_s_total));
    }
    // Operator-backed source: prefer the caller's workspace; otherwise ask the
    // family for one only when not in strict-SPD mode and the matrix-free path
    // is selected for this problem size.
    let exact_joint_source = if let Some(workspace) = preferred_workspace.as_ref() {
        exact_newton_joint_hessian_source_from_workspace(
            workspace,
            total,
            MaterializationIntent::LogdetFactorization,
            "joint exact-newton operator mismatch in logdet terms",
        )?
    } else if !strict_spd && use_joint_matrix_free_path(total, joint_observation_count(states)) {
        family
            .exact_newton_joint_hessian_workspace_with_options(states, specs, options)?
            .as_ref()
            .map(|workspace| {
                exact_newton_joint_hessian_source_from_workspace(
                    workspace,
                    total,
                    MaterializationIntent::LogdetFactorization,
                    "joint exact-newton operator mismatch in logdet terms",
                )
            })
            .transpose()?
            .flatten()
    } else {
        None
    };
    if let Some(source) = exact_joint_source {
        // Exact determinant of H + S_λ for operator-backed coefficient Hessians.
        //
        // The REML gradient and Hessian use analytic trace identities such as
        // ∂ log|A(θ)| = tr(A⁻¹ A_θ).  Mixing an approximate determinant with
        // exact traces violates that identity and gives ARC a Hessian for a
        // different objective.  Materializing the coefficient Hessian by
        // canonical-basis HVPs keeps the objective/derivative pair exact.  At
        // large-scale CTN scale `total` is a few hundred, so this is sub-MiB; the
        // materializer below refuses oversized systems before allocation.
        let mut h_joint = materialize_joint_hessian_source(
            &source,
            total,
            "joint exact-newton operator dense logdet materialization",
        )?;
        for (b, s_lambda) in s_lambdas.iter().enumerate() {
            let (start, end) = ranges[b];
            h_joint
                .slice_mut(ndarray::s![start..end, start..end])
                .scaled_add(1.0, s_lambda);
        }
        if let Some(hphi) = logdet_jeffreys_hphi.as_ref() {
            h_joint.scaled_add(1.0, hphi);
        }
        let logdet_h_total = if strict_spd {
            strict_exact_pseudo_logdet(&h_joint, joint_observation_count(states))?
        } else {
            stable_logdet_with_ridge_policy(&h_joint, options.ridge_floor, options.ridge_policy)?
        };
        return Ok((logdet_h_total, penalty_logdet_s_total));
    }
    // Fallback: try the non-rescaled symmetrized path (for families that
    // don't implement exact_newton_outer_curvature but do provide
    // a plain joint Hessian).
    if let Some(mut h_joint) = exact_newton_joint_hessian_symmetrized(
        family,
        states,
        specs,
        total,
        "joint exact-newton Hessian validation in logdet terms",
    )? {
        for (b, s_lambda) in s_lambdas.iter().enumerate() {
            let (start, end) = ranges[b];
            h_joint
                .slice_mut(ndarray::s![start..end, start..end])
                .scaled_add(1.0, s_lambda);
        }
        if let Some(hphi) = logdet_jeffreys_hphi.as_ref() {
            h_joint.scaled_add(1.0, hphi);
        }
        let logdet_h_total = if strict_spd {
            strict_exact_pseudo_logdet(&h_joint, joint_observation_count(states))?
        } else {
            stable_logdet_with_ridge_policy(&h_joint, options.ridge_floor, options.ridge_policy)?
        };
        return Ok((logdet_h_total, penalty_logdet_s_total));
    }

    // Last resort: no joint Hessian is available, so evaluate the family and sum
    // the logdet of each block's own `H_b + S_λ,b`.
    // NOTE(review): `logdet_jeffreys_hphi` is not added on this per-block path —
    // confirm that is intended for families that reach it.
    let eval = family.evaluate(states)?;
    if eval.blockworking_sets.len() != specs.len() {
        return Err(format!(
            "family returned {} block working sets, expected {}",
            eval.blockworking_sets.len(),
            specs.len()
        ));
    }

    let mut logdet_h_total = 0.0;
    let logdet_s_total = penalty_logdet_s_total;
    for b in 0..specs.len() {
        let spec = &specs[b];
        let work = &eval.blockworking_sets[b];
        let p = spec.design.ncols();
        // Unpenalized block curvature: weighted normal equations for diagonal
        // working sets, or the supplied Hessian (densified) for exact-Newton ones.
        let xtwx = match work {
            BlockWorkingSet::Diagonal {
                working_response: _,
                working_weights,
            } => with_block_geometry(family, states, spec, b, |x_dyn, _| {
                let w = floor_positiveworking_weights(working_weights, options.minweight);
                let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
                Ok(xtwx)
            })?,
            BlockWorkingSet::ExactNewton {
                gradient: _,
                hessian,
            } => {
                if hessian.nrows() != p || hessian.ncols() != p {
                    return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                        "block {b} exact-newton Hessian shape mismatch: got {}x{}, expected {}x{}",
                        hessian.nrows(),
                        hessian.ncols(),
                        p,
                        p
                    ) }.into());
                }
                hessian.to_dense()
            }
        };

        let s_lambda = &s_lambdas[b];

        let mut h = xtwx;
        h += s_lambda;
        logdet_h_total += if strict_spd {
            strict_exact_pseudo_logdet(&h, joint_observation_count(states))?
        } else {
            stable_logdet_with_ridge_policy(&h, options.ridge_floor, options.ridge_policy)?
        };
    }
    Ok((logdet_h_total, logdet_s_total))
}

/// Saved copy of one block's `eta`, used to roll a line search back.
///
/// A checkpoint is taken from one block's state and is meant to be applied
/// back to that same block; it owns its own buffer, so nothing is shared
/// between the checkpoints of different blocks.
struct BlockEtaCheckpoint {
    saved: Array1<f64>,
}

impl BlockEtaCheckpoint {
    /// Snapshot `state.eta` into a freshly allocated buffer.
    fn capture(state: &ParameterBlockState) -> Self {
        let saved = state.eta.clone();
        Self { saved }
    }

    /// Snapshot `state.eta`, recycling `buf` when its length already matches.
    ///
    /// On a length match the eta values are copied into `buf` and the buffer is
    /// moved into the checkpoint (`buf` is left holding its `Default` value).
    /// On a mismatch `buf` is left untouched and a fresh clone is taken instead.
    fn capture_reuse(state: &ParameterBlockState, buf: &mut Array1<f64>) -> Self {
        if buf.len() != state.eta.len() {
            return Self::capture(state);
        }
        buf.assign(&state.eta);
        let saved = std::mem::take(buf);
        Self { saved }
    }

    /// Give the owned buffer back so a later `capture_reuse` can recycle it.
    fn into_buffer(self) -> Array1<f64> {
        let Self { saved } = self;
        saved
    }

    /// Roll back: overwrite `state.eta` with the saved values.
    fn restore_eta(&self, state: &mut ParameterBlockState) {
        state.eta.assign(&self.saved);
    }

    /// Set `state.eta = saved + alpha * direction`, writing in place into the
    /// existing `eta` storage.
    fn restore_eta_with_step(
        &self,
        state: &mut ParameterBlockState,
        alpha: f64,
        direction: &Array1<f64>,
    ) {
        // Reset to the checkpoint first, then add the scaled direction on top.
        self.restore_eta(state);
        state.eta.scaled_add(alpha, direction);
    }
}

/// Which branch of the trust-region radius policy fired on one update.
///
/// The value is surfaced in cycle logs (via [`JointTrustRegionDecision::label`])
/// so a reader can tell whether the inner solver is being throttled by the
/// trust region (`RejectFloor` / `ShrinkOnRejection`) or whether the step sits
/// comfortably inside it (`HoldInside`), in which case slow convergence is not
/// a trust-region-policy problem.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum JointTrustRegionDecision {
    /// `rho > 0.75` and `step_norm >= 0.99 * old_radius`: the model is good and
    /// the step reached the boundary, so the radius is doubled.
    GrowAtBoundary,
    /// `rho > 0.75` with the step well inside the region: radius held, because
    /// nothing suggests the trust region was limiting the step. If this fires
    /// every cycle while the inner solve converges linearly, the trust region
    /// is not the bottleneck.
    HoldInside,
    /// `0.25 <= rho <= 0.75` (moderate model fidelity): radius held.
    HoldModerate,
    /// `rho < 0.25` on an accepted step (descent above the noise floor): the
    /// radius is cut to a quarter for a more conservative next cycle.
    ShrinkOnMarginalAccept,
    /// Step rejected: the radius is cut to a quarter and additionally capped at
    /// half the proposed step norm, so the next proposal lies strictly inside
    /// the rejected region.
    ShrinkOnRejection,
    /// A shrink (on rejection or marginal accept) left the radius at the
    /// absolute floor. Seeing this persistently is the unambiguous signal of a
    /// degenerate ρ region.
    RejectFloor,
}

impl JointTrustRegionDecision {
    /// Stable snake_case tag for this decision, for use in log lines.
    fn label(&self) -> &'static str {
        match *self {
            JointTrustRegionDecision::GrowAtBoundary => "grow_at_boundary",
            JointTrustRegionDecision::HoldInside => "hold_inside",
            JointTrustRegionDecision::HoldModerate => "hold_moderate",
            JointTrustRegionDecision::ShrinkOnMarginalAccept => "shrink_marginal_accept",
            JointTrustRegionDecision::ShrinkOnRejection => "shrink_reject",
            JointTrustRegionDecision::RejectFloor => "reject_floor",
        }
    }
}

/// Result of one trust-region radius update, as produced by
/// `update_joint_trust_region_radius`.
#[derive(Clone, Copy, Debug)]
struct JointTrustRegionUpdate {
    // Agreement ratio used for the decision: actual / predicted reduction,
    // `1.0` for a noise-level actual reduction, or `-inf` when the predicted
    // reduction is not a finite value above the noise floor.
    rho: f64,
    // New trust radius after the policy was applied and clamped to its bounds.
    radius: f64,
    // Whether the step was accepted.
    accepted: bool,
    // Which policy branch fired (possibly promoted to `RejectFloor`).
    decision: JointTrustRegionDecision,
}

/// Apply the joint trust-region acceptance and radius policy to one step.
///
/// * `old_radius` — trust radius the step was computed under.
/// * `step_norm` — norm of the proposed step (same metric as the radius).
/// * `actual_reduction` / `predicted_reduction` — realized and model-predicted
///   objective decrease.
/// * `objective_scale` — objective magnitude used to size the round-off floor.
///
/// Returns the agreement ratio `rho`, the new radius clamped to
/// `[1e-12, 1e6]`, whether the step is accepted, and which policy branch fired.
fn update_joint_trust_region_radius(
    old_radius: f64,
    step_norm: f64,
    actual_reduction: f64,
    predicted_reduction: f64,
    objective_scale: f64,
) -> JointTrustRegionUpdate {
    const RADIUS_FLOOR: f64 = 1.0e-12;
    const RADIUS_CEILING: f64 = 1.0e6;

    // Round-off floor relative to the objective magnitude. Reductions at this
    // scale have a sign decided by floating-point noise in the log-likelihood
    // evaluation, not by real descent or ascent, so rejecting on that sign
    // would throw away a perfectly converged step. Mirrors the noise-floor
    // handling in src/solver/pirls.rs (see the analogous `noise_floor` block).
    let noise_floor = objective_scale.abs().max(1.0) * 1e-14;
    let model_predicts_descent =
        predicted_reduction.is_finite() && predicted_reduction > noise_floor;

    let rho = if actual_reduction.abs() <= noise_floor {
        // Numerically neutral step: treat as `rho = 1` no matter where
        // `predicted_reduction` sits. Neutrality is keyed on the *actual*
        // reduction on purpose. Keying it on the prediction instead meant a
        // tiny-but-valid Newton step near a flat optimum (prediction marginally
        // above the floor, realized change within it) divided two
        // round-off-level numbers, produced a spurious negative `rho`, got
        // rejected, and ratcheted the radius to the floor — pinning the solve
        // below the small, valid Newton step it needed (gam#797 last-mile; same
        // pinning family observed on clustered bernoulli).
        1.0
    } else if model_predicts_descent {
        actual_reduction / predicted_reduction
    } else {
        f64::NEG_INFINITY
    };
    let accepted = rho.is_finite() && rho > 0.0 && actual_reduction >= -noise_floor;

    // Base policy: candidate radius together with the branch that produced it.
    let (proposed_radius, base_decision) = if !accepted {
        let quartered = old_radius * 0.25;
        let capped = if step_norm.is_finite() && step_norm > 0.0 {
            quartered.min(0.5 * step_norm)
        } else {
            quartered
        };
        (capped, JointTrustRegionDecision::ShrinkOnRejection)
    } else if rho < 0.25 {
        (
            old_radius * 0.25,
            JointTrustRegionDecision::ShrinkOnMarginalAccept,
        )
    } else if rho > 0.75 {
        if step_norm >= 0.99 * old_radius {
            (old_radius * 2.0, JointTrustRegionDecision::GrowAtBoundary)
        } else {
            (old_radius, JointTrustRegionDecision::HoldInside)
        }
    } else {
        (old_radius, JointTrustRegionDecision::HoldModerate)
    };

    // A non-finite or non-positive candidate collapses to the floor before the
    // final clamp.
    let sanitized_radius = if proposed_radius.is_finite() && proposed_radius > 0.0 {
        proposed_radius
    } else {
        RADIUS_FLOOR
    };
    let radius = sanitized_radius.clamp(RADIUS_FLOOR, RADIUS_CEILING);

    // A shrink that lands on the absolute floor is relabelled `RejectFloor`:
    // the stronger "no descent direction exists at this radius" signal. Every
    // other classification passes through unchanged.
    let landed_on_floor = radius <= RADIUS_FLOOR + f64::EPSILON;
    let decision = match base_decision {
        JointTrustRegionDecision::ShrinkOnRejection
        | JointTrustRegionDecision::ShrinkOnMarginalAccept
            if landed_on_floor =>
        {
            JointTrustRegionDecision::RejectFloor
        }
        other => other,
    };

    JointTrustRegionUpdate {
        rho,
        radius,
        accepted,
        decision,
    }
}

/// Round-off tolerance for comparing two objective values: a relative band of
/// `64 · ε` on `1 + |old| + |trial|`, never smaller than `1e-10`.
fn joint_objective_roundoff_slack(old_objective: f64, trial_objective: f64) -> f64 {
    let magnitude = 1.0 + old_objective.abs() + trial_objective.abs();
    let relative_band = 64.0 * f64::EPSILON * magnitude;
    relative_band.max(1.0e-10)
}

// True iff the line search detected a noise-level realized reduction (i.e.
// the trial step neither helped nor hurt the objective beyond round-off)
// AND the local quadratic model agrees that no further descent is available
// within tolerance. `actual_reduction <= 0` is kept (not made sign-symmetric)
// because at rank-deficient optima (σ_min(H) ≲ ε_machine) the outer-gradient
// FD identity requires β trajectories to be CONSISTENT across λ probes —
// accepting positive-noise-level reductions exits the loop one attempt
// earlier than the negative case and decorrelates the null-space drift
// between consecutive REML evaluations. Concretely:
// `outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active`
// at HardPseudo σ_min ~ 1e-10 fails when symmetric. The asymmetric guard
// preserves the spin avoidance for the common (negative-noise) case at
// large scale while leaving the rank-deficient FD identity intact.
fn joint_objective_floor_reached(
    old_objective: f64,
    trial_objective: f64,
    actual_reduction: f64,
    predicted_reduction: f64,
    objective_tol: f64,
) -> bool {
    trial_objective.is_finite()
        && actual_reduction <= 0.0
        && actual_reduction.abs() <= joint_objective_roundoff_slack(old_objective, trial_objective)
        && predicted_reduction.is_finite()
        && predicted_reduction
            <= objective_tol.max(joint_objective_roundoff_slack(
                old_objective,
                trial_objective,
            ))
}

/// Reports whether the joint-Newton proposal already sits at the
/// step-tolerance floor: both inputs are finite and the unclamped Newton
/// step's inf-norm is at most `STEP_FLOOR_CERT_FACTOR × step_tol` (the same
/// round-off band the constrained-stationary certificate uses for "a hair
/// above tol").
///
/// At the floor the iterate is only polishing the KKT residual on a flat
/// objective, so `predicted_reduction = rhs·δ − ½δᵀHδ ≤ 0` reflects the SIGN of
/// two near-equal O(step²) quantities (round-off) rather than a model-invalid
/// descent direction. The preconditioned-descent substitution has to be
/// suppressed there; otherwise it swaps the tiny polishing step for an
/// objective-descent step that throws the KKT residual off the near-converged
/// iterate (gam#787 binary matern centers=12: residual 1.7e-4 → 4.7e-1, never
/// recovers).
fn joint_proposal_at_step_floor(proposal_step_inf: f64, step_tol: f64) -> bool {
    const STEP_FLOOR_CERT_FACTOR: f64 = 4.0;
    let inputs_are_finite = proposal_step_inf.is_finite() && step_tol.is_finite();
    if !inputs_are_finite {
        return false;
    }
    let floor_band = STEP_FLOOR_CERT_FACTOR * step_tol;
    proposal_step_inf <= floor_band
}

/// Norm of `delta` in the diagonal metric `metric_diag`, for owned arrays.
///
/// Panics if the two arrays differ in length.
fn joint_trust_region_metric_step_norm(delta: &Array1<f64>, metric_diag: &Array1<f64>) -> f64 {
    assert_eq!(delta.len(), metric_diag.len());
    let step = delta.view();
    let weights = metric_diag.view();
    joint_trust_region_metric_step_norm_view(step, weights)
}

/// Norm of `delta` in the diagonal metric `metric_diag`:
/// `sqrt(Σ_i delta_i² · m_i)`, where each weight `m_i` is first passed through
/// `positive_joint_diagonal_entry`.
///
/// Panics if the two views differ in length.
fn joint_trust_region_metric_step_norm_view(
    delta: ArrayView1<f64>,
    metric_diag: ArrayView1<f64>,
) -> f64 {
    assert_eq!(delta.len(), metric_diag.len());
    let weighted_square =
        |(step, weight): (&f64, &f64)| step * step * positive_joint_diagonal_entry(*weight);
    let squared_norm: f64 = delta
        .iter()
        .zip(metric_diag.iter())
        .map(weighted_square)
        .sum();
    squared_norm.sqrt()
}

/// Metric norm of each block of `delta`, one entry per `(start, end)` range,
/// in the order the ranges are given.
///
/// Panics if `delta` and `metric_diag` differ in length.
fn joint_trust_region_block_metric_norms(
    delta: &Array1<f64>,
    ranges: &[(usize, usize)],
    metric_diag: &Array1<f64>,
) -> Vec<f64> {
    assert_eq!(delta.len(), metric_diag.len());
    let mut norms = Vec::with_capacity(ranges.len());
    for &(start, end) in ranges {
        let step_block = delta.slice(s![start..end]);
        let metric_block = metric_diag.slice(s![start..end]);
        norms.push(joint_trust_region_metric_step_norm_view(
            step_block,
            metric_block,
        ));
    }
    norms
}

/// Scale each block of `delta` back onto its trust radius (in the diagonal
/// metric) when it overshoots, and return the resulting per-block norms.
///
/// A block is rescaled only when its norm is finite, exceeds the radius, and
/// the radius is positive; the reported norm is then exactly the radius. Any
/// other block is left untouched and its measured norm is reported as is.
///
/// Panics if `ranges`/`block_radii` or `delta`/`metric_diag` differ in length.
fn truncate_joint_step_to_block_metric_radii(
    delta: &mut Array1<f64>,
    ranges: &[(usize, usize)],
    metric_diag: &Array1<f64>,
    block_radii: &[f64],
) -> Vec<f64> {
    assert_eq!(ranges.len(), block_radii.len());
    assert_eq!(delta.len(), metric_diag.len());
    ranges
        .iter()
        .zip(block_radii.iter())
        .map(|(&(start, end), &radius)| {
            let weights = metric_diag.slice(s![start..end]);
            let mut step = delta.slice_mut(s![start..end]);
            let norm = joint_trust_region_metric_step_norm_view(step.view(), weights);
            let overshoots = norm.is_finite() && norm > radius && radius > 0.0;
            if !overshoots {
                return norm;
            }
            let shrink = radius / norm;
            step.mapv_inplace(|v| v * shrink);
            radius
        })
        .collect()
}

/// Whether a block step counts as having reached its trust boundary: the step
/// norm is finite, the radius is positive, and the norm is at least 99% of the
/// radius.
fn joint_block_step_hit_trust_boundary(step_norm: f64, radius: f64) -> bool {
    let norm_is_finite = step_norm.is_finite();
    let radius_is_positive = radius > 0.0;
    let boundary_threshold = 0.99 * radius;
    norm_is_finite && radius_is_positive && step_norm >= boundary_threshold
}

/// Per-block dogleg step (Powell, blending the Cauchy and Newton points within
/// the block's M-metric trust radius). This is the principled globalization for
/// the coupled location-scale inner Newton (gam#826/#808): box-truncating the
/// Newton step alone freezes progress when the spectral solve is degenerate at
/// the oversmoothed seed — the high-curvature `log_sigma` block has
/// `λ ~ exp(2·ρ_bound)` so its Newton component is `O(g/λ) ≈ 5e-21`, the
/// mean/trend blocks get isotropically shrunk to the radius, and the residual
/// stalls while β barely moves. The dogleg always includes the Cauchy leg
/// (the model-minimizing steepest-descent step in the block metric), so the
/// realized decrease is at least the Cauchy decrease whenever the block
/// gradient is nonzero — progress is guaranteed even when the Newton step is
/// numerically frozen. Inside the radius the dogleg returns the exact Newton
/// step, so the converged β, the KKT certificate, and the well-conditioned /
/// #729 endgame are byte-identical to the undamped solve.
///
/// Inputs per block `b`:
///   * `newton[start..end]`  — Newton (spectral) step block `δ_N`.
///   * `cauchy[start..end]`  — the FULL (unconstrained) Cauchy block
///     `δ_C = τ·p_sd`, where `p_sd = M⁻¹·rhs` is the M-metric steepest-descent
///     direction of the model and `τ` minimizes the model along it; precomputed
///     once per cycle by `joint_cauchy_step` (the curvature `p_sd·H·p_sd` needs
///     a coupled Hessian-vector product, so it must be hoisted out of the
///     radius-shrink loop).
///   * `radius`              — the block's current M-metric trust radius.
///
/// Returns the block step norms in the M-metric (same contract as
/// `truncate_joint_step_to_block_metric_radii`) and overwrites `out` with the
/// dogleg blend per block.
///
/// Non-finite inputs never leak into `out` through a zero scale factor: a block
/// whose Newton norm is non-finite falls back to the Cauchy leg when that leg is
/// usable, and to an all-zero block (reported norm `0.0`) when neither leg is
/// usable.
///
/// Panics if `ranges`/`block_radii` differ in length or if `newton`, `cauchy`,
/// or `out` differ in length from `metric_diag`.
fn joint_dogleg_step_to_block_metric_radii(
    newton: &Array1<f64>,
    cauchy: &Array1<f64>,
    ranges: &[(usize, usize)],
    metric_diag: &Array1<f64>,
    block_radii: &[f64],
    out: &mut Array1<f64>,
) -> Vec<f64> {
    assert_eq!(ranges.len(), block_radii.len());
    assert_eq!(newton.len(), metric_diag.len());
    assert_eq!(cauchy.len(), metric_diag.len());
    assert_eq!(out.len(), metric_diag.len());
    let mut norms = Vec::with_capacity(ranges.len());
    for (block_idx, (start, end)) in ranges.iter().copied().enumerate() {
        let metric_view = metric_diag.slice(s![start..end]);
        let newton_b = newton.slice(s![start..end]);
        let cauchy_b = cauchy.slice(s![start..end]);
        let radius = block_radii[block_idx];
        let newton_norm = joint_trust_region_metric_step_norm_view(newton_b, metric_view);
        let cauchy_norm = joint_trust_region_metric_step_norm_view(cauchy_b, metric_view);
        let mut out_b = out.slice_mut(s![start..end]);

        // Degenerate radius (non-finite or non-positive): nothing moves.
        if !radius.is_finite() || radius <= 0.0 {
            out_b.fill(0.0);
            norms.push(0.0);
            continue;
        }

        // Newton step inside the radius: take the exact Newton step. This is
        // the only branch a well-conditioned / converging fit ever reaches near
        // the optimum, so the endgame numerics are unchanged.
        let newton_usable = newton_norm.is_finite();
        if newton_usable && newton_norm <= radius {
            out_b.assign(&newton_b);
            norms.push(newton_norm);
            continue;
        }

        // Cauchy step unusable (zero or non-finite): fall back to scaling the
        // Newton step to the boundary (pre-dogleg behavior). When the Newton
        // step is unusable too, emit an explicit zero block: multiplying a
        // non-finite Newton block by a zero scale would write NaN into `out`
        // while the reported norm claims the block did not move.
        let cauchy_usable = cauchy_norm.is_finite() && cauchy_norm > 0.0;
        if !cauchy_usable {
            let scale = if newton_usable && newton_norm > 0.0 {
                radius / newton_norm
            } else {
                0.0
            };
            if scale > 0.0 {
                out_b.assign(&newton_b);
                out_b.mapv_inplace(|v| v * scale);
                norms.push(radius);
            } else {
                out_b.fill(0.0);
                norms.push(0.0);
            }
            continue;
        }

        // Cauchy leg at or beyond the radius: scale the Cauchy step to the
        // boundary. This branch does not read the Newton block, so it is also
        // the fallback for a non-finite Newton step.
        if cauchy_norm >= radius {
            let scale = radius / cauchy_norm;
            out_b.assign(&cauchy_b);
            out_b.mapv_inplace(|v| v * scale);
            norms.push(radius);
            continue;
        }

        // Cauchy leg strictly inside the radius but the Newton block is
        // non-finite: there is no second leg to walk along, so take the Cauchy
        // point itself. Blending would evaluate `δ_C + 0·(δ_N − δ_C)` with a
        // non-finite `δ_N` and poison the step with NaN.
        if !newton_usable {
            out_b.assign(&cauchy_b);
            norms.push(cauchy_norm);
            continue;
        }

        // Dogleg blend: δ(θ) = δ_C + θ·(δ_N − δ_C), θ ∈ [0,1], pick θ so
        // ‖δ(θ)‖_M = radius. Solve the quadratic ‖δ_C + θ·d‖²_M = radius² with
        // d = δ_N − δ_C, a = ‖d‖²_M, b = 2·⟨δ_C, d⟩_M, c = ‖δ_C‖²_M − radius².
        let mut a = 0.0_f64;
        let mut b = 0.0_f64;
        for ((cb, nb), w) in cauchy_b.iter().zip(newton_b.iter()).zip(metric_view.iter()) {
            let m = positive_joint_diagonal_entry(*w);
            let d = nb - cb;
            a += m * d * d;
            b += 2.0 * m * cb * d;
        }
        let c = cauchy_norm * cauchy_norm - radius * radius;
        // a > 0 because δ_N ≠ δ_C here (Newton outside, Cauchy inside the
        // radius). Largest root in [0,1] keeps the step on the dogleg path.
        // A non-finite root (overflow in the coefficients) degrades to θ = 0,
        // i.e. the Cauchy point, instead of propagating NaN through `clamp`.
        let disc = (b * b - 4.0 * a * c).max(0.0);
        let theta = if a > 0.0 {
            let root = (-b + disc.sqrt()) / (2.0 * a);
            if root.is_finite() {
                root.clamp(0.0, 1.0)
            } else {
                0.0
            }
        } else {
            0.0
        };
        for ((o, cb), nb) in out_b.iter_mut().zip(cauchy_b.iter()).zip(newton_b.iter()) {
            *o = cb + theta * (nb - cb);
        }
        let norm = joint_trust_region_metric_step_norm_view(out_b.view(), metric_view);
        norms.push(norm);
    }
    norms
}

/// Cauchy point of the joint penalized quadratic model
/// `m(δ) = −rhs·δ + ½·δᵀHδ` along the M-metric steepest-descent direction,
/// ignoring any trust-region bound.
///
/// # Arguments
/// * `rhs` — right-hand side of the model (its negative gradient at `δ = 0`).
/// * `p_sd` — steepest-descent direction in the block-diagonal M-metric; the
///   caller is expected to pass `M⁻¹·rhs`.
/// * `h_psd` — `H_pen·p_sd`, formed with the SAME penalized (and
///   Firth-augmented, when armed) Hessian the trust-region model uses, so the
///   dogleg path stays consistent with the accept/reject quadratic.
///
/// # Returns
/// * the zero vector when `rhs·p_sd` is non-finite or not strictly positive
///   (`p_sd` offers no descent), which lets the dogleg fall back to its
///   Newton leg;
/// * `τ·p_sd` with `τ = (rhs·p_sd)/(p_sd·H·p_sd)` when the curvature and `τ`
///   are both finite and strictly positive (the minimizer along `p_sd`);
/// * `p_sd` unscaled otherwise — with non-positive curvature the model is
///   unbounded below along `p_sd`, and the dogleg's boundary scaling carries
///   the step out to the trust radius.
fn joint_cauchy_step(rhs: &Array1<f64>, p_sd: &Array1<f64>, h_psd: &Array1<f64>) -> Array1<f64> {
    let slope = rhs.dot(p_sd);
    let descent_available = slope.is_finite() && slope > 0.0;
    if !descent_available {
        return Array1::zeros(p_sd.len());
    }

    // Exact line minimizer along `p_sd`, available only for usable positive
    // curvature and a usable positive step length.
    let line_minimizer = Some(p_sd.dot(h_psd))
        .filter(|kappa| kappa.is_finite() && *kappa > 0.0)
        .map(|kappa| slope / kappa)
        .filter(|tau| tau.is_finite() && *tau > 0.0);

    match line_minimizer {
        Some(tau) => p_sd.mapv(|v| tau * v),
        // Flat / indefinite / degenerate curvature: hand back the raw
        // direction and let the caller scale it to the trust boundary.
        None => p_sd.clone(),
    }
}

fn shrink_active_joint_block_trust_radii(
    block_radii: &mut [f64],
    block_step_norms: &[f64],
    factor: f64,
) -> f64 {
    assert_eq!(block_radii.len(), block_step_norms.len());
    // Joint-Newton step-rejection radius shrink. Must guarantee strict
    // monotone decrease of `max(block_radii)` until the floor, otherwise the
    // next trust-region attempt computes a step byte-identical to the rejected
    // one and the inner loop stalls forever (gam joint-Newton fully-rejected
    // cycles, root cause behind the 8-cycle bail at FULLY_REJECTED_STALL_MAX_CYCLES).
    //
    // Two cooperating mechanisms:
    //   * For every block that participates in the shrink, the new radius is
    //     pulled below the rejected step's magnitude (`0.5 · step_norm`),
    //     matching the analogous clamp in `update_joint_trust_region_radius`'s
    //     reject branch. This forces the next step to be strictly smaller
    //     than the current one even when `radius * factor` is still larger
    //     than `step_norm` (which happens whenever the dogleg/truncate path
    //     returned a Newton step shorter than the block's radius).
    //   * Block participation: by default only shrink blocks whose step hit
    //     the per-block trust boundary (the boundary block was the one the
    //     trust radius actually constrained — interior blocks took their
    //     natural Newton step and shrinking their radius is wasted). BUT when
    //     every boundary block already sits at the 1e-12 floor, further
    //     shrinking those blocks is a no-op (they'd just re-clamp to the
    //     floor), so we *must* shrink the interior blocks instead to actually
    //     change the joint step. Without this carve-out the deadlock was:
    //     boundary block pinned at 1e-12, interior block radius held at its
    //     pre-stall value, `max(block_radii)` held by the interior block, the
    //     dogleg/truncate produces an identical joint δ every cycle, every
    //     trust attempt rejects on the same objective check, the cycle burns
    //     to `inner_loop_hard_ceiling` (1200) cycles wasting ~120 s per
    //     outer ρ-evaluation — the Rust CI Test hang and the
    //     `rust_margslope_aniso_duchon16d_*` large-scale 2400 s timeout.
    const RADIUS_FLOOR: f64 = 1.0e-12;
    let any_boundary_block = block_radii
        .iter()
        .zip(block_step_norms)
        .any(|(radius, step_norm)| joint_block_step_hit_trust_boundary(*step_norm, *radius));
    let all_boundary_blocks_at_floor = any_boundary_block
        && block_radii
            .iter()
            .zip(block_step_norms)
            .filter(|(radius, step_norm)| {
                joint_block_step_hit_trust_boundary(**step_norm, **radius)
            })
            .all(|(radius, _)| *radius <= RADIUS_FLOOR * (1.0 + 1.0e-12));
    for (radius, step_norm) in block_radii.iter_mut().zip(block_step_norms) {
        let at_boundary = joint_block_step_hit_trust_boundary(*step_norm, *radius);
        let participates = if all_boundary_blocks_at_floor {
            // Boundary-at-floor stall: the boundary blocks cannot shrink any
            // further, so participate every block (including interior ones)
            // so the joint step magnitude actually changes.
            true
        } else if any_boundary_block {
            at_boundary
        } else {
            true
        };
        if participates {
            let mut new_radius = *radius * factor;
            if step_norm.is_finite() && *step_norm > 0.0 {
                new_radius = new_radius.min(0.5 * *step_norm);
            }
            *radius = new_radius.clamp(RADIUS_FLOOR, 1.0e6);
        }
    }
    block_radii.iter().copied().fold(0.0_f64, f64::max)
}

/// Enforces per-block feasibility on a joint trial step by scaling the WHOLE
/// step with the tightest block limit.
///
/// Each block in `ranges` (half-open `(start, end)` index pairs into
/// `trial_delta`) is asked, via `family.max_feasible_step_size`, for the
/// largest feasible multiple of its slice of the step. The minimum of those
/// limits — capped at `1.0` — is applied to every entry of `trial_delta`.
///
/// # Returns
/// * `Ok(true)` when the step was scaled down (some block returned a limit
///   below `1.0`);
/// * `Ok(false)` when the step was left untouched;
/// * `Err(_)` when the family callback fails, or when a block reports a
///   limit that is non-finite or not strictly positive.
///
/// # Why the joint step, not each block
/// The joint Newton direction δ̂ = H⁻¹(−g) is a descent direction of the
/// local quadratic model, and so is α·δ̂ for any α ∈ (0, 1]. Scaling a single
/// block gives (α·δ̂_A, δ̂_B, …), which is neither δ̂ nor α·δ̂ and is not in
/// general a descent direction of the joint quadratic. The production
/// survival_marginal_slope failure at large scale was exactly this: the time
/// block returned α ≈ 1e-4 (monotonicity guard), per-block scaling crushed
/// δ_time to ~2.3e-4 while logslope kept its full Newton step, and the
/// time-block gradient stayed at ‖g_time‖ ≈ 5.6e8 for 15+ cycles, tripping
/// the linearized-rate stall early-exit on every outer seed. Step length
/// along the direction is already chosen by the trust region / line search;
/// this check only layers feasibility on top.
fn apply_joint_feasibility_limit<F: CustomFamily + ?Sized>(
    family: &F,
    states: &[ParameterBlockState],
    ranges: &[(usize, usize)],
    trial_delta: &mut Array1<f64>,
) -> Result<bool, String> {
    // Tightest limit seen so far, together with the block that imposed it.
    // `None` means no block has asked for less than the full step.
    let mut binding: Option<(usize, f64)> = None;

    for (block_idx, &(start, end)) in ranges.iter().enumerate() {
        let block_step = trial_delta.slice(s![start..end]).to_owned();
        let Some(alpha_max) = family.max_feasible_step_size(states, block_idx, &block_step)?
        else {
            // This block imposes no feasibility limit.
            continue;
        };
        if !alpha_max.is_finite() || alpha_max <= 0.0 {
            return Err(format!(
                "joint Newton block {block_idx} has no positive feasible step"
            ));
        }
        let current_limit = binding.map_or(1.0_f64, |(_, alpha)| alpha);
        if alpha_max < current_limit {
            binding = Some((block_idx, alpha_max));
        }
    }

    let Some((limiting_block, joint_alpha)) = binding else {
        return Ok(false);
    };
    trial_delta.mapv_inplace(|v| joint_alpha * v);
    log::debug!(
        "[PIRLS/joint-Newton] feasibility scaled joint step by α={:.3e} (block {:?} binding)",
        joint_alpha,
        Some(limiting_block),
    );
    Ok(true)
}

/// Returns `true` when the inner KKT residual certifies convergence: both
/// `residual` and `residual_tol` are finite and `residual <= residual_tol`.
/// Any NaN or infinite input yields `false`.
fn joint_inner_kkt_converged(residual: f64, residual_tol: f64) -> bool {
    if !residual.is_finite() || !residual_tol.is_finite() {
        return false;
    }
    residual <= residual_tol
}

/// Per-iterate diagnostic snapshot assembled when the joint Newton inner solve
/// refuses to certify constrained-stationarity. The report breaks the failure
/// down by block (so the offending smooth can be named), records the H_pen
/// eigenvalue spectrum (so rank-deficiency in the penalized Hessian is
/// detectable from logs), and classifies the refusal so downstream tooling
/// can act without re-deriving the cert math.
///
/// NOTE(review): the code that fills this struct is outside this excerpt, so
/// the group comments below describe the fields only as far as their names
/// and types allow — confirm details against the constructor.
#[derive(Clone, Debug)]
struct KktRefusalReport {
    // Per-block breakdown. Presumably parallel vectors with one entry per
    // parameter block, plus the index of the block carrying the residual
    // (if one was singled out) — TODO confirm.
    block_names: Vec<String>,
    block_widths: Vec<usize>,
    block_beta_inf: Vec<f64>,
    block_grad_inf: Vec<f64>,
    block_penalty_grad_inf: Vec<f64>,
    block_residual_inf: Vec<f64>,
    block_carrying_residual: Option<usize>,

    // Spectrum of the penalized Hessian H_pen and its near-null structure at
    // `hpen_rank_tol`. Presumably the tolerance is `KKT_REFUSAL_RANK_TOL` —
    // TODO confirm.
    hpen_eigenvalues_sorted_desc: Vec<f64>,
    hpen_min_abs_eigenvalue: f64,
    hpen_max_abs_eigenvalue: f64,
    hpen_condition_number: f64,
    hpen_nullity_at_rank_tol: usize,
    hpen_rank_tol: f64,
    hpen_null_gradient_inf: f64,
    hpen_null_vector_block_inf: Vec<f64>,
    hpen_null_vector_carrying_block: Option<usize>,

    // Solver state at the refusing iterate (active-set size, step sizes,
    // trust radius, inner cycle counter).
    active_set_rows_total: usize,
    accepted_step_inf: f64,
    proposal_step_inf: f64,
    trust_radius: f64,
    cycle: usize,

    // Convergence tolerances in force when the certificate refused.
    residual_tol: f64,
    obj_tol: f64,
    step_tol: f64,

    // Scalar progress / model-quality measures at the refusing iterate.
    linearized_rel: f64,
    scalar_model_relerr: f64,
    objective_change: f64,
    projected_residual_inf: f64,

    // Classification of the refusal; see `KktRefusalDiagnosis`.
    diagnosis: KktRefusalDiagnosis,
}

/// Classification of why the KKT certificate refused, computed from the
/// H_pen spectrum and the projected residual at the refusing iterate.
///
/// `RankDeficientHPen` is the regression canary the nullspace lead's
/// smooth-construction rework is intended to eliminate; keep that variant
/// intact when extending — it doubles as the user-facing signal for
/// "an unconstrained polynomial null space slipped past absorption."
///
/// Each variant has a stable textual label ([`Self::as_str`]) that is embedded
/// in bubbled error strings and recovered by [`Self::parse_from_error`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum KktRefusalDiagnosis {
    /// Label `rank_deficient_H_pen`.
    RankDeficientHPen,
    /// Label `phantom_multiplier_with_well_conditioned_H`.
    PhantomMultiplierWithWellConditionedH,
    /// Label `active_set_incomplete`.
    ActiveSetIncomplete,
    /// Label `aliasing_detected_at_fit`.
    ///
    /// Cross-block identifiability aliasing surfaced mid-inner-solve
    /// (e.g., a binding active set materialised a 2-way alias that
    /// the pre-fit audit could not see at the cold design). The fix
    /// is structural — drop or reparameterise the aliased block;
    /// rho-anneal will not recover.
    AliasingDetectedAtFit,
}

impl KktRefusalDiagnosis {
    /// Every variant, in declaration order. Label parsing walks this table so
    /// the label text lives in exactly one place (`as_str`).
    const ALL: [Self; 4] = [
        Self::RankDeficientHPen,
        Self::PhantomMultiplierWithWellConditionedH,
        Self::ActiveSetIncomplete,
        Self::AliasingDetectedAtFit,
    ];

    /// Stable machine-readable label for this diagnosis.
    pub(crate) fn as_str(&self) -> &'static str {
        match *self {
            Self::RankDeficientHPen => "rank_deficient_H_pen",
            Self::PhantomMultiplierWithWellConditionedH => {
                "phantom_multiplier_with_well_conditioned_H"
            }
            Self::ActiveSetIncomplete => "active_set_incomplete",
            Self::AliasingDetectedAtFit => "aliasing_detected_at_fit",
        }
    }

    /// Recovers the diagnosis from the `diagnosis: <label>` field of a
    /// structured bubbled error string.
    ///
    /// The LAST `diagnosis: ` marker in `message` is used; the label runs up
    /// to the next `;`, newline, or space (or the end of the string). Returns
    /// `None` when the marker is absent or the label is not one produced by
    /// [`Self::as_str`] (legacy / non-cert-refusal error strings).
    pub(crate) fn parse_from_error(message: &str) -> Option<Self> {
        let (_, after_marker) = message.rsplit_once("diagnosis: ")?;
        let label = after_marker
            .split(|c: char| matches!(c, ';' | '\n' | ' '))
            .next()
            .unwrap_or(after_marker);
        Self::ALL
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == label)
    }

    /// Human-readable hint on what to investigate for this diagnosis.
    fn guidance(self) -> &'static str {
        match self {
            Self::RankDeficientHPen => {
                "check whether the named block has a structural or numerical null direction \
                 not identified by the likelihood/penalty combination; for Duchon-style \
                 smooths this may be a polynomial null space, while marginal-slope fits can \
                 also expose callback-owned weak directions"
            }
            Self::PhantomMultiplierWithWellConditionedH => {
                "check whether the named block has a near-separated or weakly identified \
                 direction despite a well-conditioned penalized Hessian; in marginal-slope \
                 fits this often indicates marginal/logslope coupling rather than a \
                 Matérn/Duchon polynomial-nullspace failure"
            }
            Self::ActiveSetIncomplete => {
                "check whether the named block's linear constraints need an additional \
                 active row or a tighter constrained re-solve; this is an active-set \
                 certification failure, not a polynomial-nullspace diagnosis"
            }
            Self::AliasingDetectedAtFit => {
                "check whether the named block aliases another block after runtime \
                 constraints or callbacks materialize; drop or reparameterize the aliased \
                 direction before fitting"
            }
        }
    }
}

/// Relative rank tolerance applied to `|λ|/λ_max` when counting the
/// nullity of `H_pen`. Matches the threshold the surrounding REML
/// penalty-rank machinery uses for "structurally zero".
///
/// This is also the value `WhitenedHessianSpectrum::assemble` records in
/// `JointSpectralNewtonStep::rank_tol`, independent of the `rank_tol`
/// argument that was passed to `WhitenedHessianSpectrum::decompose`.
const KKT_REFUSAL_RANK_TOL: f64 = 1e-10;

/// Self-vanishing Levenberg–Marquardt damping factor for the range-restricted
/// spectral Newton step (`solve_joint_newton_step_on_spectral_range`).
///
/// # How it is used
/// The caller forms the residual-scaled magnitude
/// `μ = JOINT_SPECTRAL_LEVENBERG_FACTOR · ‖∇L − Sβ‖∞`, which the solve converts
/// to a DIMENSIONLESS, scale-invariant Marquardt damping `ν = μ / λ_max` applied
/// MULTIPLICATIVELY to each range curvature (`curvature·(1 + ν)`), not added
/// (`curvature + μ`).
///
/// # Why multiplicative
/// The multiplicative form is essential on a coupled location-scale joint
/// Hessian whose spectrum spans the penalty scale (`λ ~ e²⁴` at the
/// oversmoothed seed) and the likelihood scale (the mean/wiggle XᵀWX
/// curvature): an ADDITIVE μ — set by the penalty-inflated residual — swamps
/// the small likelihood curvature and freezes that block (#826), whereas the
/// multiplicative `1/(1+ν)` throttle is identical across all scales so no
/// block stalls.
///
/// # What both forms share
/// Both forms cap the unbounded `component/λ` step along near-singular
/// (ill-conditioned but above-`KKT_REFUSAL_RANK_TOL`) eigen-directions — the
/// modes that make the undamped step oscillate — and both vanish as the
/// iterate converges (`ν → 0`), recovering the exact Moore–Penrose Newton step
/// so the KKT fixed point and the well-identified fast path are unchanged.
///
/// # Choice of value
/// `1e-3` keeps the damping two to three orders below the dominant curvature
/// on a well-conditioned problem.
const JOINT_SPECTRAL_LEVENBERG_FACTOR: f64 = 1.0e-3;

/// A joint-Newton step computed in a spectral (eigen) basis, bundled with the
/// spectrum diagnostics gathered while building it.
///
/// NOTE(review): the per-field notes below are read off
/// `WhitenedHessianSpectrum::assemble`. Other code that constructs this
/// struct lies outside this excerpt — confirm it follows the same
/// conventions.
#[derive(Clone, Debug)]
struct JointSpectralNewtonStep {
    /// The step in the original coordinates (`δ = D^{-1/2} η` in the whitened
    /// trust-region path).
    delta: Array1<f64>,
    /// Largest `|c_k|` over identified modes (those above the null cutoff),
    /// where `c_k` is the right-hand side in the eigenbasis.
    range_rhs_inf: f64,
    /// Largest `|c_k|` over the modes treated as null space; these modes
    /// contribute nothing to `delta`.
    null_rhs_inf: f64,
    /// `max_k |γ_k|` over the whole spectrum.
    lambda_max_abs: f64,
    /// Smallest non-negative identified eigenvalue; `f64::INFINITY` when no
    /// identified mode is non-negative.
    lambda_min_positive: f64,
    /// Number of modes at or below the null cutoff.
    nullity: usize,
    /// Relative rank tolerance reported with the step
    /// (`KKT_REFUSAL_RANK_TOL` in the whitened trust-region path).
    rank_tol: f64,
    /// Number of eigen-directions whose curvature was negative (beyond the
    /// rank cutoff) and was reflected to `|λ|` to form a modified-Newton
    /// descent step. Zero for a genuinely positive-semidefinite model.
    ///
    /// NOTE(review): `WhitenedHessianSpectrum::assemble` counts negative
    /// identified modes here but divides by `γ_k + λ` rather than `|γ_k|`, so
    /// on that path this is a count of negative modes, not of reflections.
    reflected_negative_modes: usize,
    /// Most negative eigenvalue encountered (≤ 0); `0.0` when the model was
    /// positive-semidefinite within the rank cutoff.
    most_negative_eigenvalue: f64,
}

/// Production home for the exact trust-region engine ([`WhitenedHessianSpectrum`]),
/// wired into the unconstrained dense-spectral joint-Newton step in
/// `inner_blockwise_fit` (gam#979). Kept in its own module so the engine's
/// helpers stay namespaced; the parent reaches it via `whitened_spectrum::`.
mod whitened_spectrum {
    use super::*;

    /// Eigendecomposition of the metric-whitened penalized Hessian, retained so
    /// every trust-radius shrink within one Newton cycle re-solves the
    /// trust-region subproblem from the SAME `O(p³)` factorization at `O(p)` cost.
    ///
    /// # Why this exists (gam#979)
    ///
    /// The coupled marginal↔logslope inner Newton needs ONE globalization, not a
    /// stack of approximations. Historically the joint step was a *modified-Newton*
    /// (reflect indefinite eigenvalues to `|λ|`) wrapped in a *heuristically gated*
    /// multiplicative Marquardt damping (engaged on `nullity>0`, or condition number
    /// over a threshold, or after N non-improving cycles) and then a *dogleg* between
    /// that step and the Cauchy point, truncated to per-block step-norm trust radii.
    /// Each piece approximates a different facet of the one exact object below, and
    /// each had to be gated so it would not re-break the case another piece was added
    /// for (#826 vs #808 vs #733/#734 vs #787). When none of the gates matched the
    /// operating point — well-conditioned `H_pen`, yet a coupled near-aliased
    /// direction with a huge raw Newton component — the truncated direction made only
    /// Cauchy-sized progress, the gain ratio never justified growing the radius, and
    /// the residual crawled for hundreds of cycles (the #979 "phantom multiplier"
    /// grind / survival hang).
    ///
    /// [`Self::trust_region_step`] replaces all of that with the *exact* solution of
    /// the trust-region subproblem
    ///   minimize  `−rhsᵀδ + ½ δᵀ H_pen δ`   subject to  `‖δ‖_D ≤ r`,
    /// via the Moré–Sorensen characterization: the minimizer is `δ(λ) = (H_pen +
    /// λD)⁻¹ rhs` for the unique `λ ≥ max(0, −γ_min)` with `‖δ(λ)‖_D = r` (or `λ = 0`
    /// when the Newton step is interior and `H_pen ≻ 0`). Working in the `D`-metric
    /// generalized eigenbasis this is a scalar secular equation in `λ`, solved by a
    /// safeguarded Newton iteration on the already-computed spectrum. Properties that
    /// make it the right object:
    ///   * indefiniteness is handled exactly (`λ ≥ −γ_min` makes `H_pen+λD ⪰ 0` on
    ///     the boundary — no reflection heuristic, no negative-curvature special case
    ///     other than the rigorous hard case);
    ///   * the damping `λ` is determined by the trust radius, not by nullity /
    ///     condition / stall gates — those gates disappear;
    ///   * it self-vanishes: at the KKT fixed point `rhs → 0 ⇒ δ → 0`, and once the
    ///     iterate is in a region where `H_pen ≻ 0` the Newton step goes interior so
    ///     `λ = 0` and convergence is quadratic — the converged β, the KKT
    ///     certificate, and the REML/LAML the residual feeds are byte-identical to an
    ///     undamped exact-Newton solve;
    ///   * it is affine covariant in the `D` metric, so blocks at wildly different
    ///     curvature scales (the penalty `λ ~ e²⁴` modes vs the `XᵀWX` likelihood
    ///     modes at an oversmoothed seed) are damped uniformly by `1/(γ_k+λ)` — the
    ///     scale-invariance the per-block radii and the multiplicative-Marquardt form
    ///     were each hand-built to approximate.
    ///
    /// The genuine penalty null space (`|γ_k| ≤ null_cutoff`) is still projected out
    /// (the gam#553 Moore–Penrose range restriction): an unidentified gauge direction
    /// carries no finite Newton step and is left unchanged, its stationarity-residual
    /// component reported via [`JointSpectralNewtonStep::null_rhs_inf`].
    pub(super) struct WhitenedHessianSpectrum {
        /// Generalized eigenvalues `γ_k` of `(H_pen, D)` = eigenvalues of the
        /// whitened matrix `A = D^{-1/2} H_pen D^{-1/2}`, as returned by
        /// `FaerEigh::eigh` in `decompose`.
        gamma: Array1<f64>,
        /// Whitened eigenvectors `v_k` (columns) of `A`; column `k` pairs with
        /// `gamma[k]`.
        evecs: Array2<f64>,
        /// rhs in the whitened eigenbasis: `c_k = v_kᵀ D^{-1/2} rhs`.
        c: Array1<f64>,
        /// `D^{-1/2}` diagonal, mapping a whitened step `η` back to `δ = D^{-1/2} η`.
        /// Each entry is `1 / sqrt(positive_joint_diagonal_entry(D_ii))`.
        d_inv_sqrt: Array1<f64>,
        /// `max_k |γ_k|` (the curvature scale; `D`-whitened). `0.0` for an
        /// empty spectrum.
        lambda_max_abs: f64,
        /// Curvature magnitude at/below which a direction is treated as genuinely
        /// unidentified (penalty null space) and dropped from the step.
        /// `decompose` sets it to the SMALLER of `rank_tol · λ_max` and the
        /// numerical floor `√p · ε · λ_max` (`ε = f64::EPSILON`).
        null_cutoff: f64,
    }

    impl WhitenedHessianSpectrum {
        /// Builds the spectrum of the `D`-whitened penalized Hessian
        /// `A = D^{-1/2} H_pen D^{-1/2}` with a single eigendecomposition.
        ///
        /// # Arguments
        /// * `h_pen` — square penalized Hessian (`p × p`).
        /// * `rhs` — right-hand side of the Newton system (length `p`).
        /// * `metric_diag` — diagonal of the trust-region metric `D` (length
        ///   `p`). Every entry goes through [`positive_joint_diagonal_entry`]
        ///   so a non-positive curvature estimate becomes a safe positive scale.
        /// * `rank_tol` — relative near-singularity cutoff. The null cutoff
        ///   actually stored is the smaller of `rank_tol · λ_max` and the
        ///   numerical-rank floor `√p · ε · λ_max`, derived from the whitened
        ///   spectrum exactly as the legacy spectral solve did.
        ///
        /// # Errors
        /// Returns `Err` when the shapes disagree or when the symmetric
        /// eigendecomposition fails.
        pub(super) fn decompose(
            h_pen: &Array2<f64>,
            rhs: &Array1<f64>,
            metric_diag: &Array1<f64>,
            rank_tol: f64,
        ) -> Result<Self, String> {
            let (rows, cols) = h_pen.dim();
            let p = rows;
            let shapes_agree = cols == p && rhs.len() == p && metric_diag.len() == p;
            if !shapes_agree {
                return Err(format!(
                    "whitened trust-region decomposition dimension mismatch: H={}x{}, rhs={}, metric={}",
                    rows,
                    cols,
                    rhs.len(),
                    metric_diag.len()
                ));
            }

            let d_inv_sqrt = metric_diag.mapv(|w| 1.0 / positive_joint_diagonal_entry(w).sqrt());

            // A = D^{-1/2} H D^{-1/2}. Symmetric in exact arithmetic (H symmetric,
            // D diagonal); symmetrized explicitly before the eigensolve.
            let mut whitened = Array2::<f64>::from_shape_fn((p, p), |(i, j)| {
                h_pen[[i, j]] * d_inv_sqrt[i] * d_inv_sqrt[j]
            });
            symmetrize_dense_in_place(&mut whitened);
            let (gamma, evecs) = FaerEigh::eigh(&whitened, Side::Lower)
                .map_err(|e| format!("whitened trust-region eigendecomposition failed: {e}"))?;

            // Right-hand side in the whitened eigenbasis: c = Vᵀ (D^{-1/2} rhs).
            let rhs_whitened = &d_inv_sqrt * rhs;
            let c = evecs.t().dot(&rhs_whitened);

            let lambda_max_abs = gamma
                .iter()
                .fold(0.0_f64, |largest, g| largest.max(g.abs()));
            let numerical_floor = lambda_max_abs * (p as f64).sqrt() * f64::EPSILON;
            let null_cutoff = (rank_tol * lambda_max_abs).min(numerical_floor);

            Ok(Self {
                gamma,
                evecs,
                c,
                d_inv_sqrt,
                lambda_max_abs,
                null_cutoff,
            })
        }

        /// Squared `D`-metric norm of the trial step as a function of the
        /// Levenberg shift: `‖η(λ)‖²_2 = Σ_{identified k} c_k² / (γ_k + λ)²`.
        ///
        /// Modes with `|γ_k| ≤ null_cutoff` are skipped — the null space
        /// carries no step. Returns `f64::INFINITY` as soon as an identified
        /// mode has `|γ_k + λ| ≤ f64::MIN_POSITIVE` (the shift sits on a pole).
        fn step_norm_sq(&self, lambda: f64) -> f64 {
            let mut total = 0.0;
            for (&g, &c_k) in self.gamma.iter().zip(self.c.iter()) {
                let identified = g.abs() > self.null_cutoff;
                if !identified {
                    continue;
                }
                let shifted = g + lambda;
                if shifted.abs() <= f64::MIN_POSITIVE {
                    return f64::INFINITY;
                }
                let component = c_k / shifted;
                total += component * component;
            }
            total
        }

        /// Builds the step for a given Levenberg shift and packages it with
        /// the spectrum diagnostics.
        ///
        /// The whitened step is `η(λ) = Σ c_k/(γ_k+λ) v_k` over identified
        /// modes (`|γ_k| > null_cutoff`); an identified mode whose shifted
        /// curvature satisfies `|γ_k + λ| ≤ f64::MIN_POSITIVE` contributes
        /// nothing. When `extra_min_mode = Some((k_min, τ))` the hard-case
        /// correction `τ·v_{k_min}` is added on top. The result is mapped back
        /// with `δ = D^{-1/2} η`.
        ///
        /// Diagnostics in the returned [`JointSpectralNewtonStep`]:
        /// `range_rhs_inf` / `null_rhs_inf` are the largest `|c_k|` over
        /// identified / null modes, `nullity` counts the null modes,
        /// `lambda_min_positive` is the smallest non-negative identified
        /// eigenvalue (`INFINITY` if none), `reflected_negative_modes` counts
        /// the negative identified eigenvalues and `most_negative_eigenvalue`
        /// is the smallest of them (`0.0` if none). `rank_tol` is always
        /// reported as `KKT_REFUSAL_RANK_TOL`. The field layout is consistent
        /// with the legacy spectral step.
        fn assemble(
            &self,
            lambda: f64,
            extra_min_mode: Option<(usize, f64)>,
        ) -> JointSpectralNewtonStep {
            let p = self.gamma.len();
            let mut eta = Array1::<f64>::zeros(p);
            let mut range_rhs_inf = 0.0_f64;
            let mut null_rhs_inf = 0.0_f64;
            let mut lambda_min_positive = f64::INFINITY;
            let mut nullity = 0usize;
            let mut reflected_negative_modes = 0usize;
            let mut most_negative = 0.0_f64;

            for (k, (&g, &c_k)) in self.gamma.iter().zip(self.c.iter()).enumerate() {
                let rhs_magnitude = c_k.abs();
                if g.abs() <= self.null_cutoff {
                    // Unidentified direction: record its residual, take no step.
                    nullity += 1;
                    null_rhs_inf = null_rhs_inf.max(rhs_magnitude);
                    continue;
                }
                range_rhs_inf = range_rhs_inf.max(rhs_magnitude);
                if g < 0.0 {
                    reflected_negative_modes += 1;
                    most_negative = most_negative.min(g);
                } else {
                    lambda_min_positive = lambda_min_positive.min(g);
                }
                let shifted = g + lambda;
                if shifted.abs() > f64::MIN_POSITIVE {
                    let coeff = c_k / shifted;
                    for (slot, v) in eta.iter_mut().zip(self.evecs.column(k).iter()) {
                        *slot += coeff * v;
                    }
                }
            }

            // Hard case: move along a minimal-curvature eigenvector to reach
            // the trust boundary when rhs has no component there.
            if let Some((k_min, tau)) = extra_min_mode {
                for (slot, v) in eta.iter_mut().zip(self.evecs.column(k_min).iter()) {
                    *slot += tau * v;
                }
            }

            // Back to the original coordinates: δ = D^{-1/2} η.
            let delta = &self.d_inv_sqrt * &eta;
            JointSpectralNewtonStep {
                delta,
                range_rhs_inf,
                null_rhs_inf,
                lambda_max_abs: self.lambda_max_abs,
                lambda_min_positive,
                nullity,
                rank_tol: KKT_REFUSAL_RANK_TOL,
                reflected_negative_modes,
                most_negative_eigenvalue: most_negative,
            }
        }

        /// Exact solution of the trust-region subproblem inside the `D`-metric ball
        /// of radius `trust_radius`. When `trust_radius` is non-finite or `≤ 0` the
        /// unconstrained (Moore–Penrose, range-restricted) Newton step is returned —
        /// i.e. the caller opted out of the trust region.
        ///
        /// # Outcomes, in the order they are tried
        /// 1. No identified mode (every `|γ_k| ≤ null_cutoff`): `assemble(0, None)`,
        ///    whose step is zero because every mode is skipped.
        /// 2. Smallest identified curvature `> 0` and either no trust region or the
        ///    Newton step fits inside it: the exact Newton step (`λ = 0`).
        /// 3. Smallest identified curvature `≤ 0` with no trust region:
        ///    the reflected modified-Newton step from `assemble_reflected`.
        /// 4. Hard case (negligible rhs component on the minimal-curvature modes and
        ///    the step just above the pole is shorter than the radius):
        ///    `λ = λ_lo` plus `τ·v_min` chosen to land on the boundary.
        /// 5. Otherwise: the boundary solution, with `λ` found by a safeguarded
        ///    Newton iteration on the secular equation.
        ///
        /// # Iteration limits
        /// The upper bracket is doubled at most 200 times and the Newton/bisection
        /// loop runs at most 100 iterations; the last `λ` is used if neither
        /// stopping test fires, so the returned step is not guaranteed to sit
        /// exactly on the boundary in that case.
        pub(super) fn trust_region_step(&self, trust_radius: f64) -> JointSpectralNewtonStep {
            // Smallest identified curvature (signed). Empty identified set ⇒ pure
            // null space ⇒ zero step.
            let mut gamma_min_id = f64::INFINITY;
            let mut any_identified = false;
            for k in 0..self.gamma.len() {
                if self.gamma[k].abs() <= self.null_cutoff {
                    continue;
                }
                any_identified = true;
                gamma_min_id = gamma_min_id.min(self.gamma[k]);
            }
            if !any_identified {
                return self.assemble(0.0, None);
            }

            // `true` when the caller passed a radius that disables the trust region
            // (NaN, ±∞, zero or negative).
            let unconstrained_radius = !(trust_radius.is_finite() && trust_radius > 0.0);
            // Interior Newton step is admissible only when the model is convex on the
            // identified range (γ_min > 0); then λ = 0 gives the exact Newton step.
            if gamma_min_id > 0.0 {
                let newton_norm = self.step_norm_sq(0.0).sqrt();
                if unconstrained_radius || newton_norm <= trust_radius {
                    return self.assemble(0.0, None);
                }
            } else if unconstrained_radius {
                // No trust region but an indefinite/semidefinite model: the
                // unconstrained problem is unbounded below. Fall back to the
                // reflected modified-Newton step (|γ| curvature) so the caller still
                // receives a finite descent direction; the downstream accept/reject
                // validates it. This path is only hit when a caller explicitly
                // disables the trust region on an indefinite model.
                return self.assemble_reflected();
            }

            // From here on `trust_radius` is finite and positive: both early exits
            // above cover the `unconstrained_radius` case.

            // Boundary solution: find λ ≥ λ_lo with ‖η(λ)‖ = trust_radius.
            let lambda_lo = (-gamma_min_id).max(0.0);
            // Hard case detection: is rhs orthogonal to the minimal-curvature
            // eigenspace? If so ‖η(λ_lo)‖ is finite and may be below the radius.
            // Modes within `min_mode_tol` of γ_min are treated as one eigenspace.
            let min_mode_tol = self.null_cutoff.max(self.lambda_max_abs * 1e-12);
            let mut hard_case_component_sq = 0.0;
            let mut k_min_witness = None;
            for k in 0..self.gamma.len() {
                if self.gamma[k].abs() <= self.null_cutoff {
                    continue;
                }
                if (self.gamma[k] - gamma_min_id).abs() <= min_mode_tol {
                    hard_case_component_sq += self.c[k] * self.c[k];
                    // The last matching index is kept as the direction for τ·v_min.
                    k_min_witness = Some(k);
                }
            }
            // Evaluate the norm just above the pole. With a real rhs component at the
            // minimal mode the norm diverges at λ_lo, so the secular root is interior
            // to (λ_lo, ∞) and a small relative offset brackets it. With no such
            // component (hard case) the norm at λ_lo is finite.
            let lambda_lo_eval = lambda_lo + self.lambda_max_abs.max(1.0) * 1e-12;
            if hard_case_component_sq <= (self.lambda_max_abs.max(1.0) * 1e-12).powi(2) {
                let norm_at_lo = self.step_norm_sq(lambda_lo_eval).sqrt();
                if norm_at_lo < trust_radius {
                    // Hard case: λ = λ_lo, then add τ·v_min to reach the boundary.
                    // τ² = r² − ‖η‖² assumes v_min is orthogonal to the rest of η.
                    // NOTE(review): the norm is measured at `lambda_lo_eval` while
                    // the step is assembled at `lambda_lo`.
                    if let Some(k_min) = k_min_witness {
                        let deficit =
                            (trust_radius * trust_radius - norm_at_lo * norm_at_lo).max(0.0);
                        let tau = deficit.sqrt();
                        return self.assemble(lambda_lo, Some((k_min, tau)));
                    }
                    return self.assemble(lambda_lo, None);
                }
            }
            // Safeguarded Newton on φ(λ) = 1/‖η(λ)‖ − 1/r (well-behaved, ~linear),
            // bracketed in [lo, hi]. φ is increasing in λ (‖η‖ decreasing), φ(lo)<0,
            // and we grow hi until φ(hi)>0.
            let target = trust_radius;
            let mut lo = lambda_lo_eval;
            let mut hi = lambda_lo_eval.max(self.lambda_max_abs).max(1.0);
            // Cap the doubling so a non-finite spectrum cannot loop forever.
            let mut grow_guard = 0;
            while self.step_norm_sq(hi).sqrt() > target && grow_guard < 200 {
                hi *= 2.0;
                grow_guard += 1;
            }
            let mut lambda = 0.5 * (lo + hi);
            for _ in 0..100 {
                let q = self.step_norm_sq(lambda);
                let norm = q.sqrt();
                if !norm.is_finite() {
                    // Sitting on (or numerically at) a pole: treat as "step too
                    // long" and bisect upward.
                    lo = lambda;
                    lambda = 0.5 * (lo + hi);
                    continue;
                }
                // Maintain the bracket on φ(λ) = 1/norm − 1/target.
                if norm > target {
                    lo = lambda;
                } else {
                    hi = lambda;
                }
                let phi = 1.0 / norm - 1.0 / target;
                // Converged: relative error of 1/‖η‖ against 1/r is ≤ 1e-12.
                if phi.abs() <= 1e-12 / target {
                    break;
                }
                // q'(λ) = -2 Σ c_k²/(γ_k+λ)³ ⇒ d/dλ (1/norm) = -½ q^{-3/2} q'.
                let mut q_prime = 0.0;
                for k in 0..self.gamma.len() {
                    if self.gamma[k].abs() <= self.null_cutoff {
                        continue;
                    }
                    let denom = self.gamma[k] + lambda;
                    if denom.abs() <= f64::MIN_POSITIVE {
                        continue;
                    }
                    q_prime += -2.0 * self.c[k] * self.c[k] / (denom * denom * denom);
                }
                let phi_prime = -0.5 * q.powf(-1.5) * q_prime;
                // Newton update when the slope is usable, bisection otherwise
                // (a NaN slope fails the comparison and also bisects).
                let next = if phi_prime.abs() > f64::MIN_POSITIVE {
                    lambda - phi / phi_prime
                } else {
                    0.5 * (lo + hi)
                };
                // Safeguard into the bracket.
                lambda = if next.is_finite() && next > lo && next < hi {
                    next
                } else {
                    0.5 * (lo + hi)
                };
                // Bracket has collapsed to floating-point resolution.
                if (hi - lo) <= 1e-14 * (1.0 + hi.abs()) {
                    break;
                }
            }
            self.assemble(lambda, None)
        }

        /// Modified-Newton step with reflected curvature: every non-null mode is
        /// solved against `|γ_k|` and no trust-region shift is applied. Only
        /// used when a caller disables the trust region on an indefinite model — the
        /// trust-region path proper never reflects.
        ///
        /// Modes with `|γ_k| <= null_cutoff` contribute nothing to the step; they
        /// are counted in `nullity` and their right-hand-side mass is tracked in
        /// `null_rhs_inf`.
        fn assemble_reflected(&self) -> JointSpectralNewtonStep {
            let dim = self.gamma.len();
            let mut whitened_step = Array1::<f64>::zeros(dim);
            let (mut range_rhs_inf, mut null_rhs_inf) = (0.0_f64, 0.0_f64);
            let (mut nullity, mut reflected_negative_modes) = (0usize, 0usize);
            let mut lambda_min_positive = f64::INFINITY;
            let mut most_negative_eigenvalue = 0.0_f64;

            for mode in 0..dim {
                let eigenvalue = self.gamma[mode];
                let rhs_component = self.c[mode];

                // Null mode: record it and leave the step untouched along it.
                if eigenvalue.abs() <= self.null_cutoff {
                    nullity += 1;
                    null_rhs_inf = null_rhs_inf.max(rhs_component.abs());
                    continue;
                }
                range_rhs_inf = range_rhs_inf.max(rhs_component.abs());

                // Negative curvature is reflected to its magnitude; positive
                // curvature is used as is and feeds the smallest-positive tracker.
                let curvature = if eigenvalue < 0.0 {
                    reflected_negative_modes += 1;
                    most_negative_eigenvalue = most_negative_eigenvalue.min(eigenvalue);
                    eigenvalue.abs()
                } else {
                    lambda_min_positive = lambda_min_positive.min(eigenvalue);
                    eigenvalue
                };

                // Accumulate (c_k / curvature_k) · v_k into the whitened step.
                let weight = rhs_component / curvature;
                for (row, slot) in whitened_step.iter_mut().enumerate() {
                    *slot += weight * self.evecs[[row, mode]];
                }
            }

            JointSpectralNewtonStep {
                // Elementwise rescale by `d_inv_sqrt` to leave the whitened coordinates.
                delta: &self.d_inv_sqrt * &whitened_step,
                range_rhs_inf,
                null_rhs_inf,
                lambda_max_abs: self.lambda_max_abs,
                lambda_min_positive,
                nullity,
                rank_tol: KKT_REFUSAL_RANK_TOL,
                reflected_negative_modes,
                most_negative_eigenvalue,
            }
        }
    }
}

/// Unit tests for `WhitenedHessianSpectrum::trust_region_step`: each test builds
/// a small 2×2 model `(h, rhs, d)`, decomposes it, and checks one property of
/// the returned step (interior solve, boundary KKT, indefinite handling,
/// vanishing rhs, null-space reporting, metric awareness, direction bending).
#[cfg(test)]
mod trust_region_subproblem_tests {
    use super::whitened_spectrum::WhitenedHessianSpectrum;
    use super::*;
    use ndarray::array;

    /// Weighted Euclidean norm `sqrt(Σ δ_i² · w_i)` where each weight is `d_i`
    /// passed through `positive_joint_diagonal_entry`.
    // NOTE(review): presumably `positive_joint_diagonal_entry` is the same
    // positivity guard the solver applies to `d` — confirm against its definition.
    fn metric_norm(delta: &Array1<f64>, d: &Array1<f64>) -> f64 {
        delta
            .iter()
            .zip(d.iter())
            .map(|(x, w)| x * x * positive_joint_diagonal_entry(*w))
            .sum::<f64>()
            .sqrt()
    }

    /// Interior case: a positive-definite model with a generous trust radius
    /// must return the exact (full) Newton step `H⁻¹ rhs`, i.e. λ = 0.
    #[test]
    fn interior_returns_exact_newton_step() {
        let h = array![[3.0, 1.0], [1.0, 2.0]];
        let rhs = array![1.0, -2.0];
        let d = array![1.0, 1.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        let step = spec.trust_region_step(1e6);
        // Exact Newton: H δ = rhs.
        let resid = h.dot(&step.delta) - &rhs;
        assert!(
            resid.iter().all(|v| v.abs() < 1e-10),
            "interior step must solve H δ = rhs exactly, residual {resid:?}"
        );
    }

    /// Boundary case: a tight radius forces `‖δ‖_D = r` and the KKT condition
    /// `(H + λD) δ = rhs` with `λ > 0`.
    #[test]
    fn boundary_satisfies_more_sorensen_kkt() {
        let h = array![[3.0, 1.0], [1.0, 2.0]];
        let rhs = array![1.0, -2.0];
        let d = array![1.0, 1.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        let r = 0.3;
        let step = spec.trust_region_step(r);
        let norm = metric_norm(&step.delta, &d);
        assert!(
            (norm - r).abs() < 1e-8,
            "boundary step must lie on the trust boundary: ‖δ‖_D={norm} vs r={r}"
        );
        // The step does not expose its multiplier, so recover λ from the step
        // itself and check that the whole system is satisfied at that λ.
        let hd = h.dot(&step.delta);
        // Solve λ minimizing ‖(H+λD)δ − rhs‖ in least squares over the single
        // scalar λ: λ* = (Dδ)·(rhs − Hδ) / (Dδ)·(Dδ).
        let dd = &d * &step.delta;
        let lam = dd.dot(&(&rhs - &hd)) / dd.dot(&dd);
        assert!(lam > 0.0, "boundary multiplier must be positive, got {lam}");
        let resid = &hd + &(lam * &dd) - &rhs;
        assert!(
            resid.iter().all(|v| v.abs() < 1e-7),
            "(H+λD)δ = rhs must hold at the recovered λ={lam}, residual {resid:?}"
        );
    }

    /// Indefinite model: the exact subproblem still returns a finite boundary
    /// step that is a descent direction (rhsᵀδ > 0) and lies on the boundary.
    #[test]
    fn indefinite_model_returns_descent_step_on_boundary() {
        // Eigenvalues +4 and -1: genuinely indefinite.
        let h = array![[1.5, 2.5], [2.5, 1.5]];
        let rhs = array![1.0, 0.4];
        let d = array![1.0, 1.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        let r = 0.7;
        let step = spec.trust_region_step(r);
        // The step must report the negative curvature through at least one of
        // its two indefiniteness diagnostics.
        assert!(step.reflected_negative_modes >= 1 || step.most_negative_eigenvalue < 0.0);
        let norm = metric_norm(&step.delta, &d);
        assert!(
            (norm - r).abs() < 1e-7,
            "indefinite boundary step ‖δ‖_D={norm} vs r={r}"
        );
        assert!(
            rhs.dot(&step.delta) > 0.0,
            "step must be a descent direction for −rhsᵀδ + ½δᵀHδ (rhsᵀδ>0)"
        );
        // (H+λD) must be PSD at the chosen λ (most negative eigenvalue ≥ -λ).
        // λ is recovered by the same least-squares formula as in the boundary test.
        let dd = &d * &step.delta;
        let lam = dd.dot(&(&rhs - &h.dot(&step.delta))) / dd.dot(&dd);
        assert!(lam >= 1.0 - 1e-6, "λ must dominate -γ_min=1, got {lam}");
    }

    /// Self-vanishing: as rhs → 0 the step → 0 regardless of the radius, so the
    /// converged β and the KKT fixed point are unchanged by the globalization.
    #[test]
    fn step_vanishes_as_rhs_vanishes() {
        let h = array![[3.0, 1.0], [1.0, 2.0]];
        let rhs = array![1e-13, -2e-13];
        let d = array![1.0, 1.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        let step = spec.trust_region_step(0.5);
        assert!(
            step.delta.iter().all(|v| v.abs() < 1e-11),
            "near-zero rhs must give near-zero step, got {:?}",
            step.delta
        );
    }

    /// Null space: a genuinely zero-curvature direction is dropped from the step
    /// (Moore–Penrose range restriction) and reported via `null_rhs_inf`.
    #[test]
    fn null_direction_is_dropped_and_reported() {
        // Second coordinate has zero curvature; rhs has mass there.
        let h = array![[2.0, 0.0], [0.0, 0.0]];
        let rhs = array![1.0, 0.5];
        let d = array![1.0, 1.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        let step = spec.trust_region_step(1e6);
        assert_eq!(step.nullity, 1, "one null direction expected");
        assert!(
            step.null_rhs_inf >= 0.5 - 1e-9,
            "null-space rhs component must be reported, got {}",
            step.null_rhs_inf
        );
        // The identified direction takes its exact Newton component (1/2).
        assert!((step.delta[0] - 0.5).abs() < 1e-10);
        assert!(step.delta[1].abs() < 1e-10, "null coordinate left at 0");
    }

    /// Non-identity metric: the boundary is measured in the `D` norm, so a step
    /// with a large lightly-weighted coordinate is admissible.
    #[test]
    fn respects_non_identity_metric() {
        let h = array![[2.0, 0.0], [0.0, 8.0]];
        let rhs = array![1.0, 1.0];
        let d = array![1.0, 16.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        // The full Newton step (0.5, 0.125) has D-norm ≈ 0.707, so r = 0.2 binds.
        let r = 0.2;
        let step = spec.trust_region_step(r);
        let norm = metric_norm(&step.delta, &d);
        assert!(
            (norm - r).abs() < 1e-8,
            "step must lie on the D-metric boundary, ‖δ‖_D={norm} vs r={r}"
        );
    }

    /// Shrinking the radius re-solves the subproblem (the direction bends toward
    /// the gradient) rather than rescaling a fixed direction — the property the
    /// dogleg/truncation lacked. A halved radius must not merely halve the step.
    #[test]
    fn radius_shrink_bends_direction_not_just_scale() {
        let h = array![[50.0, 0.0], [0.0, 0.5]];
        let rhs = array![1.0, 1.0];
        let d = array![1.0, 1.0];
        let spec = WhitenedHessianSpectrum::decompose(&h, &rhs, &d, KKT_REFUSAL_RANK_TOL).unwrap();
        // The full Newton step (0.02, 2.0) is longer than both radii, so both
        // requests are boundary solves.
        let big = spec.trust_region_step(1.0).delta;
        let small = spec.trust_region_step(0.25).delta;
        // Direction (unit vectors) must differ: a pure truncation keeps the
        // direction fixed; the exact subproblem rotates toward the steep mode.
        let big_u = &big / metric_norm(&big, &d);
        let small_u = &small / metric_norm(&small, &d);
        let cos = big_u.dot(&small_u);
        assert!(
            cos < 0.9999,
            "exact TR step must bend the direction under radius shrink (cos={cos})"
        );
    }
}

/// PSD part of a symmetric matrix: symmetrize a copy, eigendecompose it, and
/// clamp negative eigenvalues to zero. Used by the step consumers that REQUIRE
/// a convex model (the constrained active-set QP and the SPD-PCG matvec) when
/// folding the exact divided-difference Jeffreys curvature `H_Φ`, which is
/// indefinite exactly where `Φ` is (gam#979). When every eigenvalue is already
/// non-negative the symmetrized copy is returned as is, without being rebuilt
/// from its eigenpairs. Falls back to the zero matrix if the
/// eigendecomposition fails — the safe unaugmented step, never a wrong one.
fn symmetric_psd_projection(matrix: &Array2<f64>) -> Array2<f64> {
    let p = matrix.nrows();
    let mut sym = matrix.clone();
    symmetrize_dense_in_place(&mut sym);
    let Ok((evals, evecs)) = FaerEigh::eigh(&sym, Side::Lower) else {
        return Array2::zeros((p, p));
    };
    // Fast path: nothing to clamp, so skip the reconstruction entirely.
    if evals.iter().all(|lam| *lam >= 0.0) {
        return sym;
    }
    let clamped = Array1::from_iter(evals.iter().map(|lam| lam.max(0.0)));
    // Scale column k of the eigenvector matrix by the clamped eigenvalue k
    // (row-vector broadcast), then form (V · diag(λ⁺)) · Vᵀ.
    let scaled = &evecs * &clamped.view().insert_axis(ndarray::Axis(0));
    scaled.dot(&evecs.t())
}

/// Numerical nullity of a symmetric penalized Hessian at the shared
/// `KKT_REFUSAL_RANK_TOL` relative cutoff (the same threshold the spectral
/// range solve and the REML penalty-rank machinery use): the number of
/// eigenvalues whose magnitude is strictly below
/// `KKT_REFUSAL_RANK_TOL * max|λ|`.
///
/// Returns `None` when the eigendecomposition fails or when the largest
/// eigenvalue magnitude is zero or non-finite (no finite curvature scale to
/// normalize against); callers treat a `None` as "could not certify full
/// rank" and fall back to the conservative (damped) path. An empty or
/// non-square `lhs` returns `Some(0)` without running the eigensolve.
///
/// This exists so the CONSTRAINED active-set QP branch can decide whether the
/// joint design is genuinely rank-deficient (`nullity > 0` ⇒ an unidentified
/// gauge direction that needs the self-vanishing Levenberg floor to make the
/// QP minimizer unique) or fully identified (`nullity == 0` ⇒ the exact,
/// undamped Newton/KKT step is well-posed and converges quadratically). The
/// spectral-range branch already gets this for free via
/// `JointSpectralNewtonStep::nullity`; the constrained branch never runs the
/// eigensolve otherwise, so it computes it here on the already-penalized `lhs`.
fn symmetric_penalized_hessian_nullity(lhs: &Array2<f64>) -> Option<usize> {
    let p = lhs.nrows();
    // Degenerate shapes are reported as "no null directions" rather than `None`.
    if p == 0 || lhs.ncols() != p {
        return Some(0);
    }
    // Only the lower triangle is handed to the eigensolver; `lhs` is not
    // symmetrized here.
    let (evals, _) = FaerEigh::eigh(lhs, Side::Lower).ok()?;
    let max_abs = evals.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max);
    if !(max_abs.is_finite() && max_abs > 0.0) {
        return None;
    }
    let cutoff = KKT_REFUSAL_RANK_TOL * max_abs;
    Some(evals.iter().filter(|x| x.abs() < cutoff).count())
}

/// Assembles the structured [`KktRefusalReport`] for a refused KKT certificate.
///
/// The report combines three groups of data:
///
/// * **Per-block magnitudes** — block names (from `specs`), widths and `|β|∞`
///   (from `states`), the ∞-norm of each block's slice of
///   `cached_joint_gradient`, the ∞-norm of `S_λ β` (plus `ridge · β` when the
///   ridge policy includes the quadratic penalty and `ridge > 0`), and the
///   ∞-norm of each block's slice of the projected stationarity residual.
///   Entries that cannot be computed are reported as `NaN`.
/// * **Penalized-Hessian spectrum** — filled in only when `total_p > 0`, a
///   `joint_hessian_source` is supplied, it materializes, and the
///   eigendecomposition succeeds; otherwise the spectrum fields keep their
///   `NaN` / empty / zero defaults.
/// * **Pass-through scalars** — `cycle`, the step norms, `trust_radius`, the
///   tolerances, `objective_change`, `projected_residual_inf`, and the two
///   ratios read from `math` (`NaN` when `math` is `None`) are copied into the
///   report unchanged.
///
/// The `diagnosis` is chosen in priority order: rank-deficient penalized
/// Hessian, then incomplete active set, then the well-conditioned fallback.
#[allow(clippy::too_many_arguments)]
fn compute_kkt_refusal_report(
    cycle: usize,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    s_lambdas: &[Array2<f64>],
    ranges: &[(usize, usize)],
    cached_joint_gradient: Option<&Array1<f64>>,
    cached_active_sets: &[Option<Vec<usize>>],
    block_constraints: &[Option<LinearInequalityConstraints>],
    joint_hessian_source: Option<&JointHessianSource>,
    total_p: usize,
    ridge: f64,
    ridge_policy: RidgePolicy,
    accepted_step_inf: f64,
    proposal_step_inf: f64,
    trust_radius: f64,
    residual_tol: f64,
    obj_tol: f64,
    step_tol: f64,
    objective_change: f64,
    projected_residual_inf: f64,
    math: Option<&JointNewtonMathDiagnostic>,
) -> KktRefusalReport {
    // Per-block identification and coefficient magnitude.
    let block_names: Vec<String> = specs.iter().map(|s| s.name.clone()).collect();
    let block_widths: Vec<usize> = states.iter().map(|s| s.beta.len()).collect();
    let block_beta_inf: Vec<f64> = states
        .iter()
        .map(|s| s.beta.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max))
        .collect();

    // Per-block gradient ∞-norm. The joint gradient is walked with a running
    // offset built from the block widths in `states`, not from `ranges`.
    let block_grad_inf: Vec<f64> = match cached_joint_gradient {
        Some(joint_grad) => {
            let mut acc = 0usize;
            states
                .iter()
                .map(|s| {
                    let n = s.beta.len();
                    let end = (acc + n).min(joint_grad.len());
                    // A width-0 block (e.g. a constant-scale `noise_formula="1"`
                    // log_sigma channel collapsed to zero free coefficients,
                    // gam#553) has no gradient and a zero residual — report 0.0,
                    // not the NaN sentinel. The NaN sentinel is reserved for a
                    // genuine layout mismatch: a positive-width block whose
                    // coordinates fall past the end of the joint gradient.
                    let nrm = if n == 0 {
                        0.0
                    } else if acc < end {
                        joint_grad
                            .slice(ndarray::s![acc..end])
                            .iter()
                            .map(|x: &f64| x.abs())
                            .fold(0.0_f64, f64::max)
                    } else {
                        f64::NAN
                    };
                    acc += n;
                    nrm
                })
                .collect()
        }
        None => vec![f64::NAN; states.len()],
    };

    // Per-block ∞-norm of the penalty gradient `S_λ β`, with `ridge · β` added
    // when the ridge policy folds the ridge into the quadratic penalty. Only
    // the index of each `ranges` entry is used here.
    let block_penalty_grad_inf: Vec<f64> = ranges
        .iter()
        .enumerate()
        .map(|(b, _)| {
            let mut penalty_block = s_lambdas[b].dot(&states[b].beta);
            if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                penalty_block += &states[b].beta.mapv(|v| ridge * v);
            }
            penalty_block
                .iter()
                .map(|x: &f64| x.abs())
                .fold(0.0_f64, f64::max)
        })
        .collect();

    // Projected stationarity residual. `.ok()` turns a helper failure into
    // `None`, which is reported below as `NaN` for every block.
    let residual_vec_opt = cached_joint_gradient.and_then(|joint_grad| {
        exact_newton_joint_projected_stationarity_vector_from_gradient(
            joint_grad,
            states,
            specs,
            s_lambdas,
            ridge,
            ridge_policy,
            block_constraints,
            Some(cached_active_sets),
        )
        .ok()
    });
    let block_residual_inf: Vec<f64> = match residual_vec_opt.as_ref() {
        Some(residual) => ranges
            .iter()
            .map(|(start, end)| {
                // A zero-width block (start == end) has no residual of its own;
                // an empty `fold` would report a spurious `0.0`. Mark it `NaN`
                // so the `is_finite()` filter below excludes it from the
                // carrying-block selection (it cannot carry residual it has no
                // parameters for).
                if start >= end {
                    f64::NAN
                } else {
                    residual
                        .slice(ndarray::s![*start..*end])
                        .iter()
                        .map(|x: &f64| x.abs())
                        .fold(0.0_f64, f64::max)
                }
            })
            .collect(),
        None => vec![f64::NAN; states.len()],
    };
    // Index of the block with the largest finite residual ∞-norm; `None` when
    // no block has a finite entry.
    let block_carrying_residual = block_residual_inf
        .iter()
        .enumerate()
        .filter(|(_, v)| v.is_finite())
        .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
        .map(|(i, _)| i);

    // Spectrum diagnostics. These defaults are what the report carries when no
    // Hessian source is available, materialization fails, or the eigensolve fails.
    let mut hpen_eigenvalues_sorted_desc: Vec<f64> = Vec::new();
    let mut hpen_min_abs_eigenvalue = f64::NAN;
    let mut hpen_max_abs_eigenvalue = f64::NAN;
    let mut hpen_condition_number = f64::NAN;
    let mut hpen_nullity_at_rank_tol = 0usize;
    let mut hpen_null_gradient_inf = f64::NAN;
    let mut hpen_null_vector_block_inf = Vec::new();
    let mut hpen_null_vector_carrying_block = None;
    if total_p > 0
        && let Some(source) = joint_hessian_source
        && let Ok(mut h_joint) =
            materialize_joint_hessian_source(source, total_p, "KKT refusal diagnostic spectrum")
    {
        // The ridge enters the diagnostic matrix only when the policy treats
        // it as part of the quadratic penalty.
        let model_diagonal_ridge = if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
            ridge
        } else {
            0.0
        };
        add_joint_penalty_to_matrix(&mut h_joint, ranges, s_lambdas, model_diagonal_ridge, None);
        symmetrize_dense_in_place(&mut h_joint);
        if let Ok((evals, evecs)) = FaerEigh::eigh(&h_joint, Side::Lower) {
            // `sorted` is for reporting only; `evals` keeps the solver's order
            // so that index `k` still matches column `k` of `evecs` below.
            let mut sorted: Vec<f64> = evals.iter().copied().collect();
            sorted.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
            let max_abs = sorted.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max);
            let min_abs = sorted
                .iter()
                .map(|x: &f64| x.abs())
                .fold(f64::INFINITY, f64::min);
            // Eigenvalues with magnitude strictly below this relative cutoff
            // are counted as numerically null.
            let cutoff = KKT_REFUSAL_RANK_TOL * max_abs;
            hpen_nullity_at_rank_tol = sorted.iter().filter(|x| x.abs() < cutoff).count();
            hpen_max_abs_eigenvalue = max_abs;
            hpen_min_abs_eigenvalue = if min_abs.is_finite() {
                min_abs
            } else {
                f64::NAN
            };
            // A zero or non-finite smallest magnitude is reported as an
            // infinite condition number.
            hpen_condition_number = if min_abs > 0.0 && min_abs.is_finite() {
                max_abs / min_abs
            } else {
                f64::INFINITY
            };
            if let Some(residual) = residual_vec_opt.as_ref()
                && residual.len() == total_p
                && hpen_nullity_at_rank_tol > 0
            {
                // Among the null eigenvectors, keep the one with the largest
                // |uᵀ residual| together with its per-block ∞-norms.
                let mut best_component = 0.0_f64;
                let mut best_block_inf = vec![0.0_f64; ranges.len()];
                for k in 0..evals.len() {
                    if evals[k].abs() >= cutoff {
                        continue;
                    }
                    let component = evecs.column(k).dot(residual).abs();
                    if component > best_component {
                        best_component = component;
                        best_block_inf.clear();
                        best_block_inf.extend(ranges.iter().map(|(start, end)| {
                            evecs
                                .slice(ndarray::s![*start..*end, k])
                                .iter()
                                .map(|x: &f64| x.abs())
                                .fold(0.0_f64, f64::max)
                        }));
                    }
                }
                hpen_null_gradient_inf = best_component;
                hpen_null_vector_block_inf = best_block_inf;
                // NOTE(review): if no null eigenvector has a strictly positive
                // component, `best_block_inf` is still all zeros and `max_by`
                // resolves the tie to the last block — confirm that is intended.
                hpen_null_vector_carrying_block = hpen_null_vector_block_inf
                    .iter()
                    .enumerate()
                    .filter(|(_, v)| v.is_finite())
                    .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
                    .map(|(i, _)| i);
            }
            hpen_eigenvalues_sorted_desc = sorted;
        }
    }

    // Total number of active rows across all blocks that have a cached active set.
    let active_set_rows_total: usize = cached_active_sets
        .iter()
        .map(|maybe_rows| maybe_rows.as_ref().map(|v| v.len()).unwrap_or(0))
        .sum();
    let any_block_has_constraints = block_constraints.iter().any(|c| c.is_some());

    // Diagnosis priority: rank deficiency first, then an incomplete active
    // set, then the well-conditioned fallback.
    let diagnosis = if hpen_nullity_at_rank_tol > 0 {
        KktRefusalDiagnosis::RankDeficientHPen
    } else if any_block_has_constraints
        && cached_active_sets.iter().any(|s| s.is_some())
        && projected_residual_inf > residual_tol
    {
        // Well-conditioned H_pen, the user has bound constraints, the current
        // active set already pinned some rows, yet the projected residual is
        // still many tolerances above the threshold. The cert refused
        // *because* the projection captured part of the multiplier but not
        // all of it — i.e. the active set is missing a row.
        KktRefusalDiagnosis::ActiveSetIncomplete
    } else {
        KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH
    };

    KktRefusalReport {
        block_names,
        block_widths,
        block_beta_inf,
        block_grad_inf,
        block_penalty_grad_inf,
        block_residual_inf,
        block_carrying_residual,
        hpen_eigenvalues_sorted_desc,
        hpen_min_abs_eigenvalue,
        hpen_max_abs_eigenvalue,
        hpen_condition_number,
        hpen_nullity_at_rank_tol,
        hpen_rank_tol: KKT_REFUSAL_RANK_TOL,
        hpen_null_gradient_inf,
        hpen_null_vector_block_inf,
        hpen_null_vector_carrying_block,
        active_set_rows_total,
        accepted_step_inf,
        proposal_step_inf,
        trust_radius,
        cycle,
        residual_tol,
        obj_tol,
        step_tol,
        linearized_rel: math
            .map(JointNewtonMathDiagnostic::linearized_rel)
            .unwrap_or(f64::NAN),
        scalar_model_relerr: math
            .map(JointNewtonMathDiagnostic::scalar_model_relative_error)
            .unwrap_or(f64::NAN),
        objective_change,
        projected_residual_inf,
        diagnosis,
    }
}

impl KktRefusalReport {
    /// Human-readable summary of the block selected as carrying the residual:
    /// its name, index, per-block ∞-norms, and width. Any per-block entry
    /// missing at that index is rendered as `?` (name), `NaN` (norms) or `0`
    /// (width). Returns a fixed placeholder when no block was selected.
    fn carrying_block_label(&self) -> String {
        /// Entry `idx` of a per-block vector, or `NaN` when out of range.
        fn entry_or_nan(values: &[f64], idx: usize) -> f64 {
            values.get(idx).copied().unwrap_or(f64::NAN)
        }

        let Some(idx) = self.block_carrying_residual else {
            return "<no block carries finite residual>".to_string();
        };
        let name = self.block_names.get(idx).map_or("?", String::as_str);
        let width = self.block_widths.get(idx).copied().unwrap_or(0);
        format!(
            "{} (idx={}, |g|={:.3e}, |Sβ|={:.3e}, |∇L-Sβ|={:.3e}, |β|={:.3e}, width={})",
            name,
            idx,
            entry_or_nan(&self.block_grad_inf, idx),
            entry_or_nan(&self.block_penalty_grad_inf, idx),
            entry_or_nan(&self.block_residual_inf, idx),
            entry_or_nan(&self.block_beta_inf, idx),
            width,
        )
    }

    /// Largest per-block `|β|∞`, starting the running maximum at `0.0`.
    fn beta_inf(&self) -> f64 {
        self.block_beta_inf
            .iter()
            .fold(0.0_f64, |running, &value| running.max(value))
    }

    /// Summary of the null-direction diagnostic: the block holding the largest
    /// entry of the selected null eigenvector, or `none` when no such block was
    /// recorded. The null-space residual component is printed either way.
    fn null_direction_label(&self) -> String {
        if let Some(idx) = self.hpen_null_vector_carrying_block {
            let name = self.block_names.get(idx).map_or("?", String::as_str);
            let block_inf = self
                .hpen_null_vector_block_inf
                .get(idx)
                .copied()
                .unwrap_or(f64::NAN);
            format!(
                "{} (idx={}, |u_block|∞={:.3e}, |uᵀg_proj|={:.3e})",
                name, idx, block_inf, self.hpen_null_gradient_inf,
            )
        } else {
            format!("none (|uᵀg_proj|={:.3e})", self.hpen_null_gradient_inf)
        }
    }

    /// Multi-line structured log emitted at the cert REFUSED site. The
    /// per-block residual / eigenspectrum / diagnosis breakdown is what
    /// makes the failure actionable (vs the legacy one-liner that only
    /// reported aggregate residual + cert math).
    ///
    /// `four_tol` is printed as the certificate tolerance on the first line;
    /// it is supplied by the caller rather than derived from `residual_tol`.
    fn format_structured_log(&self, four_tol: f64) -> String {
        let carrying = self.carrying_block_label();
        let free_null = self.null_direction_label();
        let eigenvalue_count = self.hpen_eigenvalues_sorted_desc.len();
        format!(
            "[PIRLS/joint-Newton convergence] cycle {:>3} | cert REFUSED: residual={:.3e} > tol={:.3e} (cert)\n  \
             carrying-block: {}\n  \
             block_names={:?}, block_widths={:?}, block_grad_inf={:?}, block_penalty_grad_inf={:?}, block_residual_inf={:?}\n  \
             H_pen spectrum: λ_max={:.3e}, λ_min={:.3e}, cond={:.3e}, nullity@{:.0e}={} (of {} eigenvalues)\n  \
             free-null diagnostic: {}\n  \
             cert math: linearized_rel={:.3e}, scalar_relerr={:.3e}, |Δobj|={:.3e} (tol={:.3e}), accepted_step_inf={:.3e} (tol={:.3e}), proposal_step_inf={:.3e}, trust_radius={:.3e}, |β|∞={:.3e}, active_set_rows_total={}\n  \
             diagnosis: {}",
            self.cycle,
            self.projected_residual_inf,
            four_tol,
            carrying,
            self.block_names,
            self.block_widths,
            self.block_grad_inf,
            self.block_penalty_grad_inf,
            self.block_residual_inf,
            self.hpen_max_abs_eigenvalue,
            self.hpen_min_abs_eigenvalue,
            self.hpen_condition_number,
            self.hpen_rank_tol,
            self.hpen_nullity_at_rank_tol,
            eigenvalue_count,
            free_null,
            self.linearized_rel,
            self.scalar_model_relerr,
            self.objective_change,
            self.obj_tol,
            self.accepted_step_inf,
            self.step_tol,
            self.proposal_step_inf,
            self.trust_radius,
            self.beta_inf(),
            self.active_set_rows_total,
            self.diagnosis.as_str(),
        )
    }

    /// Single-string formatter used by the bubbled error returned from
    /// the inner solver, where the caller wants one self-contained line
    /// even though the data is structured.
    ///
    /// The displayed tolerance is `4.0 * residual_tol`, and the diagnosis
    /// guidance text is appended at the end.
    fn format_bubbled_error(&self) -> String {
        let displayed_tol = 4.0 * self.residual_tol;
        let eigenvalue_count = self.hpen_eigenvalues_sorted_desc.len();
        format!(
            "cycle={} cert REFUSED: residual={:.3e} > tol={:.3e}; \
             carrying-block: {}; block_names={:?}, block_widths={:?}, \
             block_grad_inf={:?}, block_penalty_grad_inf={:?}, block_residual_inf={:?}; \
             H_pen spectrum: λ_max={:.3e}, λ_min={:.3e}, cond={:.3e}, nullity@{:.0e}={}/{}; \
             free-null diagnostic: {}; \
             cert math: linearized_rel={:.3e}, scalar_relerr={:.3e}, |Δobj|={:.3e}, \
             accepted_step_inf={:.3e}, proposal_step_inf={:.3e}, trust_radius={:.3e}, \
             |β|∞={:.3e}, active_set_rows_total={}; diagnosis: {}; {}",
            self.cycle,
            self.projected_residual_inf,
            displayed_tol,
            self.carrying_block_label(),
            self.block_names,
            self.block_widths,
            self.block_grad_inf,
            self.block_penalty_grad_inf,
            self.block_residual_inf,
            self.hpen_max_abs_eigenvalue,
            self.hpen_min_abs_eigenvalue,
            self.hpen_condition_number,
            self.hpen_rank_tol,
            self.hpen_nullity_at_rank_tol,
            eigenvalue_count,
            self.null_direction_label(),
            self.linearized_rel,
            self.scalar_model_relerr,
            self.objective_change,
            self.accepted_step_inf,
            self.proposal_step_inf,
            self.trust_radius,
            self.beta_inf(),
            self.active_set_rows_total,
            self.diagnosis.as_str(),
            self.diagnosis.guidance(),
        )
    }
}

/// Fixed relative tolerance; the forcing function returns it whenever the
/// residual norms it is given cannot be used.
const JOINT_PCG_REL_TOL: f64 = 1e-8;
/// Upper clamp of the adaptive forcing term (also the first-cycle value).
const PCG_ETA_MAX: f64 = 1.0e-1;
/// Lower clamp of the adaptive forcing term.
const PCG_ETA_MIN: f64 = 1.0e-8;
/// Multiplier `γ` in `γ·(‖r_cur‖/‖r_prev‖)^α`.
const PCG_GAMMA: f64 = 0.9;
/// Exponent `α` in `γ·(‖r_cur‖/‖r_prev‖)^α` (the golden ratio).
const PCG_ALPHA: f64 = 1.618_033_988_749_895;

/// Eisenstat–Walker adaptive forcing term for the inner PCG tolerance.
///
/// * Current norm non-finite or negative → `JOINT_PCG_REL_TOL`.
/// * No previous norm (first cycle) → the loose `PCG_ETA_MAX`, to avoid
///   over-solving while the iterate is far from the optimum.
/// * Previous norm non-finite or non-positive, or an unusable ratio →
///   `JOINT_PCG_REL_TOL`.
/// * Otherwise `γ·(‖r_cur‖/‖r_prev‖)^α`, clamped to
///   `[PCG_ETA_MIN, PCG_ETA_MAX]`.
fn joint_pcg_eisenstat_walker_forcing(prev_kkt_norm: Option<f64>, current_kkt_norm: f64) -> f64 {
    let current_usable = current_kkt_norm.is_finite() && current_kkt_norm >= 0.0;
    if !current_usable {
        return JOINT_PCG_REL_TOL;
    }
    match prev_kkt_norm {
        None => PCG_ETA_MAX,
        Some(previous) if previous.is_finite() && previous > 0.0 => {
            let reduction = current_kkt_norm / previous;
            if reduction.is_finite() && reduction >= 0.0 {
                let raw = PCG_GAMMA * reduction.powf(PCG_ALPHA);
                raw.clamp(PCG_ETA_MIN, PCG_ETA_MAX)
            } else {
                // The division overflowed: fall back to the fixed tolerance.
                JOINT_PCG_REL_TOL
            }
        }
        Some(_) => JOINT_PCG_REL_TOL,
    }
}

/// Allocating convenience wrapper around
/// [`apply_joint_penalized_hessian_into_with_workspace`]: creates a zeroed
/// penalty scratch buffer of length `vector.len()` for this one call and
/// forwards every other argument unchanged.
fn apply_joint_penalized_hessian_into(
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    vector: &Array1<f64>,
    out: &mut Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Result<(), String> {
    // Single-use scratch; callers in hot loops should hoist their own buffer
    // and call the workspace variant directly.
    let scratch_len = vector.len();
    let mut single_use_scratch = Array1::<f64>::zeros(scratch_len);
    apply_joint_penalized_hessian_into_with_workspace(
        source,
        ranges,
        s_lambdas,
        diagonal_ridge,
        vector,
        out,
        &mut single_use_scratch,
        joint_full_width,
    )
}

/// Workspace-reusing form of [`apply_joint_penalized_hessian_into`]: the
/// penalty term is built in a caller-supplied scratch buffer rather than a
/// fresh allocation.  Intended for hot loops (e.g. the trust-region trial
/// loop) where both `penalty_scratch` and `out` are hoisted outside the loop
/// and reused across attempts.
///
/// `out` first receives the Hessian–vector product from `source`; the
/// penalty contribution written by `apply_joint_block_penalty_into` is then
/// added to it.
///
/// `penalty_scratch` must have the same length as `vector`; its contents
/// are overwritten on every call that gets past the Hessian product. An
/// error from an operator-backed `source` is returned before the scratch
/// buffer is touched.
fn apply_joint_penalized_hessian_into_with_workspace(
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    vector: &Array1<f64>,
    out: &mut Array1<f64>,
    penalty_scratch: &mut Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Result<(), String> {
    // Step 1: unpenalized Hessian–vector product into `out`.
    match source {
        JointHessianSource::Operator {
            apply_into: matvec, ..
        } => {
            matvec(vector, out)?;
        }
        JointHessianSource::Dense(dense_hessian) => {
            crate::faer_ndarray::fast_av_view_into(dense_hessian, vector, out.view_mut());
        }
    }

    // Step 2: penalty contribution, accumulated from a zeroed scratch buffer.
    penalty_scratch.fill(0.0);
    apply_joint_block_penalty_into(
        ranges,
        s_lambdas,
        vector,
        diagonal_ridge,
        penalty_scratch,
        joint_full_width,
    );

    // Step 3: combine.
    *out += &*penalty_scratch;
    Ok(())
}

/// Diagonal ridge to use for the joint solve: `base_diagonal_ridge` plus
/// whatever extra shift `exact_newton_stabilizing_shift` reports for the
/// dense penalized Hessian.
///
/// The base ridge is returned unchanged when the family uses the strict-SPD
/// exact-Newton path or when `source` is not dense. A missing shift is
/// treated as zero, and a positive shift is logged at debug level.
fn stabilized_joint_solver_diagonal_ridge<F: CustomFamily + ?Sized>(
    family: &F,
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    base_diagonal_ridge: f64,
    ridge_floor: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> f64 {
    let strict_spd = use_exact_newton_strict_spd(family);
    let dense_hessian = match source {
        JointHessianSource::Dense(h_joint) if !strict_spd => h_joint,
        _ => return base_diagonal_ridge,
    };

    // Assemble the penalized matrix at the base ridge and ask how much more
    // diagonal shift it needs.
    let mut penalized = dense_hessian.clone();
    add_joint_penalty_to_matrix(
        &mut penalized,
        ranges,
        s_lambdas,
        base_diagonal_ridge,
        joint_full_width,
    );
    let extra_shift = exact_newton_stabilizing_shift(&penalized, ridge_floor).unwrap_or(0.0);
    if extra_shift > 0.0 {
        log::debug!(
            "[PIRLS/joint-Newton] stabilized dense penalized Hessian with diagonal shift {:.3e}",
            extra_shift
        );
    }
    base_diagonal_ridge + extra_shift
}

/// Decrease predicted by the quadratic model for the step `delta`:
/// `rhs·delta − ½·deltaᵀ·hpen_delta`, where `hpen_delta` is supplied by the
/// caller (presumably the penalized-Hessian product `H_pen·delta` — confirm at
/// the call sites).
fn joint_quadratic_predicted_reduction(
    rhs: &Array1<f64>,
    hpen_delta: &Array1<f64>,
    delta: &Array1<f64>,
) -> f64 {
    let linear_gain = rhs.dot(delta);
    let curvature = delta.dot(hpen_delta);
    linear_gain - 0.5 * curvature
}

/// Diagonally preconditioned fallback step for the joint system.
///
/// The raw step is `rhs ./ preconditioner`, where the preconditioner is built
/// by `joint_penalty_preconditioner_diag` from the Hessian diagonal (taken from
/// the dense matrix, or from the operator's stored `diagonal`). If that step
/// has a non-finite entry or is not positively aligned with `rhs`, the step
/// falls back to `rhs` itself. When both `rhs·delta` and the curvature
/// `deltaᵀ·H_pen·delta` are finite and positive, the step is rescaled by
/// `alpha = clamp(rhs·delta / curvature, 1e-12, 1)`.
///
/// # Errors
/// Forwards any error from `apply_joint_penalized_hessian_into`.
fn joint_preconditioned_descent_delta(
    source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    rhs: &Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Result<Array1<f64>, String> {
    let hessian_diagonal = match source {
        JointHessianSource::Operator { diagonal, .. } => diagonal.clone(),
        JointHessianSource::Dense(h_joint) => h_joint.diag().to_owned(),
    };
    let preconditioner = joint_penalty_preconditioner_diag(
        &hessian_diagonal,
        ranges,
        s_lambdas,
        diagonal_ridge,
        joint_full_width,
    );

    let mut delta = rhs / &preconditioner;
    // The alignment test is only evaluated once every entry is known finite.
    let scaled_step_usable = delta.iter().all(|v| v.is_finite()) && rhs.dot(&delta) > 0.0;
    if !scaled_step_usable {
        delta.assign(rhs);
    }

    let directional = rhs.dot(&delta);
    let direction_usable = directional.is_finite() && directional > 0.0;
    if !direction_usable {
        return Ok(delta);
    }

    let mut hpen_delta = Array1::<f64>::zeros(rhs.len());
    apply_joint_penalized_hessian_into(
        source,
        ranges,
        s_lambdas,
        diagonal_ridge,
        &delta,
        &mut hpen_delta,
        joint_full_width,
    )?;

    let curvature = delta.dot(&hpen_delta);
    if curvature.is_finite() && curvature > 0.0 {
        // Step length along `delta` from the quadratic model, capped at a full
        // step and floored away from zero.
        let alpha = (directional / curvature).clamp(1.0e-12, 1.0);
        delta.mapv_inplace(|v| alpha * v);
    }
    Ok(delta)
}

/// Log-likelihood-only evaluation used for line-search trial points.
///
/// Delegates to `log_likelihood_only_with_options` and pairs the value with a
/// `None` workspace: no joint-Hessian workspace is produced on this path.
fn joint_line_search_log_likelihood<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    line_search_options: &BlockwiseFitOptions,
    states: &[ParameterBlockState],
) -> Result<(f64, Option<Arc<dyn ExactNewtonJointHessianWorkspace>>), String> {
    let log_likelihood = family.log_likelihood_only_with_options(states, line_search_options)?;
    Ok((log_likelihood, None))
}

/// Derive the options used for coefficient-space line-search evaluations.
///
/// The result is a clone of `options` with three overrides:
/// * `auto_outer_subsample` is switched off;
/// * any `outer_eval_context` is re-tagged with `EvalScope::InnerCoefficient`
///   (its `rho` and `eval_id` are carried over);
/// * `early_exit_threshold` is set to `Some(early_exit_threshold)`.
fn coefficient_line_search_options(
    options: &BlockwiseFitOptions,
    early_exit_threshold: f64,
) -> BlockwiseFitOptions {
    // Re-scope the outer evaluation context (if there is one) so that any
    // sibling auto-install path that is somehow reached bails out
    // (cf. `install_auto_outer_subsample_options`).
    let inner_scoped_context = options
        .outer_eval_context
        .as_ref()
        .map(|ctx| OuterEvalContext {
            rho: ctx.rho.clone(),
            eval_id: ctx.eval_id,
            scope: EvalScope::InnerCoefficient,
        });

    // `outer_score_subsample` is deliberately inherited from the clone: the
    // trust-region ratio ρ = [F(β) − F(β + δ)] / [−g·δ − ½·δᵀHδ] is only
    // meaningful when the trial objective and the Hessian/gradient are
    // evaluated on the same row measure. Only *auto*-install is disabled, so
    // no mask rebuild can occur in the middle of an iteration.
    let mut tuned = options.clone();
    tuned.auto_outer_subsample = false;
    tuned.outer_eval_context = inner_scoped_context;
    tuned.early_exit_threshold = Some(early_exit_threshold);
    tuned
}

/// Result tuple of `load_joint_gradient_evaluation`, in order:
/// log-likelihood, joint gradient, full family evaluation, and joint-Hessian
/// workspace.
type JointGradientLoad = (
    // Log-likelihood at the evaluated states.
    f64,
    // Joint gradient; may be `None` when it is derived from a full evaluation
    // and `exact_newton_joint_gradient_from_eval` yields none.
    Option<Array1<f64>>,
    // Full family evaluation; only `Some` when the `family.evaluate` fallback
    // was taken.
    Option<FamilyEvaluation>,
    // Workspace that was supplied or built for this load, if any.
    Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
);

/// Obtain the log-likelihood and joint gradient at `states`, trying the
/// available sources in a fixed order.
///
/// 1. A workspace — the caller's `preferred_workspace`, or one freshly built
///    when `prefer_workspace` is set and the family reports
///    `inner_joint_workspace_gradient_available` — if it can produce a joint
///    gradient evaluation.
/// 2. The family's `exact_newton_joint_gradient_evaluation`.
/// 3. A full `family.evaluate`, with the gradient extracted by
///    `exact_newton_joint_gradient_from_eval`; only this path returns the
///    evaluation itself.
///
/// The workspace (if any) is returned on every path so the caller can keep it.
///
/// # Errors
/// Forwards the first error raised by any of the family/workspace calls.
fn load_joint_gradient_evaluation<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    states: &[ParameterBlockState],
    prefer_workspace: bool,
    preferred_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
) -> Result<JointGradientLoad, String> {
    // A caller-supplied workspace always wins; otherwise build one only when
    // it was requested and the family can serve gradients from it.
    let workspace = if preferred_workspace.is_some() {
        preferred_workspace
    } else if prefer_workspace && family.inner_joint_workspace_gradient_available(specs) {
        family.exact_newton_joint_hessian_workspace_with_options(states, specs, options)?
    } else {
        None
    };

    // Source 1: gradient served directly by the workspace.
    if let Some(active_workspace) = workspace.as_ref() {
        if let Some(from_workspace) = active_workspace.joint_gradient_evaluation()? {
            return Ok((
                from_workspace.log_likelihood,
                Some(from_workspace.gradient),
                None,
                Some(Arc::clone(active_workspace)),
            ));
        }
    }

    // Source 2: the family's dedicated joint-gradient evaluation.
    if let Some(from_family) = family.exact_newton_joint_gradient_evaluation(states, specs)? {
        return Ok((
            from_family.log_likelihood,
            Some(from_family.gradient),
            None,
            workspace,
        ));
    }

    // Source 3: full evaluation; the gradient is assembled from its pieces.
    let full_eval = family.evaluate(states)?;
    let gradient = exact_newton_joint_gradient_from_eval(&full_eval, specs, states)?;
    let log_likelihood = full_eval.log_likelihood;
    Ok((log_likelihood, gradient, Some(full_eval), workspace))
}

/// Unwrap a projected KKT residual that must be present.
///
/// # Errors
/// Returns a `CustomFamilyError::UnsupportedConfiguration` (converted into the
/// `String` error type), prefixed with `context`, when `residual` is `None`.
fn require_projected_kkt_residual(
    residual: Option<ProjectedKktResidual>,
    context: &str,
) -> Result<ProjectedKktResidual, String> {
    residual.ok_or_else(|| {
        CustomFamilyError::UnsupportedConfiguration {
            reason: format!(
                "{context}: converged joint-Newton exact inner solve did not produce a projected KKT \
                 residual; refusing to assemble REML/LAML derivatives without the IFT correction input"
            ),
        }
        .into()
    })
}

/// Verdict of `constrained_stationary_certificate_decision`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ConstrainedStationaryCertificate {
    /// The candidacy gate (objective exhausted, step exhausted, large
    /// linearized residual, accurate scalar model) did not hold.
    NotCandidate,
    /// The iterate is a candidate and its residual is finite and within the
    /// certification band around the residual tolerance.
    Accept,
    /// The iterate is a candidate but its residual is non-finite or outside
    /// the certification band.
    RefusePhantomMultiplier,
}

/// Scalar diagnostics describing one joint-Newton step.
///
/// Field meanings are inferred from their names and from how this type's
/// methods combine them — confirm against the construction site.
#[derive(Clone, Debug)]
struct JointNewtonMathDiagnostic {
    /// KKT residual inf-norm before the step (presumed).
    old_kkt_inf: f64,
    /// KKT residual inf-norm the linearized model expects after the step
    /// (presumed).
    linearized_next_kkt_inf: f64,
    /// Objective decrease predicted by the model.
    predicted_reduction: f64,
    /// Objective decrease actually observed.
    actual_reduction: f64,
    /// Trust-region ratio for the step (presumed actual / predicted).
    trust_ratio: f64,
    /// Inf-norm of the step (presumed: the accepted step).
    step_inf: f64,
    /// Inf-norm of the proposal (presumed: the step before any clipping).
    proposal_inf: f64,
}

impl JointNewtonMathDiagnostic {
    /// `|actual − predicted| / max(|predicted|, 1)`: the model's error,
    /// relative to the predicted reduction but never scaled by less than one.
    fn scalar_model_relative_error(&self) -> f64 {
        let model_gap = (self.actual_reduction - self.predicted_reduction).abs();
        let scale = self.predicted_reduction.abs().max(1.0);
        model_gap / scale
    }

    /// `linearized_next_kkt_inf / (1 + old_kkt_inf)`.
    fn linearized_rel(&self) -> f64 {
        let scale = 1.0 + self.old_kkt_inf;
        self.linearized_next_kkt_inf / scale
    }
}

/// Decide whether a stalled joint-Newton iterate can be certified as a
/// constrained stationary point.
///
/// The iterate is a *candidate* only when all of the following hold:
/// * the objective is exhausted — `objective_change <= objective_tol`, or a
///   supplied `geometric_tail_bound` is itself `<= objective_tol`;
/// * the step is exhausted — `math.step_inf` and `step_tol` are finite and
///   `math.step_inf <= step_tol`;
/// * `math.linearized_rel() >= 0.5`;
/// * `math.scalar_model_relative_error() <= 1e-3`.
///
/// A non-candidate yields `NotCandidate`. A candidate is `Accept`ed when
/// `residual` is finite and at most `4 × residual_tol`, and refused as
/// `RefusePhantomMultiplier` otherwise.
fn constrained_stationary_certificate_decision(
    math: &JointNewtonMathDiagnostic,
    objective_change: f64,
    objective_tol: f64,
    step_tol: f64,
    geometric_tail_bound: Option<f64>,
    residual: f64,
    residual_tol: f64,
) -> ConstrainedStationaryCertificate {
    const MIN_LINEARIZED_REL: f64 = 0.5;
    const MAX_SCALAR_MODEL_RELERR: f64 = 1e-3;
    // The acceptance band is a small MULTIPLE of `residual_tol` rather than
    // exactly `1x`. By the time this branch is reached the iterate is already
    // shown stationary (objective and step exhausted, residual dominated by
    // multiplier/null mass rather than a gradient defect, quadratic model
    // accurate), and the active-projected residual then stalls at the
    // conditioning/round-off floor. For the survival baseline-hazard block
    // (well-conditioned after the data-seeded baseline, gam#797) that floor
    // sits a hair above the scale-relative `residual_tol`, so requiring
    // exactly `<= tol` would leave a fully stationary iterate uncertified.
    // `4x` certifies that case while still rejecting a residual that is orders
    // of magnitude above tolerance — the only case this guard must catch.
    const CERT_RESIDUAL_FACTOR: f64 = 4.0;

    let tail_exhausted = geometric_tail_bound.is_some_and(|tail| tail <= objective_tol);
    let objective_exhausted = objective_change <= objective_tol || tail_exhausted;
    let step_exhausted =
        math.step_inf.is_finite() && step_tol.is_finite() && math.step_inf <= step_tol;
    let multiplier_dominated = math.linearized_rel() >= MIN_LINEARIZED_REL;
    let model_accurate = math.scalar_model_relative_error() <= MAX_SCALAR_MODEL_RELERR;

    let is_candidate =
        objective_exhausted && step_exhausted && multiplier_dominated && model_accurate;
    if !is_candidate {
        return ConstrainedStationaryCertificate::NotCandidate;
    }

    // A large linearized residual is either an honest active-set multiplier or
    // an H-null/rank-deficient direction Newton cannot move along; only the
    // projected KKT residual tells them apart. Convergence may be certified
    // here only when the active-set projection has actually captured the
    // multiplier, which is why the band is tied to the inner residual
    // tolerance.
    let residual_certified =
        residual.is_finite() && residual <= CERT_RESIDUAL_FACTOR * residual_tol;
    if residual_certified {
        ConstrainedStationaryCertificate::Accept
    } else {
        ConstrainedStationaryCertificate::RefusePhantomMultiplier
    }
}

/// Reports whether the recent KKT-residual tail `history` (ordered oldest →
/// newest) is in STEADY geometric descent: across the whole window, each entry
/// is strictly below `(1 - min_drop)` times the one before it.
///
/// Returns `false` for a window shorter than three entries, and whenever a
/// pair contains a non-finite value or a non-positive predecessor.
///
/// The check separates a Newton direction that is still converging from a
/// genuine multiplier/null plateau at the certificate-refusal gate (gam#787
/// duchon centers≥20). The constrained-stationary refusal triggers on a flat
/// objective together with `linearized_rel ≥ 0.5`, yet both signals also hold
/// for a logslope block whose residual keeps shrinking by a steady factor per
/// cycle (objective already at its Φ-bounded floor while the KKT residual is
/// still being polished). Refusing in that situation discards the seed a few
/// cycles before it reaches `residual_tol`. Demanding a drop on EVERY pair of
/// the window — not one lucky decrease — prevents a noisy near-plateau from
/// being extended by mistake, and the inner cycle cap still bounds the extra
/// work.
fn residual_in_steady_geometric_descent(history: &std::collections::VecDeque<f64>) -> bool {
    const MIN_WINDOW: usize = 3;
    // Each cycle must cut the residual by at least 10%.
    const MIN_DROP: f64 = 0.1;

    if history.len() < MIN_WINDOW {
        return false;
    }

    let mut residuals = history.iter().copied();
    let Some(mut prev) = residuals.next() else {
        return false;
    };
    for next in residuals {
        let dropped_enough = prev.is_finite()
            && next.is_finite()
            && prev > 0.0
            && next < (1.0 - MIN_DROP) * prev;
        if !dropped_enough {
            return false;
        }
        prev = next;
    }
    true
}

/// Inf-norm of the component of the active-set-projected stationarity residual
/// that lies in the **range** of the joint penalized Hessian
/// `H_pen = H + S(λ) + ridge·I`.
///
/// Motivation: a penalized smooth whose penalty has a polynomial null space
/// that the censored / location-scale data does not pin down (TP / Bernstein
/// trend directions in a survival `time_transform` or `log_sigma` channel,
/// gam#553) leaves a residual living entirely in `ker(H_pen)`. Along such a
/// direction the objective has neither curvature nor a constraint, so it is a
/// genuinely *free* gauge direction rather than an unresolved KKT defect. The
/// total residual inf-norm then stays large indefinitely, the
/// phantom-multiplier refusal never clears, and the fit aborts at REML startup
/// even though the iterate is stationary on the whole identifiable (range)
/// subspace.
///
/// The downstream outer IFT trace already strips the null-space component via
/// the projected pseudo-inverse `U_S·H_proj⁻¹·U_Sᵀ`, so only a *range-space*
/// residual component can bias the envelope gradient (see the "do NOT
/// soft-accept" investigation note at the certifier call site). Returning the
/// range-space inf-norm lets the certifier accept exactly when that part — the
/// only one that matters for outer correctness — is at tolerance, while a real
/// defect (residual mass in the curved subspace) is still refused.
///
/// Returns `None` when `total_p` is zero or disagrees with the residual
/// length, when the penalized Hessian cannot be materialized or
/// eigendecomposed, when its spectrum is degenerate (largest `|eigenvalue|`
/// not finite and positive), or when it has no numerical null space. In all of
/// these cases the caller keeps the strict total-residual refusal (no null
/// space ⇒ range = everything).
fn projected_residual_range_space_inf(
    projected_residual: &Array1<f64>,
    joint_hessian_source: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    total_p: usize,
) -> Option<f64> {
    if total_p == 0 || projected_residual.len() != total_p {
        return None;
    }

    // Assemble a dense, symmetric H_pen.
    let mut h_pen = materialize_joint_hessian_source(
        joint_hessian_source,
        total_p,
        "penalty-null-space certificate spectrum",
    )
    .ok()?;
    let ridge_in_model = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let model_diagonal_ridge = if ridge_in_model { ridge } else { 0.0 };
    // NOTE(review): no joint full-width penalty bundle is passed here (`None`),
    // so any such penalties are absent from this spectrum — confirm intended.
    add_joint_penalty_to_matrix(&mut h_pen, ranges, s_lambdas, model_diagonal_ridge, None);
    symmetrize_dense_in_place(&mut h_pen);

    let (evals, evecs) = FaerEigh::eigh(&h_pen, Side::Lower).ok()?;
    let spectral_scale = evals.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max);
    if !spectral_scale.is_finite() || spectral_scale <= 0.0 {
        return None;
    }

    // Eigenvalues below this relative cutoff are treated as numerically null.
    let cutoff = KKT_REFUSAL_RANK_TOL * spectral_scale;
    let in_null_space = |eigenvalue: f64| eigenvalue.abs() < cutoff;
    if !evals.iter().any(|&eigenvalue| in_null_space(eigenvalue)) {
        // No data-unconstrained null space: the range is the whole space and
        // the strict total-residual refusal already governs — "no relief".
        return None;
    }

    // P_range r = r − P_ker r. Because the eigenbasis is orthonormal, this is
    // rebuilt by summing the residual's coordinates along every eigenvector
    // that is NOT in the numerical null space; ‖P_range r‖∞ is then read off
    // the reconstructed vector.
    let mut range_component = Array1::<f64>::zeros(total_p);
    for (k, &eigenvalue) in evals.iter().enumerate() {
        if in_null_space(eigenvalue) {
            continue;
        }
        let direction = evecs.column(k);
        let coeff = direction.dot(projected_residual);
        range_component.scaled_add(coeff, &direction);
    }

    let range_inf = range_component
        .iter()
        .map(|x: &f64| x.abs())
        .fold(0.0_f64, f64::max);
    Some(range_inf)
}

fn inner_blockwise_fit<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
    warm_start: Option<&ConstrainedWarmStart>,
) -> Result<BlockwiseInnerResult, String> {
    // Inner-blockwise prelude waypoints. At large-scale n the cold-start
    // path between function entry and the first PIRLS/JN cycle-summary
    // log can run for many minutes (sometimes hours) silently while
    // row-kernel workspace builds run. Emit a `[STAGE] PIRLS/inner`
    // line at each transition so the next failed run pinpoints which
    // named step holds time. Gated on large-scale n so small-fit
    // tests stay quiet.
    let inner_started = std::time::Instant::now();
    let mut states = buildblock_states(family, specs)?;
    refresh_all_block_etas(family, specs, &mut states)?;
    let total_joint_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
    let total_joint_n = joint_observation_count(&states);
    const INNER_PRELUDE_LOG_MIN_N: usize = 100_000;
    let prelude_log = total_joint_n >= INNER_PRELUDE_LOG_MIN_N;
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=buildblock_states+refresh_etas elapsed={:.3}s n={} p={} blocks={}",
            inner_started.elapsed().as_secs_f64(),
            total_joint_n,
            total_joint_p,
            specs.len(),
        );
    }
    let matrix_free_joint_requested = use_joint_matrix_free_path(total_joint_p, total_joint_n)
        || family.prefers_matrix_free_inner_joint(specs, &states);
    let has_workspace_source = family.inner_coefficient_hessian_hvp_available(specs);
    // Probe the *spec-aware* joint Hessian: it is the canonical source of the
    // coupled joint curvature. A family may override only
    // `exact_newton_joint_hessian_with_specs` (the variant that has access to
    // the realized block designs needed to assemble the cross-block
    // `X_aᵀ diag(w_ab) X_b` blocks — e.g. the Dirichlet common-parameterization
    // family, whose `evaluate` emits diagonal working sets so the spec-less
    // default block assembler returns `None`). Routing the inner joint-Newton
    // availability gate through the spec-less `exact_newton_joint_hessian`
    // would then mis-classify such a family as "no joint Hessian" and drop it
    // onto pure block-diagonal backfitting, which fails to reach KKT on small,
    // concentrated coupled likelihoods. The `_with_specs` path subsumes the
    // spec-less one for every family (single-block / uncoupled delegate
    // identically), so it is the correct probe here.
    let has_joint_exacthessian = if has_workspace_source {
        true
    } else {
        family
            .exact_newton_joint_hessian_with_specs(&states, specs)?
            .is_some()
    };
    let coupled_exact_joint_required = specs.len() >= 2
        && !family.likelihood_blocks_uncoupled()
        && (family.has_explicit_joint_hessian() || has_workspace_source);
    // Multi-block families have always taken the joint path when an exact
    // joint Hessian is available. Single-block families also take it when a
    // coefficient-Hessian workspace is wired; dense vs. operator form is a
    // later representation choice, not a cache-construction gate.
    let use_joint_newton = has_joint_exacthessian && (specs.len() >= 2 || has_workspace_source);
    let joint_workspace_requested = use_joint_newton && has_workspace_source;
    let inner_tol = options.inner_tol;
    let inner_max_cycles_base = options.inner_max_cycles;
    // Per-outer-call inner-cycle cap. The earlier "adaptive inner cycle
    // cap" doubled this mid-loop on plateaus, but that turned out to be
    // the wrong response to stalled descent (descent ratios pinned at
    // ~0.999 paired with a sub-tolerance objective change is the
    // no-descent signal, not a "give Newton more cycles" signal). The
    // plateau-flat-objective convergence certificate in the inner-cycle
    // body now handles that case directly, so the cap stays fixed at the
    // baseline for the lifetime of this outer call.
    let inner_max_cycles = capped_inner_max_cycles(options, inner_max_cycles_base);
    // Each block's assembled penalty matrix depends only on that block's
    // penalties and smoothing parameters. Build these setup matrices in
    // parallel, but keep the coordinate-descent and line-search loops below
    // strictly serial because each accepted block update changes the state seen
    // by later blocks.
    use rayon::iter::{IntoParallelIterator, ParallelIterator};
    let s_lambdas_launch_started = std::time::Instant::now();
    let s_lambdas_par_iter = (0..specs.len()).into_par_iter().map(|b| {
        let spec = &specs[b];
        let Some(block_log_lambda) = block_log_lambdas.get(b) else {
            return Err(CustomFamilyError::UnsupportedConfiguration {
                reason: format!("missing log-smoothing parameter vector for block {b}"),
            }
            .into());
        };
        if block_log_lambda.len() != spec.penalties.len() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "block {b} log-smoothing parameter length {} does not match penalties {}",
                    block_log_lambda.len(),
                    spec.penalties.len()
                ),
            }
            .into());
        }

        let p = spec.design.ncols();
        let lambdas = block_log_lambda.mapv(f64::exp);
        let mut s_lambda = Array2::<f64>::zeros((p, p));
        for (k, s) in spec.penalties.iter().enumerate() {
            s.add_scaled_to(lambdas[k], &mut s_lambda);
        }
        Ok(s_lambda)
    });
    let s_lambdas_collect_started = std::time::Instant::now();
    let s_lambdas_launch_elapsed = s_lambdas_launch_started.elapsed();
    let s_lambdas = s_lambdas_par_iter.collect::<Result<Vec<_>, String>>()?;
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=s_lambdas par_iter launch={:.3}s collect={:.3}s blocks={} (since inner-start={:.3}s)",
            s_lambdas_launch_elapsed.as_secs_f64(),
            s_lambdas_collect_started.elapsed().as_secs_f64(),
            specs.len(),
            inner_started.elapsed().as_secs_f64(),
        );
    }
    let ridge = effective_solverridge(options.ridge_floor);
    let joint_bundle: Option<&crate::families::joint_penalty::JointPenaltyBundle> =
        options.joint_penalties.as_deref();
    if let Some(bundle) = joint_bundle {
        for (i, spec) in bundle.specs.iter().enumerate() {
            if spec.dim() != total_joint_p {
                return Err(format!(
                    "joint penalty {i}: dim {} != total compiled p {}",
                    spec.dim(),
                    total_joint_p,
                ));
            }
        }
        if bundle.specs.len() != bundle.log_lambdas.len() {
            return Err(format!(
                "joint penalty bundle: {} specs vs {} log_lambdas",
                bundle.specs.len(),
                bundle.log_lambdas.len(),
            ));
        }
    }
    let mut cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; specs.len()];
    if let Some(seed) = warm_start
        && seed.block_beta.len() == states.len()
        && seed.active_sets.len() == states.len()
    {
        if warm_start_matches_block_log_lambdas(seed, block_log_lambdas)
            && let Some(cached) = seed.cached_inner.as_ref()
            && cached.converged
            && seed
                .block_beta
                .iter()
                .zip(&states)
                .all(|(beta_seed, state)| beta_seed.len() == state.beta.len())
        {
            for (state, beta_seed) in states.iter_mut().zip(&seed.block_beta) {
                state.beta.assign(beta_seed);
            }
            cached_active_sets = seed.active_sets.clone();
            refresh_all_block_etas(family, specs, &mut states)?;
            log::info!(
                "[PIRLS/joint-Newton warm-start] reused cached same-rho inner mode | cycles={} logdet_h={:.6e} logdet_s={:.6e}",
                cached.cycles,
                cached.block_logdet_h,
                cached.block_logdet_s,
            );
            return Ok(BlockwiseInnerResult {
                block_states: states,
                active_sets: normalize_active_sets(cached_active_sets),
                log_likelihood: cached.log_likelihood,
                penalty_value: cached.penalty_value,
                cycles: cached.cycles,
                converged: cached.converged,
                block_logdet_h: cached.block_logdet_h,
                block_logdet_s: cached.block_logdet_s,
                s_lambdas,
                joint_workspace: cached.joint_workspace.clone(),
                kkt_residual: cached.kkt_residual.clone(),
                active_constraints: cached.active_constraints.clone(),
            });
        }
        // Cold-start path: copy prior β where dimensions match
        // (best-effort; mismatched blocks keep the freshly-built
        // initial state).
        for (b, beta_seed) in seed.block_beta.iter().enumerate() {
            if beta_seed.len() == states[b].beta.len() {
                let beta_projected =
                    family.post_update_block_beta(&states, b, &specs[b], beta_seed.clone())?;
                states[b].beta.assign(&beta_projected);
            }
        }
        cached_active_sets = seed.active_sets.clone();
        refresh_all_block_etas(family, specs, &mut states)?;
    }
    let load_joint_started = std::time::Instant::now();
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=load_joint_gradient_evaluation begin use_joint_newton={} joint_workspace_requested={} (since inner-start={:.3}s)",
            use_joint_newton,
            joint_workspace_requested,
            inner_started.elapsed().as_secs_f64(),
        );
    }
    let (
        mut current_log_likelihood,
        mut cached_eval,
        mut cached_joint_gradient,
        mut cached_joint_workspace,
    ) = if use_joint_newton {
        let (log_likelihood, gradient, eval, workspace) = load_joint_gradient_evaluation(
            family,
            specs,
            options,
            &states,
            joint_workspace_requested,
            None,
        )?;
        (log_likelihood, eval, gradient, workspace)
    } else {
        let eval = family.evaluate(&states)?;
        let log_likelihood = eval.log_likelihood;
        (log_likelihood, Some(eval), None, None)
    };
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=load_joint_gradient_evaluation end elapsed={:.3}s log_likelihood={:.6e} has_gradient={} has_workspace={}",
            load_joint_started.elapsed().as_secs_f64(),
            current_log_likelihood,
            cached_joint_gradient.is_some(),
            cached_joint_workspace.is_some(),
        );
    }
    // Validate exact-Newton block Hessians at the family-evaluation
    // boundary. A non-finite entry is a contract violation against the
    // family's analytic second derivative; refuse to iterate before
    // any factorization rather than letting it slip through to a
    // downstream logdet check that may be gated off by the outer
    // optimizer's flags.
    let validate_started = std::time::Instant::now();
    if let Some(eval) = cached_eval.as_ref() {
        validate_block_hessians_finite(eval)?;
    }
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=validate_block_hessians_finite elapsed={:.3}s checked={}",
            validate_started.elapsed().as_secs_f64(),
            cached_eval.is_some(),
        );
    }
    let penalty_started = std::time::Instant::now();
    let mut current_penalty = total_quadratic_penalty(
        &states,
        &s_lambdas,
        ridge,
        options.ridge_policy,
        joint_bundle,
        Some(specs),
    );
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=total_quadratic_penalty elapsed={:.3}s penalty={:.6e} (prelude_total={:.3}s)",
            penalty_started.elapsed().as_secs_f64(),
            current_penalty,
            inner_started.elapsed().as_secs_f64(),
        );
    }
    let mut lastobjective = -current_log_likelihood + current_penalty;
    let mut converged = false;
    let mut cycles_done = 0usize;
    // Pre-allocate per-block eta backup buffers to avoid O(n) allocation
    // per block per cycle in the backtracking line search.
    let mut eta_backups: Vec<Array1<f64>> =
        states.iter().map(|s| Array1::zeros(s.eta.len())).collect();

    // ── Joint Newton fast path ──
    //
    // When the family provides an exact joint Hessian (GAMLSS location-scale),
    // solve the full (p_mu + p_ls) × (p_mu + p_ls) system in one Newton step
    // per cycle instead of iterating between blocks. This converges quadratically
    // (5-10 steps) instead of linearly (20-100+ blockwise cycles).
    //
    // Generic block-diagonal surrogate families may still fall back to
    // blockwise iteration if the joint surrogate is unavailable. Families that
    // advertise a real coupled joint Hessian must not: the blockwise loop only
    // sees principal blocks, so it drops the cross-block curvature that makes
    // the joint problem well conditioned near saturated optima.

    // `last_residual_tol` mirrors the per-cycle KKT tolerance computed inside
    // the joint-Newton loop (`inner_tol · (1 + max(‖∇L‖∞, ‖Sβ‖∞))`). It must
    // live at function scope so both the post-converged exit block inside
    // `if use_joint_newton` AND the post-block-fit IFT residual builder
    // outside that branch can thread the same tolerance into the
    // `ProjectedKktResidual::with_metadata(...)` builder. Seed at `inner_tol`
    // so a path that skips the loop entirely (no joint-Newton, or zero
    // cycles) still records a finite, non-NaN tolerance on the residual
    // carrier rather than NaN.
    let mut last_residual_tol: f64 = inner_tol;

    if use_joint_newton {
        // Build block ranges for the joint system.
        let ranges: Vec<(usize, usize)> = {
            let mut offset = 0;
            specs
                .iter()
                .map(|s| {
                    let start = offset;
                    offset += s.design.ncols();
                    (start, offset)
                })
                .collect()
        };
        let total_p: usize = ranges.last().map_or(0, |r| r.1);

        // Universal full-span Jeffreys/Firth robustness. Build `Z_J` once and
        // use the same term in the coupled Newton step, objective value, and
        // stationarity checks so a near-separating coefficient is bounded by
        // the likelihood's own Fisher geometry instead of an ad-hoc ridge.
        // `None` (empty coefficient system) leaves every step and objective at
        // the un-augmented inner Newton.
        //
        // Continuous-response families (the canonical example: transformation-
        // normal h(Y|x) ~ N(0,1)) opt out via
        // `joint_jeffreys_term_required() = false`. They have no separation
        // regime, the Fisher information is `O(n)` on every identified
        // direction by construction, and each Jeffreys evaluation costs
        // `p` directional-derivative calls into the family's exact joint
        // Hessian — at large scale (CTN duchon16d, p=144, n=20000) that
        // is the dominant per-cycle cost (~200 s/cycle on three calls per
        // cycle), exhausting the inner budget before the algorithm converges
        // while contributing essentially zero to the gradient/curvature.
        let joint_jeffreys_subspace = if family.joint_jeffreys_term_required() {
            build_joint_jeffreys_subspace(specs, &ranges)?
        } else {
            None
        };
        // FIRTH MERIT BOOKKEEPING (gam#826/#872 — per-cycle Φ fold, not a carried
        // value). `current_penalty` / `lastobjective` hold ONLY the quadratic
        // penalty `½βᵀSβ` (NO Φ). The Firth value `−Φ` is folded into the
        // accept/reject comparison FRESH at each β under the same
        // `jeffreys_skippable_this_cycle` gate the step and KKT residual use, so
        // `old_objective` (old β) and `trialobjective` (trial β) are always on the
        // same objective `−ℓ + ½βᵀSβ − Φ` regardless of whether a cycle skips the
        // term. Carrying Φ in `current_penalty` (the previous design) desynced
        // old-vs-trial by ±Φ whenever the per-cycle skippable decision flipped —
        // and the cycle-0 baseline folded Φ UNCONDITIONALLY while the trial folded
        // it gated, so a skippable cycle 0 saw a spurious `Δobj = ±Φ`, rejected
        // every backtrack, and refused as a `phantom_multiplier` at a zero step
        // (the binomial location-scale coupled non-convergence). SIGN: Firth ADDS
        // ½log|I| to the log-likelihood ⇒ the NLL objective SUBTRACTS Φ, matching
        // the Newton step rhs / KKT residual which ADD `∇Φ` to `∇L − Sβ`.

        let joint_mode_diagonal_ridge =
            if ridge > 0.0 && options.ridge_policy.include_quadratic_penalty {
                ridge
            } else {
                0.0
            };

        // Exact joint Newton steps are guarded by two independent mechanisms:
        // family-owned feasibility (`max_feasible_step_size`) and the adaptive
        // trust region below. There is intentionally no family hook for a
        // hard per-attempt coefficient-space clamp; keeping the policy local
        // avoids stale no-op configuration and makes the trust-region behavior
        // explicit at the only place it is used.

        // Cross-cycle convergence carry-over: set at the end of every
        // accepted cycle so the next cycle can distinguish a true KKT
        // optimum on a rank-deficient null mode (objective stuck
        // because every direction is along the null space) from
        // genuine non-convergence. The residual signal does not need
        // a carry-over — `residual <= residual_tol` is the canonical
        // KKT certificate and the end-of-cycle test consumes it
        // directly when it fires.

        // Predicted-reduction tracker for the principled trust-region
        // stopping criterion (Conn-Gould-Toint, *Trust-Region Methods*,
        // Theorem 6.4.6). The Newton model at the accepted step has a
        // predicted decrease `m(0) − m(δ) = −g·δ − 0.5·δ·H·δ`. For an
        // unclipped Newton step (H·δ = −g) this is `0.5·g·H⁻¹·g`, the
        // Newton decrement squared / 2. When the model itself predicts
        // a decrease smaller than the objective tolerance, no descent
        // direction the Hessian can resolve will lower the objective
        // by more than `objective_tol`, and continuing is wall-clock
        // waste regardless of whether the raw gradient residual or
        // step-norm gates have closed.
        //
        // Cross-cycle convergence carry-over: set at the end of every
        // accepted cycle so the next cycle's line-search-failure path
        // can distinguish a true KKT optimum on a rank-deficient
        // Hessian (no meaningful trial step, even though step_inf is
        // O(1) along the null mode) from genuine non-convergence.
        let mut last_cycle_residual_below_tol = false;
        let mut last_cycle_obj_change_below_tol = false;

        let mut joint_trust_radius = 1.0_f64;
        let mut joint_block_trust_radii = vec![1.0_f64; ranges.len()];
        let mut last_accepted_hit_joint_trust_boundary = false;
        // Hard upper bound for the for-loop's range. The cap is fixed at
        // `inner_max_cycles` for the lifetime of this outer call (the
        // earlier mid-loop cap extension was removed in favor of the
        // plateau-flat-objective convergence certificate), but the
        // sentinel pattern is retained — the `.max(200)` floor is a
        // harmless safety pad and the explicit `cycle >= inner_max_cycles`
        // break keeps the existing `continue` statements in the body
        // working (they advance `cycle` via the iterator), unlike a
        // `while` + manual-counter rewrite.
        let inner_loop_hard_ceiling = inner_max_cycles.max(200);
        // Verbose cadence for the inner joint-Newton log block. Boring cycles
        // (first-attempt accepts with no convergence event) emit ONE compact
        // one-liner instead of the 4-line pre-cycle/TR/cycle-summary/convergence
        // block. Verbose cycles (first, last, every `JOINT_LOG_VERBOSE_PERIOD`-th,
        // all rejections, convergence events) keep the full detail. At the
        // current period a 200-cycle inner solve emits ~5 cadence-driven
        // detailed waypoints plus 1 compact line per remaining cycle (~210
        // lines), down from ~800.
        const JOINT_LOG_VERBOSE_PERIOD: usize = 50;
        // Residual-stall detector for joint Newton. Distinct from the
        // blockwise loglik-frozen divergence detector lower in the file:
        // that one requires the log-likelihood to be unchanged for K
        // cycles AND the per-block Newton step pinned at the cap.
        //
        // Large-scale survival marginal-slope hits a different pattern —
        // the joint objective decreases monotonically by O(1) per cycle
        // (so loglik is NOT frozen), the TR repeatedly clamps proposals
        // with |prop|∞ >> trust_radius, and the post-step KKT residual
        // oscillates in a band orders of magnitude above residual_tol
        // without trending down. Burning the rest of the cycle budget on
        // this pattern reaches inner_max_cycles "non-converged", which
        // then drops the outer optimizer into the first-order bridge
        // fallback with a stale-mode gradient whose norm (‖g‖ ≈ 10⁷) kills the
        // BFGS line search at iter 0.
        //
        // Track the best residual seen and the number of cycles since
        // any meaningful improvement (≥10% drop). Once we've burned at
        // least RESIDUAL_STALL_MIN_CYCLES with no improvement AND the
        // TR has been clamping aggressively, exit `converged=false` so
        // the outer optimizer sees a non-converged signal while we still
        // have a finite, in-range β to return (instead of running to the
        // hard ceiling and then handing BFGS a junk gradient).
        const RESIDUAL_STALL_NO_IMPROVE_CYCLES: usize = 30;
        const RESIDUAL_STALL_MIN_CYCLES: usize = 40;
        const RESIDUAL_STALL_IMPROVEMENT_FACTOR: f64 = 0.9;
        const RESIDUAL_STALL_BLOCK_GRADIENT_FACTOR: f64 = 50.0;
        let mut best_residual_seen: f64 = f64::INFINITY;
        let mut cycles_since_residual_improved: usize = 0;
        // Number of consecutive non-improving cycles after which the
        // conditioning-based self-vanishing Levenberg–Marquardt damping is
        // ARMED inside the spectral-range Newton solve, for EVERY family
        // (#826/#808). The undamped range-restricted Newton step oscillates on a
        // full-rank-but-ill-conditioned penalized Hessian at the oversmoothed-ρ
        // operating point: the tiny-but-above-cutoff curvature of the lightly
        // identified mean/threshold/wiggle block takes an enormous `component/λ`
        // proposal that the trust region clips every cycle, so the residual on
        // that block freezes while its β stays ≈0 (the exact #826 signature).
        // The conditioning-gated `μ = c·‖∇L − Sβ‖∞` caps that component into a
        // bounded descent step. It is SELF-VANISHING (μ → 0 as the residual → 0)
        // so the converged β and the KKT certificate are byte-identical to the
        // undamped solve — zero REML/LAML bias. Arming it on OBSERVED non-
        // progress rather than a static per-family flag keeps the AFT /
        // constant-scale endgame (which converges quadratically and never
        // stalls) byte-identical: a quadratically-converging solve reaches
        // tolerance in a handful of cycles and never trips this threshold, so μ
        // is never engaged there. Only a genuinely oscillating ill-conditioned
        // solve crosses it, which is exactly when the damping is sound. Set a
        // few cycles below the stall-exit window so the damping gets a chance to
        // rescue the solve well before the early-exit / budget tripwire fires.
        // (The conditioning-gated self-vanishing μ that this threshold armed now
        // lives ONLY in the test-retained `solve_joint_newton_step_on_spectral_range`;
        // the production joint step takes the exact trust-region multiplier λ
        // instead — gam#979.)
        // Recent KKT-residual values (oldest→newest) used to detect STEADY
        // geometric descent at the certificate-refusal gate. A still-converging
        // Newton direction (residual dropping by a steady factor < 1 each cycle)
        // must not be misclassified as a multiplier/null plateau and exited
        // early (gam#787 duchon centers≥20: the logslope block converges
        // geometrically — residual ~0.33×/cycle — but `linearized_rel ≥ 0.5`
        // routed it into the plateau-refusal break a few cycles short of tol).
        const RESIDUAL_DESCENT_WINDOW: usize = 3;
        let mut residual_descent_history: std::collections::VecDeque<f64> =
            std::collections::VecDeque::with_capacity(RESIDUAL_DESCENT_WINDOW);
        let mut tr_clamped_during_stall: bool = false;
        // Fully-rejected stall guard. The residual-stall guard below
        // (post-grad-reload) only fires on cycles that produced an accepted
        // step, because every termination check it gates lives after the
        // `if !accepted { continue; }` exit at the bottom of the trust-region
        // attempt loop. When every cycle in a row is fully rejected — all
        // JOINT_TRUST_MAX_ATTEMPTS trial steps fail the line-search check —
        // none of those guards ever see the iterate, the cycle loop spins
        // up to `inner_loop_hard_ceiling` cycles, and the inner solver burns
        // ~120 s of wall-clock per outer ρ-evaluation that the outer
        // optimizer will reject anyway. The signature is exact and local:
        // (i) every trust attempt this cycle was rejected on the actual
        // objective check (`objective_rejects == JOINT_TRUST_MAX_ATTEMPTS`,
        // `model_rejects == 0`, `likelihood_rejects == 0`), AND (ii) the joint
        // trust radius has NOT shrunk relative to the previous fully-rejected
        // cycle. Condition (ii) is what proves no progress is possible: β is
        // reverted to its pre-cycle value on every fully-rejected cycle, so
        // with an identical Newton system AND an identical trust radius the
        // next cycle's trust-region search is byte-deterministically the
        // same as this one's. The radius can stall above the 1e-12 floor
        // when `shrink_active_joint_block_trust_radii` only shrinks blocks
        // that hit their per-block boundary — an interior block keeps its
        // radius forever, so `max(block_radii)` is held by that block while
        // the boundary block's radius collapses to 1e-12 without changing
        // the max. After `FULLY_REJECTED_STALL_MAX_CYCLES` consecutive cycles
        // with both conditions, exit non-converged so the outer optimizer
        // rejects this ρ cleanly instead of waiting for the cycle cap.
        const FULLY_REJECTED_STALL_MAX_CYCLES: usize = 8;
        let mut prev_rejected_trust_radius: Option<f64> = None;
        let mut consecutive_held_rejected_cycles: usize = 0;
        let mut last_joint_math: Option<JointNewtonMathDiagnostic> = None;
        // Cross-cycle cache of the joint Jeffreys/Firth triple `(β_key, ∇Φ, H_Φ)`
        // (gam#729/#826/#808). Computing `(∇Φ, H_Φ)` costs `p` family
        // directional-derivative calls plus the `½ S Sᵀ` GEMM; for a K-block
        // coupled family that is the dominant per-inner-cycle cost. The post-step
        // KKT residual recomputes the triple at the just-accepted β; the NEXT
        // cycle's head needs the SAME triple at that SAME β. Carry it forward
        // keyed on the flattened β so the head reuses the post-step result instead
        // of recomputing — collapsing two O(p)-directional-derivative evaluations
        // per accepted cycle to one. The key is an exact-equality check on the
        // flattened β (β is byte-identical between an accepted post-step residual
        // and the next head), so the reused term is the exact term at the current
        // iterate — no staleness, no tolerance fudge.
        let mut jeffreys_triple_cache: Option<(Array1<f64>, Array1<f64>, Array2<f64>)> = None;
        // Stash for the structured cert-REFUSED report computed inside the
        // cycle loop, so the post-loop bubbled error (`coupled exact-joint
        // inner solve exited the joint Newton path …`) can emit the same
        // per-block + spectrum breakdown without re-materializing H_pen.
        let mut last_kkt_refusal_report: Option<KktRefusalReport> = None;
        let mut prev_kkt_norm: Option<f64> = None;
        // Plateau streak on |Δobj| ≤ objective_tol. The scale-aware
        // flatness predicate stays local to this loop; the streak/window
        // discipline (grow on flat, reset on recovery) is the shared
        // loop_guard::FlatStreak so it cannot drift from the other
        // stagnation detectors in the tree (#968).
        let mut obj_flat_streak = crate::solver::loop_guard::FlatStreak::new(
            crate::solver::loop_guard::PLATEAU_DEFAULT_WINDOW,
        );
        // Total descent budget across the joint-Newton loop, used by
        // the end-of-loop summary to report `descent_total`.
        let initial_joint_objective: f64 = lastobjective;
        // Per-cycle |Δobjective| history for the geometric-tail trigger of
        // the constrained-stationary certificate below. When the cycles
        // settle into a linear-rate plateau (|Δobj_next| / |Δobj_prev|
        // approaching 1 monotonically over the window), the total
        // *remaining* objective descent is rigorously bounded above by the
        // geometric series sum |Δobj_now| / (1 − max_ratio). When that
        // bound is below `objective_tol` the cert can fire many cycles
        // earlier than waiting for any single |Δobj| to individually
        // cross obj_tol — the bound is mathematically the same precision
        // contract, applied to the asymptotic tail rather than one step.
        const GEOMETRIC_TAIL_WINDOW: usize = 5;
        let mut geometric_tail_history: std::collections::VecDeque<f64> =
            std::collections::VecDeque::with_capacity(GEOMETRIC_TAIL_WINDOW);

        // The exact joint-Hessian route solves the penalized Newton system
        // directly. Extra damping must be wired through an accepted/rejected
        // step policy before it belongs here; keep the matvec faithful to the
        // objective until then.
        for cycle in 0..inner_loop_hard_ceiling {
            if cycle >= inner_max_cycles {
                break;
            }
            let verbose_cycle = cycle == 0
                || cycle + 1 == inner_max_cycles
                || (cycle + 1) % JOINT_LOG_VERBOSE_PERIOD == 0;
            // Pre-cycle header line removed: the post-cycle one-liner below
            // carries cycle/objective/Δobj/step/residual/time and on verbose
            // cadence the expanded convergence line additionally carries
            // -loglik and penalty. Suppressing this avoids emitting a second
            // info-level line per cycle just to repeat numbers we already
            // log at end of cycle.
            // Per-cycle phase-timing accumulators. Surface where the inner
            // joint-Newton spends time so an 18-min silent cycle 0 (the
            // bernoulli marginal-slope FLEX large-scale failure mode) becomes a
            // logged timeline at the end of the cycle. Phases:
            //   * hessian: joint Hessian source build (matrix-free workspace
            //     OR dense fallback assembly)
            //   * pcg:     matrix-free QP solve via solve_spd_pcg_with_info_into
            //              (already logs its own diagnostics; we accumulate
            //              here for the end-of-cycle summary)
            //   * line_search: backtracking step-size search (up to 8 attempts)
            //   * grad_reload: post-accept joint gradient + workspace refresh
            let cycle_started = std::time::Instant::now();
            // Top-of-cycle row-measure capture. The trust-region ratio
            // ρ = [F(β) − F(β + δ)] / [−g·δ − ½·δᵀHδ] is only meaningful when
            // every input (Hessian, gradient, objective at β, trial objective
            // at β + δ) is evaluated against the same row measure. We freeze
            // the measure here and re-read it at each of the four sites later
            // in the cycle, then hard-fail (Err) just before ρ if any of them
            // diverged. Cf. `src/solver/row_measure.rs`.
            let tr_row_measure_top =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            let hessian_started = std::time::Instant::now();
            log::info!(
                "[joint-newton-tr] phase=hessian_qp cycle={} r={:.3e}",
                cycle,
                joint_trust_radius,
            );
            let cycle_log = prelude_log;
            let constraints_started = std::time::Instant::now();
            let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
            let joint_constraints =
                assemble_joint_linear_constraints(&block_constraints, &ranges, total_p)?;
            if cycle_log && cycle == 0 {
                log::info!(
                    "[STAGE] PIRLS/inner step=cycle0 block+joint constraints elapsed={:.3}s n={} p={}",
                    constraints_started.elapsed().as_secs_f64(),
                    total_joint_n,
                    total_p,
                );
            }
            let workspace_build_started = std::time::Instant::now();
            // Get joint Hessian and block gradients from the current evaluation.
            let hessian_workspace_for_cycle: Option<Arc<dyn ExactNewtonJointHessianWorkspace>> =
                None;
            let joint_hessian_source = if joint_workspace_requested {
                let cached_hit = cached_joint_workspace.is_some();
                let workspace = match cached_joint_workspace.take() {
                    Some(workspace) => Some(workspace),
                    None => family.exact_newton_joint_hessian_workspace_with_options(
                        &states, specs, options,
                    )?,
                };
                if cycle_log && cycle == 0 {
                    log::info!(
                        "[STAGE] PIRLS/inner step=cycle0 hessian-workspace cached_hit={} elapsed={:.3}s n={} p={}",
                        cached_hit,
                        workspace_build_started.elapsed().as_secs_f64(),
                        total_joint_n,
                        total_p,
                    );
                }
                workspace
                    .as_ref()
                    .map(|workspace| {
                        exact_newton_joint_hessian_source_from_workspace(
                            workspace,
                            total_p,
                            MaterializationIntent::InnerSolve,
                            "joint Newton inner exact-newton operator mismatch",
                        )
                    })
                    .transpose()?
                    .flatten()
            } else {
                None
            };
            // Row measure observed by the Hessian build above.
            let tr_row_measure_hessian =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            let joint_hessian_source = match joint_hessian_source {
                Some(source) => source,
                None => {
                    // Spec-aware joint Hessian: canonical coupled-curvature
                    // source (see the availability gate above). Families that
                    // only override `_with_specs` (Dirichlet common-parameter)
                    // would otherwise hand back `None` from the spec-less
                    // default and silently drop off the joint-Newton path.
                    let h_joint_opt =
                        family.exact_newton_joint_hessian_with_specs(&states, specs)?;
                    let Some(h_joint) = h_joint_opt else {
                        break; // Fall back to blockwise if joint Hessian unavailable
                    };
                    match symmetrized_square_matrix(
                        h_joint,
                        total_p,
                        "joint Newton inner exact-newton Hessian shape mismatch",
                    ) {
                        Ok(matrix) => JointHessianSource::Dense(matrix),
                        Err(_) => break,
                    }
                }
            };

            // Concatenate block gradients and betas.
            let Some(grad_joint) = cached_joint_gradient.clone() else {
                break;
            };
            // Row measure observed by the gradient at β. `cached_joint_gradient`
            // was loaded earlier under `options`; if the auto-subsample
            // installer or any sibling path swapped the mask between then and
            // now, the id captured here will diverge from the rest and the
            // pre-ρ check below will Err. Cf. `src/solver/row_measure.rs`.
            let tr_row_measure_gradient =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            if grad_joint.len() != total_p {
                break;
            }
            let mut beta_joint = Array1::<f64>::zeros(total_p);
            for b in 0..specs.len() {
                let (start, end) = ranges[b];
                beta_joint
                    .slice_mut(ndarray::s![start..end])
                    .assign(&states[b].beta);
            }

            let trace_diagonal_ridge = joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE;
            let joint_hessian_is_dense =
                matches!(&joint_hessian_source, JointHessianSource::Dense(_));
            let joint_solver_diagonal_ridge = stabilized_joint_solver_diagonal_ridge(
                family,
                &joint_hessian_source,
                &ranges,
                &s_lambdas,
                trace_diagonal_ridge,
                options.ridge_floor,
                joint_bundle,
            );
            // CHEAP CONDITIONING PRE-CHECK (always-on robustness, zero-cost on
            // easy/large fits). Before paying for the dense joint-Hessian
            // materialization + `O(p³)` reduced eigendecomposition inside the
            // Jeffreys term, ask whether the term is PROVABLY skippable from a few
            // matrix-free Hessian-vector products against the source we just built.
            // When `true`, the exact conditioning gate is certain to return the
            // zero term, so every Jeffreys call this cycle short-circuits to the
            // exact-zero contribution WITHOUT forming anything dense — byte-
            // identical to the gated-off path, and preserving the matrix-free path
            // on wide well-conditioned fits. Only runs the estimate when a Jeffreys
            // subspace exists and `total_p` is wide enough that the dense eigh is
            // the cost we want to avoid (the helper itself gates on the size
            // threshold and conservatively returns `false` if unsure). Computed
            // once per inner cycle and reused across the cycle's head-KKT, step,
            // and trial-value calls; the conditioning changes slowly across cycles
            // so re-estimating per cycle (one `O(p·k)` burst) is already cheap
            // against the work it guards.
            let jeffreys_skippable_this_cycle: bool = if options.seed_screening {
                // Seed screening only ranks seeds: skip the O(p · per-axis-Hdot)
                // full Jeffreys gradient/curvature loop. The value-only Jeffreys
                // term (folded into the objective baseline / trial penalties via
                // `custom_family_joint_jeffreys_value`, gated independently on
                // `joint_jeffreys_subspace.is_some()`) still bounds the screening
                // score on separating directions; only the per-axis step curvature
                // — the wrong cost class for ranking on a K-block coupled family —
                // is dropped here (gam#729/#808).
                true
            } else if joint_jeffreys_subspace.is_some() {
                jeffreys_term_skippable_for_source(&joint_hessian_source, total_p).unwrap_or(false)
            } else {
                false
            };
            let joint_trust_metric_diag = match &joint_hessian_source {
                JointHessianSource::Dense(h_joint) => joint_penalty_preconditioner_diag(
                    &h_joint.diag().to_owned(),
                    &ranges,
                    &s_lambdas,
                    joint_solver_diagonal_ridge,
                    joint_bundle,
                ),
                JointHessianSource::Operator { diagonal, .. } => joint_penalty_preconditioner_diag(
                    diagonal,
                    &ranges,
                    &s_lambdas,
                    joint_solver_diagonal_ridge,
                    joint_bundle,
                ),
            };
            // HEAD-β JEFFREYS CACHE (gam#729/#808). The full Jeffreys/Firth triple
            // `(Φ, ∇Φ, H_Φ)` costs `p` family directional-derivative calls (the
            // `for k in 0..p` loop in `joint_jeffreys_term`); for a K-block coupled
            // family (Dirichlet/multinomial) that is the dominant per-cycle cost.
            // The head-of-cycle KKT residual, the constrained-QP step, and the
            // spectral/dense Newton step are ALL built at the SAME cycle-start β
            // (`&states`, before any step is accepted), so they need the SAME
            // triple. Compute it ONCE here and reuse, instead of three independent
            // O(p)-directional-derivative evaluations per cycle. The post-step
            // residual below is at the accepted β, so it correctly recomputes.
            // `None` when the term is condition-gated/skippable (∇Φ=0, H_Φ=0).
            let head_beta_key: Array1<f64> = flatten_state_betas(&states, specs);
            let head_jeffreys_term: Option<(Array1<f64>, Array2<f64>)> =
                if jeffreys_skippable_this_cycle {
                    None
                } else if let Some((_, grad_phi, hphi)) = jeffreys_triple_cache
                    .as_ref()
                    .filter(|(key, _, _)| *key == head_beta_key)
                {
                    // Cross-cycle cache hit: the previous cycle's post-step KKT
                    // residual already computed the exact triple at this β. Reuse.
                    Some((grad_phi.clone(), hphi.clone()))
                } else if let Some(z_joint) = joint_jeffreys_subspace.as_ref() {
                    let term = match custom_family_joint_jeffreys_term(
                        family, &states, specs, &ranges, z_joint,
                    )? {
                        Some((_phi, grad_phi, hphi))
                            if grad_phi.len() == grad_joint.len()
                                && hphi.nrows() == total_p
                                && hphi.ncols() == total_p =>
                        {
                            Some((grad_phi, hphi))
                        }
                        _ => None,
                    };
                    if let Some((grad_phi, hphi)) = term.as_ref() {
                        jeffreys_triple_cache =
                            Some((head_beta_key.clone(), grad_phi.clone(), hphi.clone()));
                    }
                    term
                } else {
                    None
                };
            // Fold the Firth/Jeffreys score `∇Φ` into the head-of-cycle KKT
            // residual when the term is armed, for the same reason as the
            // post-step residual below: the inner objective is `−ℓ + ½βᵀSβ − Φ`,
            // so the certifiable stationarity is `∇L − Sβ + ∇Φ = 0`. Without
            // this the head-of-cycle KKT exit (`current_stationarity_residual ≤
            // residual_tol`) can never fire on the near-separating span, even
            // when the iterate is the Firth optimum. No-op when the Jeffreys
            // term is unavailable or condition-gated to zero.
            let head_kkt_gradient: Option<Array1<f64>> = head_jeffreys_term
                .as_ref()
                .map(|(grad_phi, _hphi)| &grad_joint + grad_phi);
            let current_kkt_norm = exact_newton_joint_stationarity_inf_norm_from_gradient(
                head_kkt_gradient.as_ref().unwrap_or(&grad_joint),
                &states,
                specs,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                &block_constraints,
                Some(cached_active_sets.as_slice()),
            )?;
            let pcg_rel_tol = joint_pcg_eisenstat_walker_forcing(prev_kkt_norm, current_kkt_norm);

            let solve_joint_constraints_dense = joint_constraints.is_some()
                || !matrix_free_joint_requested
                || joint_hessian_is_dense;
            // Exact trust-region subproblem factorization (gam#979). Populated on
            // the unconstrained dense-spectral path with the metric-whitened
            // eigendecomposition of the penalized Hessian, so the trust loop below
            // re-solves the *exact* Moré–Sorensen subproblem at each trust radius
            // from one factorization — replacing the dogleg/Cauchy/box-truncation
            // globalization with the single object they all approximate. `None` on
            // the constrained-QP and matrix-free PCG paths, which keep their
            // existing globalization untouched.
            let mut joint_spectrum: Option<whitened_spectrum::WhitenedHessianSpectrum> = None;
            let (candidate_beta, joint_active_set, joint_step_spectral_nullity) =
                if solve_joint_constraints_dense
                    && let Some(constraints) = joint_constraints.as_ref()
                {
                    let mut lhs = match materialize_joint_hessian_source(
                        &joint_hessian_source,
                        total_p,
                        "joint Newton inner constrained Hessian materialization",
                    ) {
                        Ok(matrix) => matrix,
                        Err(_) => break,
                    };
                    add_joint_penalty_to_matrix(
                        &mut lhs,
                        &ranges,
                        &s_lambdas,
                        trace_diagonal_ridge,
                        joint_bundle,
                    );
                    if joint_solver_diagonal_ridge != trace_diagonal_ridge {
                        for d in 0..lhs.nrows() {
                            lhs[[d, d]] += joint_solver_diagonal_ridge - trace_diagonal_ridge;
                        }
                    }
                    check_linear_feasibility(&beta_joint, constraints, 1e-8)
                        .map_err(|e| format!("joint Newton constrained solve: {e}"))?;
                    let warm_joint_active =
                        flatten_joint_active_set(&cached_active_sets, &block_constraints);
                    let lower_bounds = match extract_simple_lower_bounds(constraints, total_p) {
                        Ok(bounds) => bounds,
                        Err(_) => break,
                    };
                    // Newton IRLS step in absolute-β space:
                    //
                    //   β_new = H_pen⁻¹ (H_L β + ∇ℓ)
                    //
                    // where H_pen = H_L + S, derived from Newton's update
                    //   β_new = β + H_pen⁻¹(∇ℓ − Sβ)
                    //         = H_pen⁻¹(H_pen β + ∇ℓ − Sβ)
                    //         = H_pen⁻¹(H_L β + ∇ℓ).
                    //
                    // The QP `min 0.5 β' H_pen β − rhs_beta' β` has unconstrained
                    // optimum β = H_pen⁻¹ rhs_beta, so rhs_beta = H_pen β + (∇ℓ − Sβ)
                    // gives the correct Newton update. Passing raw grad_joint (=∇ℓ)
                    // would collapse to β = H_pen⁻¹ ∇ℓ, which at the true optimum
                    // (∇ℓ = Sβ̂) gives H_pen⁻¹ Sβ̂ ≠ β̂ — wrong fixed point.
                    let penalty_beta_joint = apply_joint_block_penalty(
                        &ranges,
                        &s_lambdas,
                        &beta_joint,
                        joint_mode_diagonal_ridge,
                        joint_bundle,
                    );
                    let mut rhs_step = &grad_joint - &penalty_beta_joint;
                    // Reuse the head-β Jeffreys triple (consistently attenuated in
                    // `head_jeffreys_term` — both ∇Φ and H_Φ scaled by one scalar,
                    // gam#826/#872/#715). Skipped when the cheap pre-check certifies
                    // well-conditioning: ∇Φ = 0 and H_Φ = 0 there, so neither
                    // rhs_step nor lhs change.
                    // PSD PROJECTION (gam#979). The exact divided-difference H_Φ is
                    // indefinite exactly where Φ is (mixed-sign reduced spectrum at
                    // off-mode trial points). The unconstrained dense-spectral path
                    // consumes it exactly — the Moré–Sorensen subproblem handles
                    // indefiniteness rigorously — but THIS active-set QP requires a
                    // convex model (an indefinite QP cycles its active set and the
                    // inner grinds the budget). Use the PSD part of H_Φ here: honest
                    // magnitudes (unlike the old `K²` vec-Gram phantom), guaranteed
                    // solvable QP, and the exact ∇Φ in the rhs keeps the fixed point
                    // unchanged — only the convergence rate on indefinite stretches
                    // degrades to the damped-Newton rate the constrained path always
                    // had.
                    if let Some((grad_phi, hphi)) = head_jeffreys_term.as_ref()
                        && grad_phi.len() == rhs_step.len()
                    {
                        rhs_step += grad_phi;
                        lhs += &symmetric_psd_projection(hphi);
                    }
                    // Self-vanishing Levenberg–Marquardt damping for the
                    // CONSTRAINED active-set QP, mirroring the spectral-range
                    // branch below (μ = JOINT_SPECTRAL_LEVENBERG_FACTOR·‖rhs‖∞).
                    //
                    // When the joint design carries inequality constraints
                    // (the monotone I-spline time-warp of a survival
                    // location-scale / AFT fit) the spectral range step that
                    // drops ker(H_pen) is NOT taken — this dense active-set QP
                    // runs instead. On a constant-scale AFT the 12-col monotone
                    // time-warp's non-affine deviation is statistically
                    // UNIDENTIFIED, so H_pen is rank-deficient along that gauge
                    // direction. An undamped QP then has a continuum of optima
                    // differing only by the free gauge component, and the
                    // active set slides along the monotone constraint face
                    // taking an O(1) proposal step in that direction every
                    // cycle. The proposal `step_inf` never exhausts, so the
                    // identified-subspace KKT certificate (gated on
                    // `step_inf ≤ step_tol`) never fires and the inner
                    // joint-Newton grinds the full `inner_max_cycles` on EVERY
                    // outer ρ-eval — the survival-LS AFT "hang" (#736/#735/#721).
                    //
                    // Adding μ·I to the QP Hessian gives ker(H_pen) a tiny
                    // positive curvature, so the constrained minimizer is unique
                    // and its gauge component is driven toward zero; the proposal
                    // step then exhausts at the identified-subspace optimum and
                    // the certificate fires in a handful of cycles. Because
                    // μ ∝ ‖∇L − Sβ‖∞ → 0 at the KKT fixed point, the converged β
                    // and the well-identified flexible-scale fast path (where the
                    // time-warp IS identified and H_pen is non-singular) are
                    // unchanged — a genuinely flexible survival-LS fit still
                    // performs its full search.
                    //
                    // CRITICAL: the floor is only correct on a genuinely
                    // rank-deficient `H_pen`. Gate it strictly on
                    // `nullity > 0`. On a FULLY IDENTIFIED constrained fit
                    // (e.g. the post-reduction constant-scale loglogistic AFT,
                    // #736/#735/#721/#733/#734 — a 3-parameter model with
                    // block_widths = [1,1,1] and an empty `ker(H_pen)`) the QP
                    // minimizer is already unique, so the floor adds nothing it
                    // is needed for but everything it costs: with residual r and
                    // factor 1e-3 the floor is μ≈1e-3·r, and on an unpenalized
                    // location intercept whose likelihood curvature H is small
                    // at n=23 the damped Newton component shrinks the residual
                    // only by the GEOMETRIC ratio H/(H+μ) per cycle instead of
                    // quadratically. With μ≈1e-6 and a small H that ratio is far
                    // from 1, so the threshold-block stationarity residual
                    // plateaus at ~1e-3–1e-4 and the inner solve burns its whole
                    // cycle budget without ever reaching `residual_tol`. The
                    // self-vanishing μ→0 is too slow because it vanishes only as
                    // fast as the residual it is throttling. Disabling the floor
                    // when `nullity == 0` makes the constrained QP solve the
                    // EXACT undamped Newton/KKT system, recovering quadratic
                    // convergence to `residual_tol` in a handful of cycles. The
                    // rank-deficient case (`nullity > 0`, the pre-reduction
                    // unidentified time-warp gauge) keeps the floor and its hang
                    // fix unchanged. `None` (eigensolve failed / zero Hessian)
                    // falls back to the damped path conservatively.
                    let hpen_nullity = symmetric_penalized_hessian_nullity(&lhs);
                    let apply_constrained_floor = hpen_nullity.map(|n| n > 0).unwrap_or(true);
                    let rhs_inf = rhs_step.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
                    let constrained_levenberg_mu = JOINT_SPECTRAL_LEVENBERG_FACTOR * rhs_inf;
                    if apply_constrained_floor
                        && constrained_levenberg_mu > 0.0
                        && constrained_levenberg_mu.is_finite()
                    {
                        for d in 0..lhs.nrows() {
                            lhs[[d, d]] += constrained_levenberg_mu;
                        }
                    }
                    let rhs_beta = &lhs.dot(&beta_joint) + &rhs_step;
                    let solve_result = if let Some(bounds) = lower_bounds.as_ref() {
                        solve_quadratic_with_simple_lower_bounds(
                            &lhs,
                            &rhs_beta,
                            &beta_joint,
                            bounds,
                            warm_joint_active.as_deref(),
                        )
                    } else {
                        solve_quadratic_with_linear_constraints(
                            &lhs,
                            &rhs_beta,
                            &beta_joint,
                            constraints,
                            warm_joint_active.as_deref(),
                        )
                        .map_err(|e| e.to_string())
                    };
                    match solve_result {
                        Ok((beta_new, active_set)) => (beta_new, Some(active_set), 0usize),
                        Err(_) => break,
                    }
                } else {
                    // Newton RHS: rhs = ∇ℓ − S_λβ, i.e. the negated stationarity
                    // residual (S_λβ − ∇ℓ) of the penalized NLL.
                    let penalty_beta = apply_joint_block_penalty(
                        &ranges,
                        &s_lambdas,
                        &beta_joint,
                        joint_mode_diagonal_ridge,
                        joint_bundle,
                    );
                    let mut rhs = &grad_joint - &penalty_beta;
                    // Universal robustness: fold the family-general
                    // Jeffreys/Firth curvature `H_Φ` and score `∇Φ` into BOTH the
                    // matrix-free PCG step AND the dense spectral fallback below,
                    // scoped to the full-span basis `Z_J`. Computed ONCE here
                    // so the matvec closure and the RHS share the SAME term and the
                    // fallback does not recompute it. The inner objective is
                    // `−ℓ + ½βᵀSβ − Φ`, so the Newton system the step must solve is
                    //   (H + S_λ + H_Φ) δ = (∇ℓ − S_λβ) + ∇Φ.
                    // Previously the PCG matvec applied only `H + S_λ` and its RHS
                    // omitted `∇Φ`, so on the matrix-free path (large p / large n)
                    // Firth was a SILENT NO-OP: the proper-prior never reached the
                    // step that actually moves β, leaving separation/under-
                    // identification uncured exactly where the dense route is not
                    // taken. The dense route (small p, e.g. BMS p≈51) was already
                    // correct. `H_Φ` is the full-span Gauss-Newton surrogate
                    // `½ J H_id⁻¹ Jᵀ` (Z_J = identity ⇒ p×p, not low-rank), but the
                    // conditioning gate in `joint_jeffreys_term` returns the zero
                    // term on every well-conditioned fit, so this only arms on the
                    // near-separating span
                    // — and `hphi` is materialized once per cycle regardless, so the
                    // matvec adds only one O(p²) HVP, preserving the matrix-free
                    // path's asymptotics where Firth is negligible (term = `None`).
                    // Cheap pre-check certified well-conditioned ⇒ the exact term
                    // is the zero contribution (∇Φ = 0, H_Φ = 0). Short-circuit to
                    // `None` WITHOUT materializing the dense joint Hessian or running
                    // the O(p³) reduced eigendecomposition — this is the matrix-free
                    // PCG hot path, where forming a dense p×p H_Φ every cycle was the
                    // regression. Byte-identical to the gated-off dense path: `rhs`
                    // is left as `∇ℓ − S_λβ` and no H_Φ is folded into the matvec.
                    // Reuse the head-β Jeffreys triple (computed once this cycle);
                    // this Newton step is built at the same cycle-start β.
                    let inner_jeffreys_term: Option<(Array1<f64>, Array2<f64>)> =
                        match head_jeffreys_term.as_ref() {
                            Some((grad_phi, hphi)) if grad_phi.len() == rhs.len() => {
                                rhs += grad_phi;
                                Some((grad_phi.clone(), hphi.clone()))
                            }
                            _ => None,
                        };
                    // PSD PROJECTION for the SPD-PCG matvec (gam#979): the exact
                    // divided-difference H_Φ can be indefinite at off-mode trial
                    // points, which breaks the SPD-CG contract. The matvec uses its
                    // PSD part; the dense spectral fallback below keeps the EXACT
                    // (possibly indefinite) H_Φ — the Moré–Sorensen subproblem
                    // handles it rigorously.
                    let inner_jeffreys_hphi: Option<Arc<Array2<f64>>> = inner_jeffreys_term
                        .as_ref()
                        .map(|(_grad_phi, hphi)| Arc::new(symmetric_psd_projection(hphi)));
                    let pcg_started = std::time::Instant::now();
                    let pcg_requested = matrix_free_joint_requested && !joint_hessian_is_dense;
                    let mut spectral_nullity_for_step = 0usize;
                    let mut delta = if pcg_requested {
                        let preconditioner_diag = match &joint_hessian_source {
                            JointHessianSource::Dense(h_joint) => {
                                joint_penalty_preconditioner_diag(
                                    &h_joint.diag().to_owned(),
                                    &ranges,
                                    &s_lambdas,
                                    joint_solver_diagonal_ridge,
                                    joint_bundle,
                                )
                            }
                            JointHessianSource::Operator { diagonal, .. } => {
                                joint_penalty_preconditioner_diag(
                                    diagonal,
                                    &ranges,
                                    &s_lambdas,
                                    joint_solver_diagonal_ridge,
                                    joint_bundle,
                                )
                            }
                        };
                        // Pre-allocate the penalty workspace ONCE outside the
                        // PCG closure so each CG iter (called hundreds-to-
                        // thousands of times per outer iter at large scale)
                        // reuses the buffer instead of allocating per call.
                        // RefCell because solve_spd_pcg* expects `Fn` (immutable
                        // borrow of captures) and we need interior mutability
                        // to write into the workspace.
                        let penalty_workspace = RefCell::new(Array1::<f64>::zeros(total_p));
                        // Capture the Jeffreys/Firth curvature for the matvec. When
                        // armed (and nonzero past the conditioning gate) the PCG
                        // operator becomes `H + S_λ + H_Φ`, matching the augmented
                        // RHS `(∇ℓ − S_λβ) + ∇Φ` set above and the dense spectral
                        // fallback. `None` keeps the unaugmented matvec.
                        let pcg_hphi_dense = inner_jeffreys_hphi.clone();
                        let pcg_hphi_op = inner_jeffreys_hphi.clone();
                        match &joint_hessian_source {
                            JointHessianSource::Dense(h_joint) => {
                                crate::linalg::utils::solve_spd_pcg_with_info_into(
                                    |v, out| {
                                        // h_joint * v -> out (faer-backed, no alloc)
                                        crate::faer_ndarray::fast_av_view_into(
                                            h_joint,
                                            v,
                                            out.view_mut(),
                                        );
                                        let mut pen = penalty_workspace.borrow_mut();
                                        apply_joint_block_penalty_into(
                                            &ranges,
                                            &s_lambdas,
                                            v,
                                            joint_solver_diagonal_ridge,
                                            &mut pen,
                                            joint_bundle,
                                        );
                                        *out += &*pen;
                                        if let Some(hphi) = pcg_hphi_dense.as_ref() {
                                            *out += &hphi.dot(v);
                                        }
                                    },
                                    &rhs,
                                    &preconditioner_diag,
                                    pcg_rel_tol,
                                    JOINT_PCG_MAX_ITER_MULTIPLIER * total_p.max(1),
                                )
                                .map(|(solution, info)| {
                                    log_joint_pcg_diagnostics(
                                        cycle,
                                        total_p,
                                        total_joint_n,
                                        &preconditioner_diag,
                                        &info,
                                    );
                                    solution
                                })
                            }
                            JointHessianSource::Operator { apply_into, .. } => {
                                let apply_h_into = Arc::clone(apply_into);
                                crate::linalg::utils::solve_spd_pcg_with_info_into(
                                    |v, out| {
                                        if let Err(error) = apply_h_into(v, out) {
                                            log::warn!(
                                                "joint Newton inner operator matvec failed: {error}"
                                            );
                                            out.fill(0.0);
                                        }
                                        let mut pen = penalty_workspace.borrow_mut();
                                        apply_joint_block_penalty_into(
                                            &ranges,
                                            &s_lambdas,
                                            v,
                                            joint_solver_diagonal_ridge,
                                            &mut pen,
                                            joint_bundle,
                                        );
                                        *out += &*pen;
                                        if let Some(hphi) = pcg_hphi_op.as_ref() {
                                            *out += &hphi.dot(v);
                                        }
                                    },
                                    &rhs,
                                    &preconditioner_diag,
                                    pcg_rel_tol,
                                    JOINT_PCG_MAX_ITER_MULTIPLIER * total_p.max(1),
                                )
                                .map(|(solution, info)| {
                                    log_joint_pcg_diagnostics(
                                        cycle,
                                        total_p,
                                        total_joint_n,
                                        &preconditioner_diag,
                                        &info,
                                    );
                                    solution
                                })
                            }
                        }
                    } else {
                        None
                    };
                    if pcg_requested {
                        log::info!(
                            "[PIRLS/joint-PCG] cycle {:>3} | n={} p={} solved={} elapsed={:.3}s",
                            cycle,
                            total_joint_n,
                            total_p,
                            delta.is_some(),
                            pcg_started.elapsed().as_secs_f64()
                        );
                    }
                    if delta.is_none() {
                        if pcg_requested {
                            break;
                        }
                        let mut lhs_true = match materialize_joint_hessian_source(
                            &joint_hessian_source,
                            total_p,
                            "joint Newton inner dense fallback Hessian materialization",
                        ) {
                            Ok(matrix) => matrix,
                            Err(_) => break,
                        };
                        add_joint_penalty_to_matrix(
                            &mut lhs_true,
                            &ranges,
                            &s_lambdas,
                            joint_mode_diagonal_ridge,
                            joint_bundle,
                        );
                        // Universal robustness: add the
                        // family-general Jeffreys curvature `H_Phi` to the
                        // penalized Hessian. This is the Tier-B coupled-Newton form
                        // of Firth: the reduced Fisher information `Z_J^T H Z_J`
                        // supplies the missing O(n) curvature that bounds a
                        // near-separating coefficient to O(1). When the Jeffreys
                        // term is unavailable, the step stays unaugmented.
                        //
                        // `∇Φ` is NOT re-added here: `rhs` (and thus `spectral_rhs`)
                        // already carries `+∇Φ` from the single shared computation
                        // above, and we REUSE that same `H_Φ` here rather than
                        // recomputing the (O(p) directional-derivative) term — the
                        // dense fallback and the matrix-free PCG step now solve the
                        // SAME Jeffreys-augmented Newton system.
                        let spectral_rhs = rhs.clone();
                        if let Some((_grad_phi, hphi)) = inner_jeffreys_term.as_ref() {
                            lhs_true += hphi;
                        }
                        // Single metric-whitened eigendecomposition drives BOTH the
                        // seed step and every trust-region re-solve this cycle
                        // (gam#979). The prior code ran a SECOND O(p³)
                        // eigendecomposition of the raw Hessian here purely to form
                        // the seed step — doubling the dominant per-cycle cost on the
                        // ~5 s/cycle ill-conditioned survival marginal-slope inner.
                        // The exact trust-region multiplier λ (chosen so ‖δ‖_D = r)
                        // subsumes the old self-vanishing Levenberg-μ seed: `decompose`
                        // whitens by the trust metric so the penalty (λ~e²⁴) and the
                        // likelihood scales are throttled uniformly — the scale
                        // invariance the multiplicative μ approximated. `lhs_true`
                        // already carries the penalty and the Firth/Jeffreys curvature
                        // H_Φ and `spectral_rhs` the augmented stationarity RHS, so the
                        // subproblem model matches the predicted-reduction model and the
                        // accept/reject gain ratio exactly.
                        let spectrum = whitened_spectrum::WhitenedHessianSpectrum::decompose(
                            &lhs_true,
                            &spectral_rhs,
                            &joint_trust_metric_diag,
                            KKT_REFUSAL_RANK_TOL,
                        )?;
                        // Seed = the unconstrained (Moore–Penrose, range-restricted)
                        // exact step, so cycle 0 can take the full Newton step on a
                        // well-conditioned model (the cycle-0 radius bump below relies
                        // on this); the trust loop re-solves at finite radius for every
                        // subsequent attempt. An indefinite model reflects negative
                        // curvature to |λ|, exactly as the prior spectral solve did.
                        let spectral_step = spectrum.trust_region_step(f64::INFINITY);
                        spectral_nullity_for_step = spectral_step.nullity;
                        if spectral_step.reflected_negative_modes > 0 {
                            log::info!(
                                "[PIRLS/joint-Newton] cycle {cycle:>3} | indefinite inner \
                                 Hessian: reflected {}/{} negative-curvature modes to |λ| \
                                 (λ_min={:.3e}); proceeding with modified-Newton descent step \
                                 under trust-region globalization",
                                spectral_step.reflected_negative_modes,
                                total_p,
                                spectral_step.most_negative_eigenvalue,
                            );
                        }
                        if spectral_step.nullity > 0 {
                            log::debug!(
                                "[PIRLS/joint-Newton] spectral reduced solve: nullity@{:.0e}={}/{} \
                             |P0 rhs|∞={:.3e} |P+ rhs|∞={:.3e} λ_min+={:.3e} λ_max={:.3e}",
                                spectral_step.rank_tol,
                                spectral_step.nullity,
                                total_p,
                                spectral_step.null_rhs_inf,
                                spectral_step.range_rhs_inf,
                                spectral_step.lambda_min_positive,
                                spectral_step.lambda_max_abs,
                            );
                        }
                        delta = Some(spectral_step.delta);
                        // The same factorization powers every trust-radius re-solve
                        // in the loop below (gam#979) — no second eigendecomposition.
                        joint_spectrum = Some(spectrum);
                    }

                    let Some(delta) = delta else {
                        break; // Fall back to blockwise
                    };
                    if !delta.iter().all(|v| v.is_finite()) {
                        break; // Fall back to blockwise
                    }
                    (beta_joint.clone() + &delta, None, spectral_nullity_for_step)
                };
            // Hessian-source build (and any QP solve immediately above) are
            // done by the time we reach `delta`. Capture the wall-clock
            // before the line-search phase so the end-of-cycle summary can
            // attribute time correctly between the Hessian/QP and the
            // backtracking step search.
            let hessian_and_qp_elapsed = hessian_started.elapsed();
            let line_search_started = std::time::Instant::now();
            log::info!(
                "[joint-newton-tr] phase=line_search cycle={} r={:.3e} hessian_qp_elapsed={:.3}s",
                cycle,
                joint_trust_radius,
                hessian_and_qp_elapsed.as_secs_f64(),
            );
            let delta = &candidate_beta - &beta_joint;

            // Trust-region globalization for the joint Newton proposal.  The
            // previous implementation used up to eight backtracking likelihood
            // evaluations (each can build the exact joint workspace at large
            // scale).  Here the step is truncated before evaluation and the
            // single trial objective is accepted only when the actual decrease
            // is positive relative to the local quadratic model.
            let step_inf = delta.iter().copied().map(f64::abs).fold(0.0_f64, f64::max);

            let old_beta: Vec<Array1<f64>> = states.iter().map(|s| s.beta.clone()).collect();
            // Firth value Φ at the OLD (start-of-cycle) β, folded under the SAME
            // skippable gate the trial uses below — so `actual_reduction =
            // old_objective − trialobjective` compares two points on one objective
            // `−ℓ + ½βᵀSβ − Φ` (gam#826/#872). `lastobjective` is the pure
            // quadratic-penalized objective; subtract the gated old-β Φ here.
            let old_phi = if !jeffreys_skippable_this_cycle {
                joint_jeffreys_subspace
                    .as_ref()
                    .map(|z_joint| {
                        custom_family_joint_jeffreys_value(family, &states, specs, &ranges, z_joint)
                    })
                    .unwrap_or(0.0)
            } else {
                0.0
            };
            let old_objective = lastobjective - old_phi;
            // Row measure observed by the objective at β. `lastobjective` was
            // set on the previous cycle (or at function entry) under `options`;
            // see top-of-cycle capture for rationale.
            let tr_row_measure_old_objective =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            let mut accepted = false;
            let mut accepted_joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>> =
                None;
            let mut line_search_attempts = 0usize;

            // Pure Newton must take a full step on the first cycle of an
            // exact quadratic problem (i.e. converge in one cycle when the
            // model is exact). The trust-region globalization above must not
            // truncate the very first proposal merely because the hard-coded
            // initial radius (1.0) is smaller than the natural Newton-step
            // 2-norm. Bumping the radius up to the post-barrier Newton-step
            // norm on cycle 0 preserves quadratic convergence on
            // well-conditioned problems while leaving the standard adaptive
            // shrink/expand for subsequent cycles. Family feasibility
            // constraints and the adaptive trust radius remain the safeguards
            // against runaway proposals.
            if cycle == 0 && joint_step_spectral_nullity == 0 {
                let initial_block_norms = joint_trust_region_block_metric_norms(
                    &delta,
                    &ranges,
                    &joint_trust_metric_diag,
                );
                for (radius, norm) in joint_block_trust_radii.iter_mut().zip(initial_block_norms) {
                    if norm.is_finite() && norm > *radius {
                        *radius = norm;
                    }
                }
                joint_trust_radius = joint_block_trust_radii
                    .iter()
                    .copied()
                    .fold(0.0_f64, f64::max);
                if !joint_trust_radius.is_finite() || joint_trust_radius <= 0.0 {
                    joint_trust_radius = 1.0;
                }
            }

            let penalty_beta = apply_joint_block_penalty(
                &ranges,
                &s_lambdas,
                &beta_joint,
                joint_mode_diagonal_ridge,
                joint_bundle,
            );
            // Stationarity RHS for the trust-region quadratic model. When the
            // Jeffreys/Firth term is armed the inner objective is `−ℓ+½βᵀSβ−Φ`, so
            // the model RHS is `∇L − Sβ + ∇Φ` — the SAME augmented RHS the Newton
            // step solves and the H_Φ-augmented `hpen_delta` below pairs with. Using
            // the bare `∇L − Sβ` here desyncs `predicted_reduction` from the
            // augmented step + the Φ-augmented `actual_reduction`, which is what
            // froze the coupled K-block line search (gam#729/#715). No-op when the
            // term is condition-gated/unavailable (∇Φ=0).
            let mut rhs = &grad_joint - &penalty_beta;
            if let Some((grad_phi, _hphi)) = head_jeffreys_term.as_ref()
                && grad_phi.len() == rhs.len()
            {
                rhs += grad_phi;
            }
            let beta_inf = states
                .iter()
                .flat_map(|s| s.beta.iter().copied())
                .map(f64::abs)
                .fold(0.0_f64, f64::max);
            let step_tol = inner_tol * (1.0 + beta_inf);
            let objective_tol = inner_tol * (1.0 + old_objective.abs());
            // Scale the KKT residual tolerance against the natural magnitude
            // of ‖Sβ − ∇L‖∞ (i.e. max(‖∇L‖∞, ‖Sβ‖∞)), not the objective. The
            // gradient and Sβ scale independently of the likelihood — at
            // large scale with |β|∞ ~ 10²–10³ and non-trivial smoothing,
            // ‖Sβ‖∞ can sit orders of magnitude above |obj| and FP noise
            // alone keeps the residual above any obj-scaled tol, so KKT is
            // never certified even when the iterate is the true optimum.
            let grad_inf = grad_joint
                .iter()
                .map(|x: &f64| x.abs())
                .fold(0.0_f64, f64::max);
            let penalty_inf = penalty_beta
                .iter()
                .map(|x: &f64| x.abs())
                .fold(0.0_f64, f64::max);
            let residual_tol = inner_tol * (1.0 + grad_inf.max(penalty_inf));
            last_residual_tol = residual_tol;
            let current_stationarity_residual = current_kkt_norm;
            // KKT certificate: ‖∇L − Sβ‖_∞ ≤ residual_tol together with
            // ‖δ‖_∞ ≤ step_tol is sufficient first-order optimality of the
            // penalized objective; no descent direction exists from the
            // current point. Conditioning that exit on additional evidence
            // of objective progress in the previous cycle would refuse to
            // recognize convergence at a starting point that already sits
            // at the optimum (e.g. balanced data with an intercept-only
            // fit, where ∇ℓ vanishes by symmetry from cycle 0 and the
            // Newton step is identically zero so the trust-region search
            // can never produce a strictly negative actual reduction).
            if current_stationarity_residual <= residual_tol && step_inf <= step_tol {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | pre-line-search converged: proposal_inf={:.3e} (tol={:.3e}) | residual={:.3e} (tol={:.3e})",
                    cycle,
                    step_inf,
                    step_tol,
                    current_stationarity_residual,
                    residual_tol,
                );
                cached_joint_workspace = hessian_workspace_for_cycle;
                cycles_done = cycle;
                converged = true;
                break;
            }

            // Trust-region retries preserve the objective-decrease guarantee
            // when the initial radius is too optimistic. If the Newton proposal
            // is not a descent direction for the penalized quadratic model,
            // switch once to a diagonally preconditioned gradient step and keep
            // the same exact full-objective accept/reject test.
            const JOINT_TRUST_MAX_ATTEMPTS: usize = 24;
            let mut search_delta = delta.clone();
            let search_joint_active_set: Option<Vec<usize>> = joint_active_set.clone();
            let mut tried_preconditioned_descent = false;
            // Dogleg Cauchy leg (gam#826/#808). Compute the unconstrained Cauchy
            // point of the penalized (Firth-augmented) quadratic model ONCE per
            // cycle: the M-metric steepest-descent direction `p_sd = M⁻¹·rhs`
            // and its curvature `p_sd·H·p_sd` (a coupled Hessian-vector product,
            // so it must be hoisted out of the radius-shrink loop). When the
            // Newton step exceeds a block's trust radius the dogleg blends
            // toward this Cauchy leg, guaranteeing at least the Cauchy decrease
            // even when the spectral Newton step is numerically frozen at the
            // oversmoothed seed (the high-curvature log_sigma block's Newton
            // component is `O(g/λ) ≈ 5e-21`). `joint_active_set` is `None` on the
            // unconstrained joint Newton path; the constrained-QP path keeps its
            // own globalization, so the dogleg is only built (and used) when no
            // active set is in force.
            let dogleg_cauchy: Option<Array1<f64>> = if search_joint_active_set.is_none() {
                let mut p_sd = Array1::<f64>::zeros(total_p);
                for (i, (r, w)) in rhs.iter().zip(joint_trust_metric_diag.iter()).enumerate() {
                    p_sd[i] = r / positive_joint_diagonal_entry(*w);
                }
                let mut h_psd = Array1::<f64>::zeros(total_p);
                let mut cauchy_penalty_scratch = Array1::<f64>::zeros(total_p);
                match apply_joint_penalized_hessian_into_with_workspace(
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    joint_mode_diagonal_ridge,
                    &p_sd,
                    &mut h_psd,
                    &mut cauchy_penalty_scratch,
                    joint_bundle,
                ) {
                    Ok(()) => {
                        if let Some((_grad_phi, hphi)) = head_jeffreys_term.as_ref() {
                            h_psd += &hphi.dot(&p_sd);
                        }
                        let cauchy = joint_cauchy_step(&rhs, &p_sd, &h_psd);
                        if cauchy.iter().all(|v| v.is_finite()) {
                            Some(cauchy)
                        } else {
                            None
                        }
                    }
                    Err(_) => None,
                }
            } else {
                None
            };
            let mut model_rejects = 0usize;
            let mut likelihood_rejects = 0usize;
            let mut objective_rejects = 0usize;
            let mut first_likelihood_reject: Option<String> = None;
            // Coalesce consecutive trust-region attempts whose accept/reject
            // outcome and numeric signature round to the same values, so a long
            // run of identical retries collapses into a single "attempts a..b
            // (×N)" line at flush time instead of spamming one line per try.
            let mut tr_log_sig: Option<String> = None;
            let mut tr_log_first: usize = 0;
            let mut tr_log_last: usize = 0;
            // Hoist the two full-size scratch buffers used in the predicted-
            // reduction computation outside the trust-region attempt loop.
            // The loop runs up to JOINT_TRUST_MAX_ATTEMPTS times per outer
            // Newton step, so allocating these per-attempt would add O(total_p)
            // heap traffic on every radius shrink/expand iteration.
            let mut hpen_delta = Array1::<f64>::zeros(total_p);
            let mut tr_penalty_scratch = Array1::<f64>::zeros(total_p);
            for trust_attempt in 0..JOINT_TRUST_MAX_ATTEMPTS {
                line_search_attempts = trust_attempt + 1;
                accepted_joint_workspace = None;
                // Dogleg globalization (gam#826/#808): when the unconstrained
                // Newton path is in force and a finite Cauchy leg was built,
                // construct the dogleg blend of the Cauchy and Newton points at
                // the current per-block radii. Otherwise (constrained-QP path,
                // or after the preconditioned-descent fallback replaced
                // `search_delta`) fall back to box-truncating the search step.
                // Both are bypassed when `joint_spectrum` is available: the
                // first branch below then takes the trust-region step directly.
                let mut trial_delta;
                let mut block_step_norms = if let Some(spectrum) = joint_spectrum.as_ref() {
                    // Exact Moré–Sorensen trust-region step at the current radius
                    // (gam#979). The step already lies in the `D`-metric ball, so
                    // no dogleg blend or box-truncation is applied: on a shrink the
                    // direction is RE-SOLVED (bending toward the gradient), the
                    // property the dogleg/truncation lacked. Re-solving reuses the
                    // cached factorization at O(p) cost.
                    trial_delta = spectrum.trust_region_step(joint_trust_radius).delta;
                    joint_trust_region_block_metric_norms(
                        &trial_delta,
                        &ranges,
                        &joint_trust_metric_diag,
                    )
                } else if let Some(cauchy) = dogleg_cauchy.as_ref()
                    && !tried_preconditioned_descent
                {
                    trial_delta = Array1::<f64>::zeros(total_p);
                    joint_dogleg_step_to_block_metric_radii(
                        &search_delta,
                        cauchy,
                        &ranges,
                        &joint_trust_metric_diag,
                        &joint_block_trust_radii,
                        &mut trial_delta,
                    )
                } else {
                    trial_delta = search_delta.clone();
                    truncate_joint_step_to_block_metric_radii(
                        &mut trial_delta,
                        &ranges,
                        &joint_trust_metric_diag,
                        &joint_block_trust_radii,
                    )
                };
                if apply_joint_feasibility_limit(family, &states, &ranges, &mut trial_delta)
                    .is_err()
                {
                    joint_trust_radius = shrink_active_joint_block_trust_radii(
                        &mut joint_block_trust_radii,
                        &block_step_norms,
                        0.25,
                    );
                    continue;
                }
                block_step_norms = joint_trust_region_block_metric_norms(
                    &trial_delta,
                    &ranges,
                    &joint_trust_metric_diag,
                );
                let step_norm = block_step_norms.iter().copied().fold(0.0_f64, f64::max);
                let trial_step_inf = trial_delta
                    .iter()
                    .copied()
                    .map(f64::abs)
                    .fold(0.0_f64, f64::max);
                let step_hit_trust_boundary = block_step_norms
                    .iter()
                    .zip(&joint_block_trust_radii)
                    .any(|(step_norm, radius)| {
                        joint_block_step_hit_trust_boundary(*step_norm, *radius)
                    });
                // Predicted reduction must use the TRUE penalized Hessian
                // (the one that appears in `f(β) = -ℓ + ½βᵀSβ + ½·joint_mode_diagonal_ridge·‖β‖²`),
                // NOT the SPD-stabilized version. The stabilizing shift
                // in `joint_solver_diagonal_ridge` is purely a solver-side
                // tool to make the Newton system invertible when H_NLL
                // has negative eigenvalues; it is not part of the true
                // objective the trial-likelihood evaluator computes.
                //
                // If we use `joint_solver_diagonal_ridge` here, then for
                // any Newton step lying in null(H_true) (e.g. the
                // marginal-block cancellation direction in the saturated
                // probit regime — see
                // `marginal_block_hessian_cancels_in_saturated_regime`),
                // predicted = ½·rhs·δ while actual = rhs·δ, giving ρ = 2
                // exactly. The trust-region loop then accepts the step
                // (ρ > 0.75 expands the radius), and the same regime
                // repeats every cycle — exactly the large-scale-saturated
                // failure trace. Pinned by
                // `ridge_stabilization_gap_produces_exact_rho_two_in_null_direction`.
                //
                // `hpen_delta` and `tr_penalty_scratch` are hoisted outside
                // this loop; the workspace variant reuses them without
                // allocating per attempt.
                hpen_delta.fill(0.0);
                if apply_joint_penalized_hessian_into_with_workspace(
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    joint_mode_diagonal_ridge,
                    &trial_delta,
                    &mut hpen_delta,
                    &mut tr_penalty_scratch,
                    joint_bundle,
                )
                .is_err()
                {
                    break;
                }
                // JEFFREYS/FIRTH CURVATURE IN THE TRUST-REGION MODEL (gam#729/#715).
                // When the Jeffreys term is armed, the inner objective the merit
                // (`trialobjective = −ℓ + ½βᵀSβ − Φ`) measures and the Newton step
                // (`(H+Sλ+H_Φ)δ = ∇L−Sβ+∇Φ`) target both include the Firth term, so
                // the trust-region quadratic model's curvature MUST include `H_Φδ`
                // too. Omitting it (bare `(H+Sλ)δ`) makes `predicted_reduction`
                // inconsistent with the H_Φ-augmented `rhs` and the Φ-augmented
                // `actual_reduction`: for a coupled K-block family near the Firth
                // optimum (residual floored at ‖∇Φ‖) the resulting trust_ratio is
                // wrong, the line search rejects the genuine descent step (accepts
                // ~0), and β freezes with the residual stalled at a constant ≫ tol
                // — the unbounded-cycle non-convergence the inner solve exhibits on
                // the Dirichlet/multinomial fits. Adding `H_Φδ` makes the model
                // curvature match the augmented system the step solves and the
                // merit the accept test uses, so the step is accepted and the
                // residual descends. No-op when the term is condition-gated (∇Φ=0,
                // H_Φ=0) or unavailable.
                if let Some((_grad_phi, hphi)) = head_jeffreys_term.as_ref() {
                    let hphi_delta = hphi.dot(&trial_delta);
                    hpen_delta += &hphi_delta;
                }
                let predicted_reduction =
                    joint_quadratic_predicted_reduction(&rhs, &hpen_delta, &trial_delta);
                let linearized_next_kkt_inf = hpen_delta
                    .iter()
                    .zip(rhs.iter())
                    .map(|(hpen, rhs)| (hpen - rhs).abs())
                    .fold(0.0_f64, f64::max);
                // Reject only non-descent directions on the quadratic model.
                // A small-but-positive predicted reduction is what Newton
                // *should* produce near the optimum of a large-magnitude
                // objective: ½δᵀHδ scales with curvature×step², so it can be
                // far below the (relative) objective_tol = inner_tol·(1+|obj|)
                // while still being a correct Newton step. Trust-region ρ
                // shrink/expand handles small-but-valid Newton steps; the
                // preconditioned branch below is only for model-invalid
                // directions, and preserves linear constraints when present.
                //
                // NEAR-FLOOR CARVE-OUT (gam#787 binary matern centers=12). When
                // the Newton proposal is already at the step-tolerance floor —
                // `step_inf ≤ 4·step_tol`, the same round-off band the cert path
                // uses — the iterate is doing KKT polishing on a flat objective,
                // not global descent: there `predicted_reduction = rhs·δ − ½δᵀHδ`
                // is two near-equal O(step²) quantities and its SIGN is round-off
                // noise (a true Newton step gives +½δᵀHδ but the damped/range-
                // restricted spectral solve leaves rhs·δ a hair below ½δᵀHδ). The
                // `predicted_reduction ≤ 0` branch then mistook this for a model-
                // invalid direction and substituted `joint_preconditioned_descent_delta`,
                // a step sized for OBJECTIVE descent (diagonal-preconditioned
                // gradient, O(900×) larger than the polishing proposal). That step
                // bought a round-off-level objective gain but catapulted the KKT
                // residual off a near-converged iterate (‖∇L−Sβ‖ 1.7e-4 → 4.7e-1),
                // which then never recovered — every later cycle re-triggered the
                // same substitution (proposal stays pred≤0), pinning the residual
                // far above tol until the cycle budget exhausted → seed rejected →
                // hard raise. At the step floor we instead take the tiny proposal
                // as-is and let the trust-region noise-floor guard accept it at
                // ρ=1 (it neither helps nor hurts the objective beyond round-off),
                // so the inner keeps polishing the KKT residual to tol.
                let proposal_at_step_floor = joint_proposal_at_step_floor(step_inf, step_tol);
                if (!predicted_reduction.is_finite() || predicted_reduction <= 0.0)
                    && !proposal_at_step_floor
                {
                    model_rejects += 1;
                    if !tried_preconditioned_descent {
                        match joint_preconditioned_descent_delta(
                            &joint_hessian_source,
                            &ranges,
                            &s_lambdas,
                            joint_solver_diagonal_ridge,
                            &rhs,
                            joint_bundle,
                        ) {
                            Ok(descent_delta) => {
                                search_delta = descent_delta;
                            }
                            Err(_) => {
                                joint_trust_radius = shrink_active_joint_block_trust_radii(
                                    &mut joint_block_trust_radii,
                                    &block_step_norms,
                                    0.25,
                                );
                            }
                        }
                        tried_preconditioned_descent = true;
                    } else {
                        joint_trust_radius = shrink_active_joint_block_trust_radii(
                            &mut joint_block_trust_radii,
                            &block_step_norms,
                            0.25,
                        );
                    }
                    continue;
                }

                for b in 0..specs.len() {
                    let (start, end) = ranges[b];
                    let mut trial_beta = old_beta[b].clone();
                    trial_beta += &trial_delta.slice(ndarray::s![start..end]);
                    let projected =
                        family.post_update_block_beta(&states, b, &specs[b], trial_beta.clone())?;
                    reject_constrained_post_update_repair(
                        b,
                        &specs[b],
                        &trial_beta,
                        &projected,
                        block_constraints[b].as_ref(),
                    )?;
                    states[b].beta.assign(&projected);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                let mut trial_penalty = total_quadratic_penalty(
                    &states,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    joint_bundle,
                    Some(specs),
                );
                // Jeffreys objective contribution at the trial point keeps the
                // accept/reject objective consistent with the Jeffreys-modified
                // Newton step. `states` already holds the trial coefficients
                // (assigned + eta-refreshed above). No-op when the Jeffreys term
                // is unavailable or condition-gated to zero. When the cheap pre-
                // check certified this cycle well-conditioned, the step used H_Φ=0
                // / ∇Φ=0, so the consistent accept/reject objective also uses Φ=0:
                // skipping here keeps value and step on the SAME objective (the
                // value/step consistency the term exists to enforce) and avoids the
                // dense H/eigh at the trial point. The 8× conditioning margin makes
                // a single damped Newton step incapable of crossing the gate.
                // SUBTRACT Φ: the inner NLL objective is `−ℓ + ½βᵀSβ − Φ` (Firth
                // adds ½log|I| to the log-likelihood). Must match the cycle-0
                // baseline, the Newton step, and the KKT residual — INCLUDING the
                // `jeffreys_skippable_this_cycle` gate, so that on a well-conditioned
                // cycle the trial, the step (H_Φ=0/∇Φ=0), and the residual all sit
                // on the SAME Φ=0 objective (gam#729/#715 sign fix; the baseline and
                // post-accept folds carry the matching skippable gate).
                if !jeffreys_skippable_this_cycle
                    && let Some(z_joint) = joint_jeffreys_subspace.as_ref()
                {
                    trial_penalty -= custom_family_joint_jeffreys_value(
                        family, &states, specs, &ranges, z_joint,
                    );
                }
                // Cheap-LL line-search path: rejected backtracking attempts
                // discard the exact-Newton workspace they build, so we evaluate
                // just the scalar full-data log-likelihood for the accept/reject
                // decision and only build the full state once the step is
                // accepted (via the gradient reload below).
                //
                // EARLY-EXIT THRESHOLD MUST BOUND THE NLL, NOT THE FULL OBJECTIVE
                // (was a stall — gam#787/#785, duchon centers≥20). The family's
                // `bernoulli_margslope_line_search_ll_with_early_exit` short-
                // circuits the row sweep when the accumulated `-Σ wᵢ log CDF` (the
                // NLL ALONE — no penalty, no Jeffreys Φ) exceeds the threshold; its
                // monotone-lower-bound proof is valid only for the NLL term. But the
                // accept test is on the FULL augmented objective
                // `F = -ℓ + ½βᵀSβ − Φ_trial`, accepted iff `F ≤ old_objective + slack`,
                // i.e. iff `-ℓ_trial ≤ old_objective + slack − penalty_trial`. Passing
                // the full `old_objective` as the NLL threshold therefore over-rejects
                // by exactly `penalty_trial`: where the trial penalty is NEGATIVE
                // (the Jeffreys term subtracts Φ, and `½βᵀSβ` can be net-negative
                // under the reparam) the NLL threshold sits BELOW the true accept
                // bound, so the early exit kills net-descent steps the trust region
                // would accept — every backtracking attempt false-rejects, the radius
                // collapses, and the inner exits non-converged at cycle ~2 (seed
                // rejected pre-solver → hard raise, β pinned). Subtract the trial
                // penalty so the threshold is the NLL the trial must beat.
                let line_search_options =
                    coefficient_line_search_options(options, old_objective + 1e-10 - trial_penalty);
                let trial_ll =
                    match joint_line_search_log_likelihood(family, &line_search_options, &states) {
                        Ok((value, workspace)) => {
                            accepted_joint_workspace = workspace;
                            value
                        }
                        Err(e) => {
                            likelihood_rejects += 1;
                            if first_likelihood_reject.is_none() {
                                first_likelihood_reject = Some(e);
                            }
                            for (b, old) in old_beta.iter().enumerate() {
                                states[b].beta.assign(old);
                            }
                            refresh_all_block_etas(family, specs, &mut states)?;
                            joint_trust_radius = shrink_active_joint_block_trust_radii(
                                &mut joint_block_trust_radii,
                                &block_step_norms,
                                0.25,
                            );
                            continue;
                        }
                    };
                let trialobjective = -trial_ll + trial_penalty;
                // Row measure observed by the trial objective at β + δ. The
                // line-search helper above runs under `coefficient_line_search_options`,
                // which now preserves `outer_score_subsample` and disables
                // any further auto-install; if either contract is broken the
                // id will diverge from `tr_row_measure_top` and we Err below.
                let tr_row_measure_trial =
                    crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
                // Hard invariant: the trust-region ratio numerator (objective
                // at β minus trial at β+δ) and denominator (rhs·δ − ½δᵀH δ)
                // MUST share a row measure with the Hessian/gradient build.
                // Bubble out via `Err` rather than panic; this function
                // already returns `Result<_, String>`.
                let top_id = tr_row_measure_top.id;
                if tr_row_measure_hessian.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         Hessian id 0x{:016x} differs from top-of-cycle id 0x{:016x} \
                         (cycle {}); the joint Hessian was built against a different \
                         row mask than the trust-region globalization captured at the \
                         top of the cycle. ρ would compare ½δᵀHδ on one measure to \
                         F(β)−F(β+δ) on another.",
                        tr_row_measure_hessian.id, top_id, cycle
                    ));
                }
                if tr_row_measure_gradient.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         gradient id 0x{:016x} differs from top-of-cycle id 0x{:016x} \
                         (cycle {}); `cached_joint_gradient` was loaded against a \
                         different row mask than the trust-region globalization \
                         captured at the top of the cycle. rhs·δ in the predicted \
                         reduction would not match the rest of the ρ inputs.",
                        tr_row_measure_gradient.id, top_id, cycle
                    ));
                }
                if tr_row_measure_old_objective.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         objective-at-β id 0x{:016x} differs from top-of-cycle id \
                         0x{:016x} (cycle {}); `lastobjective` was computed against \
                         a different row mask than the trust-region globalization \
                         captured at the top of the cycle.",
                        tr_row_measure_old_objective.id, top_id, cycle
                    ));
                }
                if tr_row_measure_trial.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         trial-objective id 0x{:016x} differs from top-of-cycle id \
                         0x{:016x} (cycle {}, attempt {}); the line-search trial \
                         likelihood evaluated against a different row mask than the \
                         Hessian/gradient/old-objective build. Cf. \
                         `coefficient_line_search_options` and \
                         `install_auto_outer_subsample_options`.",
                        tr_row_measure_trial.id, top_id, cycle, trust_attempt
                    ));
                }
                let actual_reduction = old_objective - trialobjective;
                let trust_update = update_joint_trust_region_radius(
                    joint_trust_radius,
                    step_norm,
                    actual_reduction,
                    predicted_reduction,
                    old_objective,
                );
                let old_radius = joint_trust_radius;
                // Classify the outcome of this attempt so the diagnostic line
                // says *why* the step was taken or rejected rather than just
                // dumping numbers. The four phases partition the post-log
                // branches below; computing them up front lets the log line
                // and the dispatch agree.
                let floor_reached = trust_update.accepted
                    && current_stationarity_residual <= residual_tol
                    && joint_objective_floor_reached(
                        old_objective,
                        trialobjective,
                        actual_reduction,
                        predicted_reduction,
                        objective_tol,
                    );
                let roundoff_slack = joint_objective_roundoff_slack(old_objective, trialobjective);
                let secondary_ok = !floor_reached
                    && trialobjective.is_finite()
                    && trust_update.accepted
                    && trialobjective <= old_objective + roundoff_slack;
                let phase: &'static str = if floor_reached {
                    "converged"
                } else if secondary_ok {
                    "accepted"
                } else if trust_update.accepted {
                    "stall"
                } else {
                    "reject"
                };
                if floor_reached || secondary_ok {
                    for (block_radius, block_step_norm) in joint_block_trust_radii
                        .iter_mut()
                        .zip(block_step_norms.iter())
                    {
                        let block_update = update_joint_trust_region_radius(
                            *block_radius,
                            *block_step_norm,
                            actual_reduction,
                            predicted_reduction,
                            old_objective,
                        );
                        if block_update.radius >= *block_radius
                            || joint_block_step_hit_trust_boundary(*block_step_norm, *block_radius)
                        {
                            *block_radius = block_update.radius;
                        }
                    }
                    joint_trust_radius = joint_block_trust_radii
                        .iter()
                        .copied()
                        .fold(0.0_f64, f64::max);
                } else {
                    joint_trust_radius = shrink_active_joint_block_trust_radii(
                        &mut joint_block_trust_radii,
                        &block_step_norms,
                        0.25,
                    );
                }
                let radius_held =
                    (joint_trust_radius - old_radius).abs() <= 1e-12 * old_radius.abs().max(1.0);
                let joint_math = JointNewtonMathDiagnostic {
                    old_kkt_inf: current_kkt_norm,
                    linearized_next_kkt_inf,
                    predicted_reduction,
                    actual_reduction,
                    trust_ratio: trust_update.rho,
                    step_inf: trial_step_inf,
                    proposal_inf: step_inf,
                };
                let radius_field = if radius_held {
                    format!("r={:.3e} (held)", old_radius)
                } else {
                    format!("r={:.3e}->{:.3e}", old_radius, joint_trust_radius)
                };
                // Surface the TR-policy decision so future failures
                // distinguish "TR is throttling Newton" from "TR is not
                // the bottleneck — Newton itself finds short steps".
                // For the large-scale linear-convergence pattern the policy
                // is consistently `hold_inside` (ρ≈1, |δ| ≪ radius),
                // which proves the TR is not what is keeping the step
                // small — that came up before via "(held)" alone but
                // the explicit decision label makes the inference
                // immediate instead of requiring step/radius arithmetic
                // in the reader's head.
                let tr_attempt_sig = format!(
                    "{:<9}  ρ={:+.3e}  Δobj={:+.3e}  pred={:+.3e}  {}  decision={:<22}  |δ|={:.3e}  |δ|∞={:.3e}  |prop|∞={:.3e}",
                    phase,
                    trust_update.rho,
                    actual_reduction,
                    predicted_reduction,
                    radius_field,
                    trust_update.decision.label(),
                    step_norm,
                    trial_step_inf,
                    step_inf,
                );
                match tr_log_sig.as_deref() {
                    Some(prev) if prev == tr_attempt_sig.as_str() => {
                        tr_log_last = line_search_attempts;
                    }
                    Some(prev) => {
                        if tr_log_first == tr_log_last {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                                cycle,
                                tr_log_first,
                                prev,
                            );
                        } else {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                                cycle,
                                tr_log_first,
                                tr_log_last,
                                tr_log_last - tr_log_first + 1,
                                prev,
                            );
                        }
                        tr_log_sig = Some(tr_attempt_sig);
                        tr_log_first = line_search_attempts;
                        tr_log_last = line_search_attempts;
                    }
                    None => {
                        tr_log_sig = Some(tr_attempt_sig);
                        tr_log_first = line_search_attempts;
                        tr_log_last = line_search_attempts;
                    }
                }
                if floor_reached {
                    if let Some(sig) = tr_log_sig.take() {
                        if tr_log_first == tr_log_last {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                                cycle,
                                tr_log_first,
                                sig,
                            );
                        } else {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                                cycle,
                                tr_log_first,
                                tr_log_last,
                                tr_log_last - tr_log_first + 1,
                                sig,
                            );
                        }
                    }
                    for (b, old) in old_beta.iter().enumerate() {
                        states[b].beta.assign(old);
                    }
                    refresh_all_block_etas(family, specs, &mut states)?;
                    last_joint_math = Some(joint_math);
                    accepted = true;
                    converged = true;
                    break;
                }
                if secondary_ok {
                    if let Some(sig) = tr_log_sig.take() {
                        if tr_log_first == tr_log_last {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                                cycle,
                                tr_log_first,
                                sig,
                            );
                        } else {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                                cycle,
                                tr_log_first,
                                tr_log_last,
                                tr_log_last - tr_log_first + 1,
                                sig,
                            );
                        }
                    }
                    current_penalty = trial_penalty;
                    if let Some(joint_active_set) = search_joint_active_set.as_ref() {
                        cached_active_sets =
                            scatter_joint_active_set(joint_active_set, &block_constraints);
                    }
                    last_joint_math = Some(joint_math);
                    last_accepted_hit_joint_trust_boundary = step_hit_trust_boundary;
                    accepted = true;
                    break;
                }
                for (b, old) in old_beta.iter().enumerate() {
                    states[b].beta.assign(old);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                objective_rejects += 1;
            }
            if let Some(sig) = tr_log_sig.take() {
                if tr_log_first == tr_log_last {
                    log::info!(
                        "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                        cycle,
                        tr_log_first,
                        sig,
                    );
                } else {
                    log::info!(
                        "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                        cycle,
                        tr_log_first,
                        tr_log_last,
                        tr_log_last - tr_log_first + 1,
                        sig,
                    );
                }
            }
            let line_search_elapsed = line_search_started.elapsed();
            if accepted && converged {
                log::info!(
                    "[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=true hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} reject_model={} reject_likelihood={} reject_objective={} first_likelihood_reject={} grad_reload=0.000s total={:.3}s",
                    cycle,
                    hessian_and_qp_elapsed.as_secs_f64(),
                    line_search_elapsed.as_secs_f64(),
                    line_search_attempts,
                    model_rejects,
                    likelihood_rejects,
                    objective_rejects,
                    first_likelihood_reject.as_deref().unwrap_or("none"),
                    cycle_started.elapsed().as_secs_f64(),
                );
                cached_joint_workspace = hessian_workspace_for_cycle;
                cycles_done = cycle + 1;
                break;
            }
            if !accepted {
                // Retry the joint Newton loop from the same state after a
                // failed trust-region search. Falling through into blockwise
                // would switch a coupled exact-Hessian problem onto a
                // principal-block surrogate, which is the ridge-drift failure
                // mode this path is meant to avoid. The trust-region radius
                // already collapsed via the attempt loop's shrink rules, so
                // the next cycle's Newton proposal will be evaluated under
                // a tighter L2 bound without any parallel adaptation here.
                log::info!(
                    "[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=false hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} reject_model={} reject_likelihood={} reject_objective={} first_likelihood_reject={} grad_reload=0.000s total={:.3}s",
                    cycle,
                    hessian_and_qp_elapsed.as_secs_f64(),
                    line_search_elapsed.as_secs_f64(),
                    line_search_attempts,
                    model_rejects,
                    likelihood_rejects,
                    objective_rejects,
                    first_likelihood_reject.as_deref().unwrap_or("none"),
                    cycle_started.elapsed().as_secs_f64(),
                );
                // Restore original betas
                for (b, old) in old_beta.iter().enumerate() {
                    states[b].beta.assign(old);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                // If the previous cycle's bookkeeping certified KKT
                // stationarity (residual ≤ tol and objective change ≤
                // tol), the line-search failure here is round-off on a
                // rank-deficient null mode rather than non-convergence:
                // the proposed `H⁻¹ g` step stays O(1) along the null
                // direction at the optimum, every trial moves β along
                // it without changing the objective, and round-off
                // flips the sign of `actual − predicted` so the
                // sufficient-decrease check rejects every trial. The
                // iterate ALREADY satisfies the first-order optimality
                // conditions; we accept that as convergence rather
                // than fail the outer "inner solve did not converge"
                // panic on a fully resolved fit.
                if last_cycle_residual_below_tol && last_cycle_obj_change_below_tol {
                    converged = true;
                    break;
                }
                // Fully-rejected stall guard. See the constant declaration
                // at the top of this function for the full rationale. The
                // condition is: every trust attempt this cycle failed the
                // *actual-objective* line search (model_rejects ==
                // likelihood_rejects == 0, objective_rejects ==
                // JOINT_TRUST_MAX_ATTEMPTS) AND the joint trust radius did
                // not shrink relative to the previous fully-rejected cycle.
                // Both together prove the next cycle's Newton system,
                // trust radius, and trust-region search are bytewise
                // identical to this cycle's — there is no descent direction
                // the local quadratic model can reconcile at this β. After
                // FULLY_REJECTED_STALL_MAX_CYCLES such cycles, exit
                // non-converged so the outer optimizer rejects this ρ.
                let all_attempts_objective_rejected = objective_rejects == JOINT_TRUST_MAX_ATTEMPTS
                    && model_rejects == 0
                    && likelihood_rejects == 0;
                let radius_held_since_last_reject = match prev_rejected_trust_radius {
                    Some(prev) => {
                        joint_trust_radius.is_finite()
                            && prev.is_finite()
                            && joint_trust_radius >= prev * (1.0 - 1e-12)
                    }
                    None => false,
                };
                if all_attempts_objective_rejected && radius_held_since_last_reject {
                    consecutive_held_rejected_cycles =
                        consecutive_held_rejected_cycles.saturating_add(1);
                } else {
                    consecutive_held_rejected_cycles = 0;
                }
                prev_rejected_trust_radius = Some(joint_trust_radius);
                if consecutive_held_rejected_cycles >= FULLY_REJECTED_STALL_MAX_CYCLES {
                    let last_math_summary = last_joint_math
                        .as_ref()
                        .map(|math| {
                            format!(
                                "last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
                                math.old_kkt_inf,
                                math.linearized_next_kkt_inf,
                                math.actual_reduction,
                                math.predicted_reduction,
                                math.trust_ratio,
                                math.scalar_model_relative_error(),
                                math.step_inf,
                                math.proposal_inf,
                            )
                        })
                        .unwrap_or_else(|| "last_newton_math=<none>".to_string());
                    log::warn!(
                        "[PIRLS/joint-Newton convergence] cycle {:>3} | fully-rejected stall \
                         early-exit: every trust-region attempt rejected on the actual-objective \
                         check for {} consecutive cycles with joint trust radius held at {:.3e} \
                         throughout. Reverted β + held trust radius mean the next cycle's Newton \
                         step is byte-identical to this one's; no descent direction is reachable \
                         from this iterate under the current local model. {}. Returning \
                         unconverged with finite β so the outer optimizer rejects this ρ \
                         evaluation before inner_max_cycles.",
                        cycle,
                        consecutive_held_rejected_cycles,
                        joint_trust_radius,
                        last_math_summary,
                    );
                    converged = false;
                    break;
                }
                // CONTINUE rather than break (gam#826/#872/#715). The comment
                // above documents the intent — "retry the joint Newton loop from
                // the same state after a failed trust-region search" — but the old
                // code BROKE instead, giving up after a SINGLE cycle of failed line
                // search. On a severely near-separating coupled fit (matern
                // binomial location-scale, quasi-separating multinomial, flexible
                // linkwiggle) the cycle-0 Newton proposal is huge (the separation
                // gradient ÷ the Firth-bounded curvature), the trust region clamps
                // it, and the clamped step does not yet reduce the merit — so the
                // FIRST cycle's backtracking exhausts without acceptance. The
                // attempt loop already shrank `joint_trust_radius` /
                // `joint_block_trust_radii` (carried across cycles), so the NEXT
                // cycle re-proposes under the tighter radius and eventually accepts
                // a productive step — standard trust-region globalization. Breaking
                // at cycle 0 aborted the coupled solve ("exited the joint Newton
                // path before convergence — no math snapshot") before the trust
                // region could adapt. The inner cycle cap and the residual-stall /
                // trust-region-floor guards above still bound the loop, so a
                // genuinely stuck fit exits with a diagnosed non-convergence rather
                // than spinning. Falling through to blockwise (the old `break`)
                // would switch the coupled exact-Hessian problem onto a
                // principal-block surrogate (the ridge-drift mode this path avoids).
                continue;
            }

            let grad_reload_started = std::time::Instant::now();
            log::info!(
                "[joint-newton-tr] phase=gradient_reload cycle={} attempts={} r={:.3e}",
                cycle,
                line_search_attempts,
                joint_trust_radius,
            );
            let (log_likelihood, gradient, eval, workspace) = load_joint_gradient_evaluation(
                family,
                specs,
                options,
                &states,
                joint_workspace_requested,
                accepted_joint_workspace.take(),
            )?;
            let grad_reload_elapsed = grad_reload_started.elapsed();
            // Reset the fully-rejected stall guard's bookkeeping: an accepted
            // cycle moved β and may have grown the trust radius, so the next
            // rejected-cycle comparison must start fresh rather than carry
            // forward a stale radius snapshot from the previous reject streak.
            prev_rejected_trust_radius = None;
            consecutive_held_rejected_cycles = 0;
            // Accepted-cycle timing breakdown is debug-only. The per-cycle
            // info line below already includes total cycle time; emitting a
            // four-phase split on every verbose cycle adds a redundant info
            // line. Rejected cycles still keep the detailed phase log since
            // the reject reason and per-phase split is the diagnostic.
            log::debug!(
                "[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=true hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} grad_reload={:.3}s total={:.3}s",
                cycle,
                hessian_and_qp_elapsed.as_secs_f64(),
                line_search_elapsed.as_secs_f64(),
                line_search_attempts,
                grad_reload_elapsed.as_secs_f64(),
                cycle_started.elapsed().as_secs_f64(),
            );
            current_log_likelihood = log_likelihood;
            cached_joint_gradient = gradient;
            cached_eval = eval;
            cached_joint_workspace = workspace;
            current_penalty = total_quadratic_penalty(
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            // `current_penalty` / `lastobjective` stay the pure quadratic-penalized
            // objective (NO Φ folded in) — the Firth value is applied per cycle at
            // each β (see `old_objective` above and `trialobjective` below). The
            // gated Φ at the accepted β is captured separately so the convergence
            // `objective_change` compares the augmented objective at the new vs old
            // β consistently (gam#826/#872).
            lastobjective = -current_log_likelihood + current_penalty;
            let new_phi = if !jeffreys_skippable_this_cycle {
                joint_jeffreys_subspace
                    .as_ref()
                    .map(|z_joint| {
                        custom_family_joint_jeffreys_value(family, &states, specs, &ranges, z_joint)
                    })
                    .unwrap_or(0.0)
            } else {
                0.0
            };
            let accepted_step_inf = states
                .iter()
                .zip(old_beta.iter())
                .flat_map(|(state, old)| {
                    state
                        .beta
                        .iter()
                        .zip(old.iter())
                        .map(|(new, old)| (new - old).abs())
                })
                .fold(0.0_f64, f64::max);
            cycles_done = cycle + 1;

            // Check convergence via joint stationarity. When the family-general
            // Firth/Jeffreys term is armed, the penalized objective the inner
            // Newton actually optimizes is `−ℓ + ½βᵀSβ − Φ`, so its KKT
            // stationarity is `∇L − Sβ + ∇Φ = 0`. The Newton STEP already folds
            // `∇Φ` into its RHS (`spectral_rhs += grad_phi`), but the bare
            // `exact_newton_joint_stationarity_*` residual omits it — at the
            // Firth fixed point `∇L − Sβ = −∇Φ`, so the certificate floors at
            // `‖∇Φ‖∞` and never certifies, stalling the inner solve on exactly
            // the near-separating span Firth is meant to bound (the residual the
            // outer REML then rejects). Fold `∇Φ` into the gradient used for the
            // KKT residual so the convergence criterion matches the augmented
            // objective the step descends. No-op when the Jeffreys term is
            // unavailable or condition-gated to zero.
            let Some(gradient) = cached_joint_gradient.as_ref() else {
                break;
            };
            let jeffreys_augmented_gradient: Option<Array1<f64>> = if jeffreys_skippable_this_cycle
            {
                // Well-conditioned ⇒ ∇Φ = 0, so the KKT residual is the bare
                // stationarity (and floors at 0, not ‖∇Φ‖) — matching the step,
                // which folded H_Φ=0/∇Φ=0 this cycle. Avoids the dense H/eigh.
                None
            } else if let Some(z_joint) = joint_jeffreys_subspace.as_ref() {
                match custom_family_joint_jeffreys_term(family, &states, specs, &ranges, z_joint)? {
                    Some((_phi, grad_phi, hphi))
                        if grad_phi.len() == gradient.len()
                            && hphi.nrows() == total_p
                            && hphi.ncols() == total_p =>
                    {
                        let augmented = gradient + &grad_phi;
                        // Cache the exact triple at the just-accepted β so the next
                        // cycle's head reuses it instead of recomputing the
                        // O(p)-directional-derivative + GEMM term (gam#729).
                        let post_beta_key = flatten_state_betas(&states, specs);
                        jeffreys_triple_cache = Some((post_beta_key, grad_phi, hphi));
                        Some(augmented)
                    }
                    _ => None,
                }
            } else {
                None
            };
            let residual_gradient = jeffreys_augmented_gradient.as_ref().unwrap_or(gradient);
            let residual = exact_newton_joint_stationarity_inf_norm_from_gradient(
                residual_gradient,
                &states,
                specs,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                &block_constraints,
                Some(cached_active_sets.as_slice()),
            )?;
            prev_kkt_norm = Some(residual);
            // Record this cycle's KKT residual for the steady-geometric-descent
            // test at the certificate-refusal gate below (gam#787 centers≥20).
            if residual.is_finite() {
                residual_descent_history.push_back(residual);
                while residual_descent_history.len() > RESIDUAL_DESCENT_WINDOW {
                    residual_descent_history.pop_front();
                }
            }

            // Scale-aware tolerances. The objective check was already
            // relative (`inner_tol * (1 + |obj|)`), but the step and
            // residual checks were absolute against the bare `inner_tol`
            // — at large scale (n ≈ 320k), β iterates can keep moving
            // by ~1e-5 per cycle along the monotonicity-feasible
            // manifold even after the likelihood has gone flat, and the
            // joint gradient ‖·‖_∞ is O(|obj|), not O(1). Running
            // 50-100 cycles past objective convergence is the
            // dominant inner-PIRLS cost at large scale. Switching to
            // relative scaling (`inner_tol * (1 + ‖β‖_∞)` for steps,
            // `inner_tol * (1 + |obj|)` for the gradient residual)
            // exits PIRLS as soon as the optimum is statistically
            // resolved, without loosening behavior at small n where
            // ‖β‖_∞ ≈ 1 and |obj| ≈ 1 give tolerances within 2× of
            // the historical absolute 1e-6.
            let beta_inf = states
                .iter()
                .flat_map(|s| s.beta.iter().copied())
                .map(f64::abs)
                .fold(0.0_f64, f64::max);
            let step_tol = inner_tol * (1.0 + beta_inf);
            let objective_tol = inner_tol * (1.0 + lastobjective.abs());
            // KKT residual tolerance must scale with the natural magnitude of
            // ‖Sβ − ∇L‖∞ (i.e. max(‖∇L‖∞, ‖Sβ‖∞)), not the objective. At
            // large scale with |β|∞ in the 10²–10³ range the gradient and
            // penalty norms can sit orders of magnitude above |obj| and FP
            // noise alone keeps the residual above any obj-scaled tol. The
            // pre-line-search check at the head of the cycle already uses
            // `inner_tol * (1 + max(grad_inf, pen_inf))`; using only grad_inf
            // here created an asymmetry where the same convergence criterion
            // would accept at one site and reject at the other, and on
            // marginal-slope models where Sβ is the larger term it shrank
            // the post-accept tolerance below the achievable FP floor.
            let mut block_gradient_norms = Vec::with_capacity(states.len());
            let mut block_penalty_norms = Vec::with_capacity(states.len());
            for (block_idx, (start, end)) in ranges.iter().copied().enumerate() {
                block_gradient_norms.push(
                    gradient
                        .slice(s![start..end])
                        .iter()
                        .map(|x: &f64| x.abs())
                        .fold(0.0_f64, f64::max),
                );
                let mut penalty_block = s_lambdas[block_idx].dot(&states[block_idx].beta);
                if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                    penalty_block += &states[block_idx].beta.mapv(|v| ridge * v);
                }
                block_penalty_norms.push(
                    penalty_block
                        .iter()
                        .map(|x: &f64| x.abs())
                        .fold(0.0_f64, f64::max),
                );
            }
            let grad_inf = block_gradient_norms.iter().copied().fold(0.0_f64, f64::max);
            let pen_inf = block_penalty_norms.iter().copied().fold(0.0_f64, f64::max);
            // Firth/Jeffreys score magnitude. The convergence residual is the
            // AUGMENTED stationarity `∇L − Sβ + ∇Φ`, so `∇Φ` is a first-class term
            // whose own numerical scale sets the achievable KKT floor: `∇Φ` is a
            // trace `½ tr(H_id⁻¹ Z_Jᵀ Ḣ Z_J)` formed from a FLOORED reduced-info
            // pseudo-inverse, so its components carry O(‖∇Φ‖·ε_floor) round-off
            // that the augmented residual cannot polish below. Scaling the KKT
            // tolerance by `max(grad, pen, ‖∇Φ‖)` (not just grad/pen) makes the
            // certificate reachable for coupled K-block Firth fits whose data
            // gradient is small but whose Firth score is O(1): otherwise the
            // augmented residual plateaus a few × above an unattainably tight
            // `inner_tol·(1+grad)` tol and the solve refuses just short of
            // convergence (gam#729/#715 — the residual stalled at ~8.8e-6 against a
            // ~1e-6 tol). No-op when the term is condition-gated (∇Φ=0).
            let firth_score_inf = head_jeffreys_term
                .as_ref()
                .map(|(grad_phi, _hphi)| grad_phi.iter().map(|v| v.abs()).fold(0.0_f64, f64::max))
                .unwrap_or(0.0);
            let residual_tol = inner_tol * (1.0 + grad_inf.max(pen_inf).max(firth_score_inf));
            let block_stationarity_tolerances = block_gradient_norms
                .iter()
                .zip(&block_penalty_norms)
                .map(|(grad_norm, penalty_norm)| inner_tol * (1.0 + grad_norm.max(*penalty_norm)))
                .collect::<Vec<_>>();
            // Active-set-projected stationarity residual vector (multiplier
            // mass of every pinned bound row already subtracted). Lifted out of
            // the per-block norm reduction so the constrained-stationary
            // certificate below can also test its component in the *range* of
            // the penalized Hessian (gam#553 penalty-null-space acceptance).
            let projected_residual_vec =
                exact_newton_joint_projected_stationarity_vector_from_gradient(
                    gradient,
                    &states,
                    specs,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    &block_constraints,
                    Some(cached_active_sets.as_slice()),
                )?;
            let block_stationarity_norms = {
                let mut offset = 0usize;
                states
                    .iter()
                    .map(|state| {
                        let start = offset;
                        let end = start + state.beta.len();
                        offset = end;
                        projected_residual_vec
                            .slice(ndarray::s![start..end])
                            .iter()
                            .map(|x: &f64| x.abs())
                            .fold(0.0_f64, f64::max)
                    })
                    .collect::<Vec<_>>()
            };
            let all_block_stationarity_small = block_stationarity_norms
                .iter()
                .zip(&block_stationarity_tolerances)
                .all(|(norm, tol)| {
                    norm.is_finite()
                        && tol.is_finite()
                        && *norm <= RESIDUAL_STALL_BLOCK_GRADIENT_FACTOR * *tol
                });
            let near_convergence = residual <= 10.0 * residual_tol;
            // Augmented-objective change: `(quad(new) − Φ_gated(new)) −
            // (quad(old) − Φ_gated(old))`. `lastobjective` is quadratic-only and
            // `old_objective` already carries `−old_phi`, so subtract the accepted
            // β's `new_phi` here to keep both endpoints on the Φ-augmented merit
            // (gam#826/#872). On a skippable cycle both phis are 0 ⇒ identical to
            // the bare quadratic change.
            let signed_obj_change = (lastobjective - new_phi) - old_objective;
            let objective_change = signed_obj_change.abs();

            // Per-cycle observability for the convergence test. Surfaces
            // WHICH criterion is binding (proposed step, accepted step,
            // residual, objective change) at every iteration so CI logs
            // distinguish "Newton hasn't proposed a small step yet"
            // (algorithm still working) from "step is small but residual
            // won't drop below tol" (tolerance scaling problem). Without
            // this, the only visible signal is the objective itself,
            // which is insufficient to choose the right algorithmic
            // remedy.
            //
            // gam#979 discriminator: the PER-BLOCK projected stationarity
            // breakdown. The aggregate `residual` alone cannot distinguish a
            // genuinely-coupled stall from one block dragging the others — for
            // the survival marginal↔logslope grind the question "is the total
            // residual dominated by a single block (the multiplicative
            // z·exp(logslope) coupling channel), or spread evenly (global
            // conditioning)?" is answerable only from the split.
            // `block_stationarity_norms` is already computed above for the
            // convergence test, so surfacing it per cycle is free; reading it
            // across a 75 s repro under
            // RUST_LOG=info tells whether the slowdown is a single stuck block
            // (curvature/coupling channel) or an evenly slow descent
            // (conditioning) — without it the four #979 candidates are not
            // separable from the timeline.
            let block_resid_sig = block_stationarity_norms
                .iter()
                .map(|n| format!("{n:.3e}"))
                .collect::<Vec<_>>()
                .join(",");
            log::info!(
                "[PIRLS/joint-Newton convergence] cycle {:>3} | step_inf={:.3e} (tol={:.3e}) | accepted_step_inf={:.3e} | residual={:.3e} (tol={:.3e}) | per_block_resid=[{}] | obj_change={:.3e} (tol={:.3e}) | beta_inf={:.3e}",
                cycle,
                step_inf,
                step_tol,
                accepted_step_inf,
                residual,
                residual_tol,
                block_resid_sig,
                objective_change,
                objective_tol,
                beta_inf,
            );

            if verbose_cycle || near_convergence {
                log::info!(
                    "[PIRLS/JN] cyc={:>3}/{} obj={:.6e} -loglik={:.6e} pen={:.3e} Δobj={:+.3e} |δ|∞={:.3e} accepted_|δ|∞={:.3e} resid={:.3e} (tol={:.3e}) obj_tol={:.3e} step_tol={:.3e} |β|∞={:.3e} attempts={} t={:.3}s",
                    cycle,
                    inner_max_cycles,
                    lastobjective,
                    -current_log_likelihood,
                    current_penalty,
                    signed_obj_change,
                    step_inf,
                    accepted_step_inf,
                    residual,
                    residual_tol,
                    objective_tol,
                    step_tol,
                    beta_inf,
                    line_search_attempts,
                    cycle_started.elapsed().as_secs_f64(),
                );
            } else {
                log::info!(
                    "[PIRLS/JN] cyc={:>3}/{} obj={:.6e} Δobj={:+.3e} |δ|∞={:.3e} resid={:.3e} attempts={} t={:.3}s",
                    cycle,
                    inner_max_cycles,
                    lastobjective,
                    signed_obj_change,
                    accepted_step_inf,
                    residual,
                    line_search_attempts,
                    cycle_started.elapsed().as_secs_f64(),
                );
            }

            // Divergence guard: a non-finite KKT residual, objective, or
            // log-likelihood means the inner joint Newton has diverged (NaN
            // mass propagating from a near-unidentified penalized block — the
            // binomial location-scale shared-basis log-σ deviation channel is
            // the canonical trigger, gam#554). Every convergence and
            // residual-stall exit below is gated on finite `<=` comparisons,
            // which a NaN residual silently defeats; left unguarded the loop
            // then grinds the full `inner_loop_hard_ceiling` on every outer
            // ρ-eval and every startup seed, which is the multi-hour "hang".
            // Treat it as immediate non-convergence so the outer optimizer
            // rejects this point cleanly instead of burning the budget.
            if !residual.is_finite()
                || !lastobjective.is_finite()
                || !current_log_likelihood.is_finite()
            {
                log::warn!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | divergence guard: non-finite inner state (residual={:.3e}, objective={:.3e}, -loglik={:.3e}); returning unconverged so the outer optimizer rejects this ρ evaluation instead of running to inner_max_cycles.",
                    cycle,
                    residual,
                    lastobjective,
                    -current_log_likelihood,
                );
                converged = false;
                break;
            }

            // KKT convergence: a small post-step residual is the
            // canonical optimality certificate for the penalized
            // objective. ‖∇L(β) − Sβ‖∞ ≤ residual_tol means the
            // iterate is at a KKT point to numerical precision and
            // further iteration cannot reduce it; the step magnitude
            // is irrelevant once the residual signal has fired.
            //
            // Tying convergence to a small step instead would refuse
            // to recognise quadratic-rate single-shot convergence:
            // exact Newton on an exact quadratic produces one full
            // step that lands at the optimum, so ‖delta‖∞ equals the
            // initial distance ‖β* − β₀‖∞ no matter how exact the
            // model is. Pairing a residual check with a step-size
            // requirement structurally rejects this entirely-correct
            // cycle-0 termination, leaving inner_max_cycles=1 callers
            // unable to certify convergence on a problem that was
            // solved exactly in one Newton step.
            if joint_inner_kkt_converged(residual, residual_tol) {
                converged = true;
                break;
            }
            // Identified-subspace (range-space) KKT certificate.
            //
            // The strict certificate above tests the FULL stationarity residual
            // ‖∇L − Sβ‖∞. On a genuinely rank-deficient penalized inner problem
            // — a degenerate small-n transformation-normal CTM/Box-Cox fit whose
            // joint Hessian carries an *unidentified* direction the
            // canonical-gauge pass cannot attribute to a single block (the same
            // structural null root-caused for the joint-Newton panic at
            // `solve_joint_newton_step_on_spectral_range`) — the stationarity
            // gradient keeps a fixed nonzero component inside ker(H_pen). The
            // spectral Newton step drops exactly that component (range-restricted
            // Moore–Penrose step: every null direction hits the `continue` branch
            // in the accumulation loop), so β converges on the identified
            // subspace and the step exhausts, yet the FULL residual never reaches
            // `residual_tol`. The strict test then runs the whole cycle budget
            // "non-converged" on an iterate that is, in fact, the optimum on the
            // only identifiable directions.
            //
            // The principled certificate is stationarity on range(H_pen): the
            // residual restricted to the curved (identified) subspace is at
            // tolerance while the leftover mass is provably confined to
            // ker(H_pen) — an unidentified direction with neither curvature nor
            // constraint. That null component is dropped by the spectral step
            // here and projected out of the KKT residual by the outer IFT
            // pseudo-inverse `U_S·H_proj⁻¹·U_Sᵀ` before the envelope correction
            // (see the gam#553 note and `projected_residual_range_space_inf`), so
            // it cannot bias the outer gradient.
            //
            // The remaining requirement is to prove we are AT the
            // range-restricted optimum rather than mid-descent, so this does not
            // short-circuit a genuinely nonlinear CTM fit that is still moving β.
            // There are two independent, equally-rigorous proofs of that, and
            // EITHER suffices once `range_residual ≤ residual_tol` has fired:
            //   (a) the full Newton step is exhausted (`step_inf ≤ step_tol`):
            //       the well-identified case, where the range-restricted step
            //       collapses to zero and the leftover ker(H_pen) component is
            //       already dropped by the spectral step, so the FULL step is
            //       small too; OR
            //   (b) the objective has stopped changing
            //       (`objective_change ≤ objective_tol`): the joint objective
            //       (−loglik + ½βᵀSβ) is a function of the IDENTIFIED coordinates
            //       ONLY — moving β along an unidentified direction in ker(H_pen)
            //       = ker(H_L) ∩ ker(S) changes neither the likelihood nor the
            //       penalty by construction — so a flat objective proves no
            //       identified-direction descent remains regardless of how large
            //       the FULL step is.
            // Proof (b) is the certificate that the constant-scale AFT (#736) and
            // the degenerate CTM (#733/#734) need: their unidentified cross-block
            // null (the time_transform polynomial/affine deviation aliased into
            // threshold/log_sigma) keeps the Levenberg-damped, trust-region-clamped
            // FULL step perpetually nonzero — `step_inf` never reaches `step_tol`
            // — even though the identified fit is exactly at its optimum (zero
            // range-space residual, frozen objective). Tying the certificate ONLY
            // to the full step (proof (a)) therefore burned the entire 200/84-cycle
            // budget on an iterate that is already optimal on every identifiable
            // direction, and the inner solve was rejected by the FULL-residual KKT
            // check. Adding proof (b) certifies on the identified subspace without
            // loosening anything for a genuinely-identified fit: there
            // `projected_residual_range_space_inf` returns `None` (nullity == 0 ⇒
            // range == whole space), so this branch is dormant and the strict
            // full-residual path above governs unchanged.
            //
            // Unlike the constrained-stationary path below, this fires on a pure
            // identifiability null without requiring the `linearized_rel ≥ 0.5`
            // constraint-multiplier signature, which a structural rank-deficiency
            // need not produce.
            if (step_inf <= step_tol || objective_change <= objective_tol)
                && let Some(range_residual) = projected_residual_range_space_inf(
                    &projected_residual_vec,
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    total_p,
                )
                && range_residual <= residual_tol
            {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | identified-subspace KKT certificate: total residual={:.3e} > tol={:.3e} but its range-space (identified-subspace) component={:.3e} ≤ tol={:.3e}, step_inf={:.3e} (step_tol={:.3e}), |Δobjective|={:.3e} (obj_tol={:.3e}); the leftover residual lies in the unidentified penalized-Hessian null space ker(H_pen) (dropped by the range-restricted spectral step and projected out by the outer IFT pseudo-inverse) — the iterate is stationary on the entire identifiable subspace (proof: {}).",
                    cycle,
                    residual,
                    residual_tol,
                    range_residual,
                    residual_tol,
                    step_inf,
                    step_tol,
                    objective_change,
                    objective_tol,
                    if step_inf <= step_tol {
                        "full Newton step exhausted"
                    } else {
                        "objective frozen on the identified subspace while the unidentified null keeps the full step nonzero"
                    },
                );
                converged = true;
                break;
            }
            // Noise-floor KKT certificate.
            //
            // Reading the joint stationarity residual ‖∇L(β) − Sβ‖_∞ at finite
            // precision picks up rounding mass from the X'WX assembly and the
            // per-block penalty contraction. For well-conditioned problems
            // that floor sits well below `residual_tol`, so the strict path
            // fires and this branch is dormant. For tightly converged inner
            // states where the Newton iterate is already at the analytic
            // optimum but every additional step changes the objective by less
            // than `objective_tol` and the recomputed residual lands just
            // above `residual_tol` due to arithmetic noise, the strict path
            // alone refuses to certify convergence — even though no further
            // useful descent direction exists. Burning hundreds of identical
            // descent cycles past that point neither tightens the inner
            // optimum (the noise floor sets a hard lower bound on ‖rhs‖) nor
            // gives the outer optimizer more hyperparameter information; it
            // just causes the outer wrapper to reject every seed as
            // "inner did not converge" and downstream callers to mark the
            // analytic outer Hessian as unavailable.
            //
            // Combining two independent post-step signals — objective change
            // within scale-aware tolerance AND residual within the same KKT
            // tolerance — supplies the missing certificate without weakening
            // the envelope-theorem requirement. A residual above tolerance
            // can be a free Hessian-null gradient component, not an active
            // multiplier, so it must not be accepted by an objective-flatness
            // rule.
            //
            // Distinct from the strict path because the strict path is silent
            // on objective change;
            // distinct from the trust-region floor certificate at the head
            // of the cycle because that one fires only when the trust radius
            // has collapsed to its 1e-12 floor with all attempts rejected,
            // whereas this branch fires when the trust region is still open
            // but each accepted step is no longer producing detectable
            // objective progress.
            let objective_change = signed_obj_change.abs();
            if objective_change.is_finite() {
                geometric_tail_history.push_back(objective_change);
                while geometric_tail_history.len() > GEOMETRIC_TAIL_WINDOW {
                    geometric_tail_history.pop_front();
                }
            }
            if objective_change <= objective_tol && residual <= residual_tol {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | noise-floor KKT certificate: residual={:.3e} <= tol={:.3e}, |Δobjective|={:.3e} <= obj_tol={:.3e}",
                    cycle,
                    residual,
                    residual_tol,
                    objective_change,
                    objective_tol,
                );
                converged = true;
                break;
            }

            // Constrained-stationary certificate.
            //
            // The inner Newton system is `Hδ = -g`, solved over the
            // active-constraint-aware subspace (the QP step path).  When
            // the *unprojected* gradient `g` carries a large Lagrange-
            // multiplier component pointing into the constraint —
            // i.e. some β coordinates are pinned at the bound or against
            // the family's structural constraint surface — the linear
            // solve correctly DOES NOT try to eliminate that component,
            // because doing so would push β infeasibly.  The signature of
            // this state is precise and entirely local to the most recent
            // accepted step:
            //
            //   • `‖g + Hδ‖∞ / ‖g‖∞ ≥ 0.5` — the linear solve neutralised
            //     ≤ 50 % of g; the remainder is structurally outside the
            //     solver's range, i.e. it's a Lagrange multiplier of the
            //     active constraints, not a defect of the linear solve.
            //   • `|actual − pred| / max(|pred|, …) ≤ 1e-3` — the local
            //     quadratic Newton model agrees with the actual objective
            //     change to roundoff, so the Hessian and gradient are
            //     correct AT this β.  The "stuck" residual is not noise
            //     in the linearisation; it's a real multiplier.
            //   • `|Δobjective| ≤ objective_tol` — the objective has
            //     ceased moving meaningfully.
            //   • `|δ|∞ ≤ step_tol` — the accepted feasible Newton step is
            //     exhausted. Objective flatness alone is not a terminal
            //     signal on large survival fits: a step of O(1e-2..1e-1)
            //     can still continue reducing the KKT residual after the
            //     objective first crosses tolerance.
            //
            // Together these four are the rigorous certificate that
            // Newton has reached a constrained-stationary point: further
            // cycles would reproduce the same plateau (the diagnostic in
            // PIRLS/JN/math shows `‖g+Hδ‖/‖g‖` constant near 1 cycle
            // after cycle, the very signature this certificate names).
            //
            // The 0.5 threshold on `linearized_rel` is conservative —
            // an unconstrained Newton step has `linearized_rel ≈ 1e-12`;
            // a step deliberately constrained to a (k-1)-dim subspace
            // leaves the orthogonal Lagrange direction in the residual
            // and `linearized_rel ≈ |λ|/|g| > 0`, typically 0.9+ in
            // practice when the multiplier dominates.  Anything ≥ 0.5
            // is unambiguously in the constrained-stationary regime;
            // unconstrained Newton with `linearized_rel ≥ 0.5` would
            // have already failed the trust-region's scalar model test
            // and been rejected upstream.
            if let Some(math) = last_joint_math.as_ref() {
                let linearized_rel = math.linearized_rel();
                let scalar_model_relerr = math.scalar_model_relative_error();
                let geometric_tail_bound = if geometric_tail_history.len() == GEOMETRIC_TAIL_WINDOW
                {
                    let values = geometric_tail_history.iter().copied().collect::<Vec<_>>();
                    let mut max_ratio = 0.0_f64;
                    let mut valid = true;
                    for pair in values.windows(2) {
                        let prev = pair[0];
                        let next = pair[1];
                        if prev <= 0.0 || next < 0.0 || !prev.is_finite() || !next.is_finite() {
                            valid = false;
                            break;
                        }
                        let ratio = next / prev;
                        if !ratio.is_finite() || ratio >= 1.0 {
                            valid = false;
                            break;
                        }
                        max_ratio = max_ratio.max(ratio);
                    }
                    if valid {
                        Some(objective_change / (1.0 - max_ratio).max(1.0e-12))
                    } else {
                        None
                    }
                } else {
                    None
                };
                let certificate_decision = constrained_stationary_certificate_decision(
                    math,
                    objective_change,
                    objective_tol,
                    step_tol,
                    geometric_tail_bound,
                    residual,
                    residual_tol,
                );
                if !matches!(
                    certificate_decision,
                    ConstrainedStationaryCertificate::NotCandidate
                ) {
                    // The `linearized_rel >= 0.5` signal is necessary but not
                    // sufficient. It proves either (a) g carries a Lagrange
                    // multiplier of an active constraint that the QP's active
                    // set already represents — in which case the *projected*
                    // residual is at tolerance — or (b) H is rank-deficient
                    // in the direction of g, so Hδ ≈ 0 along the null
                    // direction regardless of whether g is a multiplier or a
                    // real defect. Case (b) is the survival marginal-slope
                    // pathology at large scale: H σ_min ≈ 1e-12 and Newton
                    // genuinely cannot move g, but the residual is NOT a
                    // captured multiplier — it's an unresolved KKT defect in
                    // the H-null subspace.
                    //
                    // The projected residual computed at the top of this
                    // block already subtracts the multiplier
                    // mass of every row in `cached_active_sets`. If that
                    // residual is at tolerance, case (a) holds and the
                    // certificate is honest. If it's still orders of
                    // magnitude above tolerance, case (b) holds: certifying
                    // here would hand the unified evaluator a
                    // `kkt_residual` with norm ≈ ‖g‖ which then gets
                    // amplified by H⁻¹_proj in the cost/gradient IFT
                    // corrections, contaminating the envelope formula and
                    // triggering the "envelope-gradient consistency"
                    // tripwire downstream. Bail with `converged = false` so
                    // the outer optimizer rejects this ρ cleanly, exactly
                    // as it would on any other non-converged inner exit.
                    let cert_residual_factor = 1.0;
                    if matches!(
                        certificate_decision,
                        ConstrainedStationaryCertificate::Accept
                    ) {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | constrained-stationary certificate: \
                             linear-solve neutralised {:.1}% of g (the remaining {:.1}% is a Lagrange multiplier \
                             of the active constraint set, not an unresolved gradient); \
                             scalar Newton model agrees with reality to relerr={:.3e} (Hessian+gradient are correct \
                             at this β); projected residual={:.3e} ≤ {:.1}×tol={:.3e} (multipliers captured by active set); \
                             |Δobjective|={:.3e}, geometric_tail_bound={:.3e}, obj_tol={:.3e}; further cycles cannot reduce the \
                             multiplier mass and would reproduce this plateau indefinitely; \
                             active-set multiplier mass will be projected out of the KKT residual \
                             before the outer IFT correction is assembled",
                            cycle,
                            (1.0 - linearized_rel) * 100.0,
                            linearized_rel * 100.0,
                            scalar_model_relerr,
                            residual,
                            cert_residual_factor,
                            cert_residual_factor * residual_tol,
                            objective_change,
                            geometric_tail_bound.unwrap_or(objective_change),
                            objective_tol,
                        );
                        converged = true;
                        break;
                    }
                    // Penalty-null-space acceptance (gam#553). The phantom-
                    // multiplier refusal fires when the active-set-projected
                    // residual is above tolerance, but that residual can be
                    // confined to `ker(H_pen)` — the polynomial null space of a
                    // penalized smooth (TP / Bernstein trend) that the censored
                    // location-scale / custom-family data does not pin down in
                    // the time_transform / log_sigma channel. Along that
                    // direction there is neither curvature nor a constraint, so
                    // it is a genuinely free gauge direction and the iterate is
                    // stationary on the entire identifiable (range) subspace.
                    // The downstream outer IFT trace removes exactly this
                    // null-space component via the projected pseudo-inverse, so
                    // only a *range-space* residual biases the envelope gradient
                    // (the precise concern of the "do NOT soft-accept" note
                    // below). Accept iff the range-space residual is at
                    // tolerance — preserving outer-gradient correctness while no
                    // longer aborting a well-posed fit on a data-unconstrained
                    // null direction.
                    if let Some(range_residual) = projected_residual_range_space_inf(
                        &projected_residual_vec,
                        &joint_hessian_source,
                        &ranges,
                        &s_lambdas,
                        ridge,
                        options.ridge_policy,
                        total_p,
                    ) && range_residual <= cert_residual_factor * residual_tol
                    {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | penalty-null-space certificate (gam#553): \
                             total projected residual={:.3e} > tol={:.3e} but its range-space (curved-subspace) \
                             component={:.3e} ≤ {:.1}×tol={:.3e}; the remaining residual lies in the data-unconstrained \
                             penalty null space ker(H_pen) (a free polynomial-trend gauge direction, not a defect) and is \
                             projected out of the KKT residual by the outer IFT pseudo-inverse before the envelope \
                             correction; |Δobjective|={:.3e}, obj_tol={:.3e}",
                            cycle,
                            residual,
                            cert_residual_factor * residual_tol,
                            range_residual,
                            cert_residual_factor,
                            cert_residual_factor * residual_tol,
                            objective_change,
                            objective_tol,
                        );
                        converged = true;
                        break;
                    }
                    // Constrained exact-fixed-point acceptance (gam#797).
                    //
                    // We reach here only with the iterate ALREADY proven stationary
                    // (objective + step exhausted, `linearized_rel >= 0.5` so the
                    // residual is multiplier/null mass, `scalar_relerr <= 1e-3` so
                    // the quadratic model is exact), the strict/range-space/noise
                    // certificates having declined. For a CONSTRAINED block the
                    // remaining residual can be a genuine active-constraint Lagrange
                    // multiplier that the active-set QP under-identified (it reports
                    // only rows it drove tight during a non-degenerate step, so a
                    // monotone derivative-guard row tight at the optimum but never
                    // explicitly stepped is missing), leaving the cone projection
                    // unable to decompose `r = A_activeᵀ λ` and the residual stuck
                    // far above tol on an iterate that is EXACTLY the constrained
                    // optimum (the `active_set_incomplete` refusal; gam#797 survival
                    // marginal/logslope/time blocks).
                    //
                    // When (a) the joint Newton has reached a numerical FIXED POINT
                    // — the accepted step and objective change are both at the
                    // machine-epsilon floor relative to the iterate, so no further
                    // progress is mathematically possible — (b) the local quadratic
                    // model is exact (`scalar_relerr` tiny), and (c) the design
                    // carries linear inequality constraints AND `H_pen` has NO
                    // numerical null space (so the residual is an active-constraint
                    // multiplier, NOT an H-null/rank-deficient defect, which the
                    // range-space certificate above already handles), the iterate is
                    // a bona fide constrained KKT point. The active-constraint
                    // multiplier mass is projected out of the KKT residual by the
                    // unified evaluator's active-constraint-aware IFT correction
                    // before the envelope gradient, exactly as for an explicitly
                    // captured multiplier, so certifying here is correct. Gated
                    // strictly on a fixed point with no H-null, so a genuinely
                    // non-converged or rank-deficient iterate is never accepted.
                    let any_block_constrained = block_constraints.iter().any(|c| c.is_some());
                    let beta_scale = states
                        .iter()
                        .flat_map(|s| s.beta.iter().copied())
                        .map(f64::abs)
                        .fold(0.0_f64, f64::max)
                        .max(1.0);
                    let fixed_point_floor = 64.0 * f64::EPSILON * beta_scale;
                    let objective_floor = 64.0 * f64::EPSILON * (1.0 + lastobjective.abs());
                    let at_numerical_fixed_point = accepted_step_inf.is_finite()
                        && accepted_step_inf <= fixed_point_floor
                        && objective_change <= objective_floor
                        && scalar_model_relerr <= 1e-3;
                    if any_block_constrained && at_numerical_fixed_point {
                        // Materialize H_pen = H + S(λ) (+ model ridge) and count its
                        // numerical null space at the shared rank tolerance: nullity == 0
                        // ⇒ the stuck residual is NOT an H-null/rank-deficient defect
                        // (that case is handled by the range-space certificate above) but
                        // a genuine active-constraint multiplier.
                        let hpen_nullity = materialize_joint_hessian_source(
                            &joint_hessian_source,
                            total_p,
                            "constrained fixed-point nullity check",
                        )
                        .ok()
                        .map(|mut h_pen| {
                            let model_diagonal_ridge =
                                if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                                    ridge
                                } else {
                                    0.0
                                };
                            add_joint_penalty_to_matrix(
                                &mut h_pen,
                                &ranges,
                                &s_lambdas,
                                model_diagonal_ridge,
                                None,
                            );
                            symmetrize_dense_in_place(&mut h_pen);
                            symmetric_penalized_hessian_nullity(&h_pen)
                        })
                        .unwrap_or(None);
                        if hpen_nullity == Some(0) {
                            log::info!(
                                "[PIRLS/joint-Newton convergence] cycle {:>3} | constrained fixed-point certificate:                                  accepted_step_inf={:.3e} ≤ {:.3e} and |Δobjective|={:.3e} ≤ {:.3e} (numerical fixed point),                                  scalar_relerr={:.3e}, linearized_rel={:.3e}; H_pen has no numerical null space so the                                  residual={:.3e} is an active-constraint Lagrange multiplier (the QP under-identified the                                  binding rows), projected out of the KKT residual by the active-constraint-aware IFT                                  correction before the envelope gradient — the iterate is a constrained KKT point",
                                cycle,
                                accepted_step_inf,
                                fixed_point_floor,
                                objective_change,
                                objective_floor,
                                scalar_model_relerr,
                                linearized_rel,
                                residual,
                            );
                            converged = true;
                            break;
                        }
                    }
                    // Still-converging guard (gam#787 duchon centers≥20). The
                    // certificates above all declined, so the iterate would be
                    // refused as a multiplier/null plateau. But the
                    // `linearized_rel ≥ 0.5` + flat-objective signature that
                    // routed us here ALSO holds for a logslope block whose
                    // objective is already at its Φ-bounded floor while the KKT
                    // residual is still polishing by a STEADY geometric factor
                    // each cycle. Refusing there rejects the seed a few cycles
                    // short of `residual_tol` (→ outer seed-rejection → raise).
                    // If the residual is in steady geometric descent over the
                    // recent window, the direction is genuinely converging, not
                    // plateaued: keep iterating (bounded by the inner cycle cap)
                    // rather than refuse. The genuine plateau (flat/oscillating
                    // residual above tol) fails this test and refuses as before.
                    if residual_in_steady_geometric_descent(&residual_descent_history) {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | certificate declined but residual in steady geometric descent (history={:?}, residual={:.3e}, tol={:.3e}); continuing to convergence rather than refusing as a plateau",
                            cycle,
                            residual_descent_history,
                            residual,
                            residual_tol,
                        );
                        continue;
                    }
                    // EARLY-CYCLE CARVE-OUT (gam#826/#872). The phantom-multiplier
                    // refusal asserts that the residual is a captured Lagrange
                    // multiplier / H-null mass that Newton genuinely cannot move —
                    // a claim that requires EVIDENCE of a plateau. The candidate
                    // conditions above (objective + step exhausted, linearized_rel ≥
                    // 0.5) are ALSO satisfied transiently when a single Newton step
                    // is small because the augmented (Firth) curvature `H_Φ` is
                    // legitimately large in the `∇Φ` direction at an oversmoothed
                    // cycle-0 seed: the step `(H+Sλ+H_Φ)⁻¹(∇L−Sβ+∇Φ)` is tiny (high
                    // curvature ⇒ short step) and ONE step undershoots the
                    // nonquadratic Firth optimum, so `step_inf` and `|Δobj|` look
                    // exhausted while the residual is still O(‖∇Φ‖) ≫ tol. Refusing
                    // there at cycle 0 (no descent history yet) aborts the coupled
                    // binomial location-scale / flexible-linkwiggle fit before the
                    // inner has taken the handful of cycles it needs to walk the
                    // curved Firth basin to its optimum. When the residual is still
                    // above the certificate band (`cert_residual_factor ·
                    // residual_tol`) and we lack a full descent window to prove a
                    // genuine plateau, keep iterating — the inner cycle cap and the
                    // residual-stall / trust-region-floor guards still bound the
                    // loop and diagnose a true non-convergence. A genuine multiplier
                    // plateau (residual flat across the window) is caught once the
                    // history fills, exactly as before. The threshold is the same
                    // `RESIDUAL_DESCENT_WINDOW` the descent test uses, so this only
                    // defers the refusal until there is enough history to make it,
                    // never weakens it.
                    let residual_far_above_tol = residual.is_finite()
                        && residual_tol.is_finite()
                        && residual > cert_residual_factor * residual_tol;
                    if residual_far_above_tol
                        && residual_descent_history.len() < RESIDUAL_DESCENT_WINDOW
                    {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | constrained-stationary refusal DEFERRED: residual={:.3e} ≫ tol={:.3e} but only {} descent samples (< {} window) — too early to prove a multiplier/null plateau vs a high-curvature Firth-basin transient; continuing",
                            cycle,
                            residual,
                            residual_tol,
                            residual_descent_history.len(),
                            RESIDUAL_DESCENT_WINDOW,
                        );
                        continue;
                    }
                    // UNCONSTRAINED MODEL-STATIONARY ACCEPTANCE (gam#826/#808/#715).
                    //
                    // The phantom-multiplier refusal asserts the residual is a
                    // captured Lagrange multiplier of an active constraint that
                    // the QP could not decompose. That diagnosis is categorically
                    // IMPOSSIBLE when there is no active constraint at all: a
                    // residual cannot be a phantom multiplier of a constraint that
                    // does not exist. For a fully UNCONSTRAINED coupled fit
                    // (multinomial softmax; the location-scale flat blocks) on a
                    // near-flat Fisher surface (`diag(p)−ppᵀ → 0`, or the
                    // high-curvature/low-curvature `log_sigma` block) the
                    // Firth-augmented stationarity residual `‖∇L−Sβ+∇Φ‖` floors
                    // LEGITIMATELY above `4·residual_tol`: the absolute curvature
                    // is tiny so `residual_tol = inner_tol·(1+grad/pen/firth)` is
                    // tiny too, yet the Newton/dogleg step exhausts before the
                    // residual drops below that band — `residual_tol` is scaled by
                    // the gradient magnitude and does not see the flat-Fisher
                    // absolute-curvature floor. The well-conditioned spectrum keeps
                    // the conditioning-keyed Levenberg gate (`COND_NEWTON_SAFETY`)
                    // off, so neither LM nor the cond-armed dogleg engages, and
                    // every seed is refused as `phantom_multiplier_with_well_
                    // conditioned_H`.
                    //
                    // When the model itself certifies stationarity — the standard
                    // trust-region "predicted decrease ≈ 0" criterion, here the
                    // certificate-candidate conditions that routed us into this
                    // block (step at the `step_tol` floor, |Δobj| exhausted, scalar
                    // model exact to relerr ≤ 1e-3; deliberately NOT the stricter
                    // machine-eps `at_numerical_fixed_point` flag, see the last
                    // paragraph of this comment) — AND no further progress is being
                    // made (the steady-geometric-descent test above declined) AND
                    // the early-cycle deferral above did not fire (either the
                    // descent window is full, so this is a proven plateau not a
                    // Firth-basin transient, or the residual is already inside the
                    // `cert_residual_factor · residual_tol` band), an unconstrained
                    // iterate is a bona fide first-order optimum: the quadratic
                    // model says no step can reduce the residual further, and there
                    // is no constraint whose multiplier the residual could
                    // otherwise represent. The residual that remains lives where
                    // the model is flat (vanishing curvature), so it carries no
                    // `gᵀ∂β/∂ρ` envelope contribution the outer IFT could not
                    // already neutralise through its penalty-projected
                    // pseudo-inverse. Accept.
                    //
                    // This does NOT regress #729 (coupled Dirichlet): that fit
                    // converges to a genuine `residual < residual_tol` and exits
                    // via the strict KKT certificate long before this branch, and
                    // even if reached it has a curved (non-flat) Fisher surface so
                    // its model is not at a fixed point with a residual stuck above
                    // tol. It does NOT mask a real non-convergence: a still-moving
                    // iterate fails the certificate-candidate conditions that gate
                    // this block (its step / |Δobj| are above the `step_tol` /
                    // objective-exhaustion floors), and a rank-deficient H-null
                    // defect is the CONSTRAINED concern the fixed-point certificate
                    // above already handles via its nullity check.
                    // The certificate-candidate conditions that routed us into
                    // this block already PROVE model stationarity for the
                    // unconstrained case: `objective_exhausted` + `step_inf ≤
                    // step_tol` (the model's minimizer is at this β), `scalar_relerr
                    // ≤ 1e-3` (the quadratic model is exact), and `linearized_rel ≥
                    // 0.5` (‖g+Hδ‖ ≈ ‖g‖, so `Hδ ≈ 0` — the residual lives in the
                    // flat/near-null subspace of H, exactly a flat-Fisher direction
                    // for an unconstrained fit). We do NOT additionally require the
                    // far stricter machine-eps `at_numerical_fixed_point` here: on a
                    // flat Fisher surface the dogleg keeps taking a small step at
                    // the `step_tol` floor every cycle, so `accepted_step_inf` floors
                    // a hair above `64·eps·|β|` and the eps-fixed-point flag never
                    // sets even though the model is stationary. The `step_tol` floor
                    // (`inner_tol·(1+|β|∞)`) is the principled stationarity gate; the
                    // eps floor is for the constrained-multiplier certificate, where
                    // a tighter proof is warranted because a wrong accept biases the
                    // constraint-aware IFT kernel.
                    let any_active_set_rows = cached_active_sets
                        .iter()
                        .any(|maybe| maybe.as_ref().is_some_and(|rows| !rows.is_empty()));
                    let unconstrained_fit = !any_block_constrained && !any_active_set_rows;
                    if unconstrained_fit {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | unconstrained model-stationary certificate (gam#826/#808/#715): \
                             no active constraint (active_set_rows_total=0) so the residual={:.3e} cannot be a phantom multiplier; \
                             the iterate is a numerical fixed point (accepted_step_inf={:.3e}, |Δobjective|={:.3e}, scalar_relerr={:.3e}) \
                             on a flat Fisher surface where residual_tol={:.3e} sits below the absolute-curvature floor; \
                             linearized_rel={:.3e}, |Δobjective| exhausted and residual not in steady descent → genuine first-order optimum, accepting",
                            cycle,
                            residual,
                            accepted_step_inf,
                            objective_change,
                            scalar_model_relerr,
                            residual_tol,
                            linearized_rel,
                        );
                        converged = true;
                        break;
                    }
                    // Structured per-block + per-spectrum refusal report.
                    // The legacy one-line refusal log printed only aggregate
                    // numbers (linearized_rel, scalar_relerr, residual,
                    // |Δobj|) and was not actionable on models with many
                    // blocks: it could not identify WHICH smooth carried
                    // the unresolved mass, nor whether H_pen was genuinely
                    // rank-deficient (the "polynomial null space slipped
                    // past absorption" pathology). Cost: one dense
                    // materialize + symmetric eigh on H_pen at this β,
                    // sub-millisecond for typical p, executed once per
                    // refusal (the loop breaks immediately after).
                    let report = compute_kkt_refusal_report(
                        cycle,
                        &states,
                        specs,
                        &s_lambdas,
                        &ranges,
                        cached_joint_gradient.as_ref(),
                        &cached_active_sets,
                        &block_constraints,
                        Some(&joint_hessian_source),
                        total_p,
                        ridge,
                        options.ridge_policy,
                        accepted_step_inf,
                        step_inf,
                        joint_trust_radius,
                        residual_tol,
                        objective_tol,
                        step_tol,
                        objective_change,
                        residual,
                        Some(&math),
                    );
                    log::warn!(
                        "{}",
                        report.format_structured_log(cert_residual_factor * residual_tol)
                    );
                    last_kkt_refusal_report = Some(report);
                    converged = false;
                    break;
                }
            }

            // INVESTIGATION NOTE — do NOT soft-accept here.
            //
            // The outer objective is V(ρ) = f(β*(ρ), ρ), where β*(ρ)
            // satisfies g(β*,ρ)=∇_β f=0.  The envelope/IFT gradient used
            // by the outer optimizer is
            //
            //   dV/dρ_j = ∂f/∂ρ_j
            //
            // only at g=0.  At a non-stationary β, the actual chain rule is
            //
            //   d f(β(ρ),ρ)/dρ_j = ∂f/∂ρ_j + gᵀ ∂β/∂ρ_j.
            //
            // A soft certificate based only on small Δf discards the second
            // term without proving it is small.  The projected pseudo-inverse
            // in the outer trace path removes null-space components of g, but
            // any range-space component still contributes gᵀ∂β/∂ρ and gives
            // ARC/BFGS a biased outer gradient.  The `[PIRLS/JN/math]` line
            // above now prints the actual Newton identity:
            //
            //   old_kkt = ‖g‖∞,
            //   linearized_next = ‖g + Hδ‖∞ = ‖Hδ-rhs‖∞,
            //   new_kkt = ‖g(β+δ)‖∞,
            //   scalar_model relerr = |actual-pred|/max(1,|pred|).
            //
            // That is the proof surface. The diagnostic reports the measured
            // linear solve residual, post-step KKT residual, scalar model
            // error, and step sizes directly; downstream analysis should use
            // those numbers rather than this solver attaching labels.

            // Residual-stall early-exit. The strict and noise-floor
            // certificates above require the KKT residual to land within
            // a small multiple of residual_tol. On survival marginal-slope
            // at large scale the residual oscillates in a band that is
            // orders of magnitude above tol without trending down while
            // the unconstrained proposal has |prop|∞ in the 10³–10⁶ range,
            // the TR clamps it, and each clamped step moves β by O(1)
            // without driving ‖∇L − Sβ‖∞ closer to KKT.
            //
            // Spending the remaining cycle budget on this pattern hits
            // inner_max_cycles "non-converged", which then routes the
            // outer optimizer through the first-order bridge with a stale
            // same-ρ inner mode and a gradient of magnitude 10⁷ that kills
            // BFGS line search at iter 0 (the failure mode pinned in the
            // commit messages of 6578e884 and 1c181d1f).
            //
            // Track the best residual seen so far and the number of
            // cycles since any meaningful improvement (≥ 10 % drop). Once
            // the inner has burned at least RESIDUAL_STALL_MIN_CYCLES
            // without progress, the accepted step kept hitting the
            // trust-region clamp, AND every block is already inside a
            // loose stationarity band, return `converged = false` with
            // the current finite β. The per-block gate is essential for
            // block-metric trust regions: an aggregate residual plateau
            // dominated by one near-singular block must not hide an
            // unresolved marginal block that can still make progress under
            // its own radius.
            if residual.is_finite() {
                if residual < RESIDUAL_STALL_IMPROVEMENT_FACTOR * best_residual_seen {
                    best_residual_seen = residual;
                    cycles_since_residual_improved = 0;
                    tr_clamped_during_stall = false;
                } else {
                    cycles_since_residual_improved =
                        cycles_since_residual_improved.saturating_add(1);
                    if last_accepted_hit_joint_trust_boundary {
                        tr_clamped_during_stall = true;
                    }
                }
            }
            if cycle + 1 >= RESIDUAL_STALL_MIN_CYCLES
                && cycles_since_residual_improved >= RESIDUAL_STALL_NO_IMPROVE_CYCLES
                && tr_clamped_during_stall
                && all_block_stationarity_small
            {
                let last_math_summary = last_joint_math
                    .as_ref()
                    .map(|math| {
                        format!(
                            "last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
                            math.old_kkt_inf,
                            math.linearized_next_kkt_inf,
                            math.actual_reduction,
                            math.predicted_reduction,
                            math.trust_ratio,
                            math.scalar_model_relative_error(),
                            math.step_inf,
                            math.proposal_inf,
                        )
                    })
                    .unwrap_or_else(|| "last_newton_math=<none>".to_string());
                log::warn!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | residual-stall early-exit: residual={:.3e} best_seen={:.3e} no_improve_cycles={} accepted_step_inf={:.3e} trust_radius={:.3e} block_stationarity_inf={:?} {}; returning unconverged with finite β so the outer optimizer rejects this ρ evaluation before inner_max_cycles.",
                    cycle,
                    residual,
                    best_residual_seen,
                    cycles_since_residual_improved,
                    accepted_step_inf,
                    joint_trust_radius,
                    block_stationarity_norms,
                    last_math_summary,
                );
                converged = false;
                break;
            }

            // KKT convergence: small residual plus EITHER a small
            // Newton step (tight quadratic-rate convergence, lets β
            // polish to machine precision), confirmed stagnation
            // (`accepted_step_inf <= step_tol` AND `objective_change
            // <= objective_tol`, the rank-deficient null-mode case),
            // OR a stricter stationarity certificate where both the
            // residual and objective change are an additional factor of
            // `inner_tol` below their scale-aware tolerances. The last
            // branch is deliberately stricter than the public tolerance:
            // it handles machine-precision null directions where β can
            // still move by about `step_tol` but the KKT residual and
            // objective are already over-polished. Using objective
            // stagnation alone is not sufficient; the residual guard is
            // what preserves first-order correctness.
            let superconverged_residual_tol = inner_tol * residual_tol;
            let superconverged_objective_tol = inner_tol * objective_tol;
            let superconverged_stationarity = residual <= superconverged_residual_tol
                && objective_change <= superconverged_objective_tol;
            if residual <= residual_tol
                && (step_inf <= step_tol
                    || (accepted_step_inf <= step_tol && objective_change <= objective_tol)
                    || superconverged_stationarity)
            {
                log::info!(
                    "[JN-EXIT] cycle={cycle} reason=plateau_objective_flat residual={residual:.3e} residual_tol={residual_tol:.3e} obj_change={objective_change:.3e} objective_tol={objective_tol:.3e} consecutive_flat={} accepted_step_inf={accepted_step_inf:.3e} step_tol={step_tol:.3e}",
                    obj_flat_streak.streak(),
                );
                converged = true;
                break;
            }
            obj_flat_streak.note(objective_change <= objective_tol);
            // Carry the KKT-stationarity / objective-stagnation signals
            // into the next cycle so the line-search-failure path above
            // can recognise a true KKT optimum on a rank-deficient null
            // mode. See that path for the full rationale.
            last_cycle_residual_below_tol = residual <= residual_tol;
            last_cycle_obj_change_below_tol = objective_change <= objective_tol;

            // NOTE: there is deliberately NO wall-clock-driven "adaptive
            // early-exit" here. A convergence verdict that fires when a cycle's
            // wall-clock happens to fall below a fraction of a running EMA is
            // non-deterministic — under CPU contention (a parallel sweep) the
            // same fit accepts at a different iterate than it does run alone,
            // which cascades into a different outer seed and a different
            // continuation-pre-warm fire/collapse decision (gam#979's
            // "collapses sequentially, fires in parallel" instability). It also
            // accepts iterates up to 10× outside the real KKT/objective
            // tolerance, biasing the REML/LAML criterion the inner residual
            // feeds. Convergence is certified ONLY by the mathematical tests
            // above (KKT residual / Newton step / objective change at their
            // scale-aware tolerances); whether convergence is *reachable within
            // the cycle budget* is judged by the deterministic descent-rate
            // guard alongside the residual-stall detector above.
        }

        // Explicit terminal verdict for the joint-Newton inner solve.
        //
        // The per-cycle `[PIRLS/JN] cyc=N/MAX … resid=… (tol=…)` line prints
        // the KKT/step/objective gaps at every cycle but never states which
        // criterion *terminated* the loop, so the final visible line on a
        // budget-exhausted solve looks identical to an ordinary mid-run cycle
        // (gam#744). A reader scanning a sweep log cannot tell a fit that
        // reached a stationary point from one that simply ran out of cycles
        // with the residual still orders of magnitude above tolerance and only
        // the objective stalled. Emit one authoritative line, on every exit
        // path, naming the terminating condition: `converged` is the honest
        // status the result carries downstream, `budget_exhausted` distinguishes
        // "ran the full cap" from an early certificate/divergence exit, and the
        // last-cycle residual / objective-change below-tolerance flags say *why*.
        // Every non-converged exit (budget-exhausted or early) is logged at WARN
        // so it is impossible to miss even when per-cycle INFO is filtered out; a
        // clean convergence is INFO.
        {
            let budget_exhausted = cycles_done >= inner_max_cycles;
            let terminator = if converged {
                "KKT/certificate-converged"
            } else if budget_exhausted {
                "budget-exhausted (max cycles reached)"
            } else {
                "early-exit non-converged (divergence/stall guard)"
            };
            // `solve_wall` (whole inner-solve elapsed) + `cycles` make the
            // per-solve cost explicit on ONE line: gam#979's "outer
            // multiplication" candidate is read off by counting these terminal
            // lines across a repro and summing their wall-times, and the
            // overhead candidate by comparing `solve_wall / cycles` against the
            // [joint-newton-tr] phase splits. Together with the per-cycle
            // `per_block_resid` (which block stalls) and the existing TR line
            // (ρ gain-ratio + decision: model infidelity vs TR throttling), a
            // single RUST_LOG=info run separates all four #979 candidates.
            let verdict = format!(
                "[PIRLS/joint-Newton terminal] converged={} terminator={} cycles={}/{} \
                 solve_wall={:.3}s best_residual_inf={:.3e} (tol={:.3e}) last_residual_below_tol={} \
                 last_obj_change_below_tol={} objective={:.6e}; this is the status the inner \
                 solve reports to the outer REML/LAML evaluation — a non-converged exit \
                 (residual ≫ tol with only the objective stalled) is rejected, not accepted",
                converged,
                terminator,
                cycles_done,
                inner_max_cycles,
                inner_started.elapsed().as_secs_f64(),
                best_residual_seen,
                last_residual_tol,
                last_cycle_residual_below_tol,
                last_cycle_obj_change_below_tol,
                lastobjective,
            );
            if converged {
                log::info!("{verdict}");
            } else {
                log::warn!("{verdict}");
            }
        }

        // If joint Newton converged, skip the blockwise loop entirely.
        if converged {
            let penalty_value = total_quadratic_penalty(
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            let (block_logdet_h, block_logdet_s) = blockwise_logdet_terms_with_workspace(
                family,
                specs,
                &mut states,
                block_log_lambdas,
                options,
                cached_joint_workspace.clone(),
            )?;
            // The IFT/outer KKT residual must be the AUGMENTED stationarity
            // `∇L − Sβ + ∇Φ` the inner Newton actually drove to zero — NOT the bare
            // `∇L − Sβ`. With the Firth term armed, `∇L − Sβ = −∇Φ` at the
            // converged β, so the bare residual's null-space component equals ∇Φ
            // (O(‖∇Φ‖), e.g. 2.49 for the coupled Dirichlet). The outer evaluator's
            // range-projected IFT validity gate (`projected_into_reduced_range`)
            // then sees that ‖∇Φ‖ of "unresolved mass outside the reduced range"
            // and rejects EVERY seed at outer startup validation ("no candidate
            // seeds passed", gam#729/#715). Folding ∇Φ into the gradient makes the
            // residual the genuinely-near-zero augmented stationarity the inner
            // certified, so the gate passes. No-op when the term is
            // condition-gated/unavailable (∇Φ=0).
            let augmented_joint_gradient: Option<Array1<f64>> = match (
                cached_joint_gradient.as_ref(),
                joint_jeffreys_subspace.as_ref(),
            ) {
                (Some(gradient), Some(z_joint)) => {
                    match custom_family_joint_jeffreys_term(
                        family, &states, specs, &ranges, z_joint,
                    )? {
                        Some((_phi, grad_phi, _hphi)) if grad_phi.len() == gradient.len() => {
                            Some(gradient + &grad_phi)
                        }
                        _ => None,
                    }
                }
                _ => None,
            };
            let ift_gradient = augmented_joint_gradient
                .as_ref()
                .or(cached_joint_gradient.as_ref());
            let kkt_residual = exact_newton_joint_kkt_residual_for_ift_from_cached_gradient(
                family,
                specs,
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                Some(cached_active_sets.as_slice()),
                ift_gradient,
            )?;
            let kkt_residual =
                require_projected_kkt_residual(kkt_residual, "joint-Newton converged exit")?;
            // Thread the cert tolerance + free subspace rank through to
            // the unified evaluator's certificate so the outer
            // optimiser's InnerStatus carrier sees honest numbers
            // instead of NaN / None.
            let active_set_rows_total: usize = cached_active_sets
                .iter()
                .map(|maybe| maybe.as_ref().map(|v| v.len()).unwrap_or(0))
                .sum();
            let free_rank_at_cert = total_p.saturating_sub(active_set_rows_total);
            let kkt_residual = kkt_residual.with_metadata(last_residual_tol, free_rank_at_cert);
            // Build the joint active-constraint block for the unified
            // evaluator's constraint-aware kernel
            // `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. Returns `None` when
            // the family has no declared inequality constraints, or when
            // no rows are currently active at the cert point; in either
            // case the consumer-side `with_active_constraints` helper
            // degrades back to the bare penalty-projected pseudo-inverse.
            let active_constraints = {
                let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
                assemble_active_constraint_block(
                    &block_constraints,
                    &cached_active_sets,
                    &ranges,
                    total_p,
                )
                .map(std::sync::Arc::new)
            };
            return Ok(BlockwiseInnerResult {
                block_states: states,
                active_sets: normalize_active_sets(cached_active_sets),
                log_likelihood: current_log_likelihood,
                penalty_value,
                cycles: cycles_done,
                converged,
                block_logdet_h,
                block_logdet_s,
                s_lambdas,
                joint_workspace: cached_joint_workspace.clone(),
                kkt_residual: Some(kkt_residual),
                active_constraints,
            });
        }
        if cycles_done >= inner_max_cycles {
            if !converged {
                // Engine-level diagnostic. Emit measured quantities only:
                // objective movement, coefficient scale, per-block dimensions,
                // per-block β and gradient scales, the unprojected stationarity
                // norm at exit, the Hessian source shape, and the last accepted
                // Newton identity diagnostics. The outer error path has no
                // access to these internals, so this line is the complete
                // numerical record needed to decide the next fix.
                let block_grad_norms: Vec<f64> = match cached_joint_gradient.as_ref() {
                    Some(joint_grad) => {
                        let mut acc = 0usize;
                        states
                            .iter()
                            .map(|s| {
                                let n = s.beta.len();
                                let end = (acc + n).min(joint_grad.len());
                                let nrm = if acc < end {
                                    joint_grad
                                        .slice(ndarray::s![acc..end])
                                        .iter()
                                        .map(|x: &f64| x.abs())
                                        .fold(0.0_f64, f64::max)
                                } else {
                                    f64::NAN
                                };
                                acc += n;
                                nrm
                            })
                            .collect()
                    }
                    None => vec![f64::NAN; states.len()],
                };
                let block_widths: Vec<usize> = states.iter().map(|s| s.beta.len()).collect();
                let block_beta_inf: Vec<f64> = states
                    .iter()
                    .map(|s| s.beta.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max))
                    .collect();
                let descent_total = initial_joint_objective - lastobjective;
                let beta_inf_final = states
                    .iter()
                    .flat_map(|s| s.beta.iter().copied())
                    .map(f64::abs)
                    .fold(0.0_f64, f64::max);
                let block_diag_default =
                    !family.exact_newton_joint_hessian_beta_dependent() && specs.len() >= 2;
                let exit_unprojected_kkt_inf = cached_joint_gradient
                    .as_ref()
                    .and_then(|joint_grad| {
                        exact_newton_joint_stationarity_vector_from_gradient(
                            joint_grad,
                            &states,
                            specs,
                            &s_lambdas,
                            ridge,
                            options.ridge_policy,
                        )
                        .ok()
                    })
                    .map(|residual| {
                        residual
                            .iter()
                            .map(|x: &f64| x.abs())
                            .fold(0.0_f64, f64::max)
                    })
                    .unwrap_or(f64::NAN);
                let last_math_summary = last_joint_math
                    .as_ref()
                    .map(|math| {
                        format!(
                            "last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
                            math.old_kkt_inf,
                            math.linearized_next_kkt_inf,
                            math.actual_reduction,
                            math.predicted_reduction,
                            math.trust_ratio,
                            math.scalar_model_relative_error(),
                            math.step_inf,
                            math.proposal_inf,
                        )
                    })
                    .unwrap_or_else(|| "last_newton_math=<none>".to_string());
                log::warn!(
                    "[PIRLS/joint-Newton] cycle={} budget-exhausted without KKT: objective_start={:.6e} objective_end={:.6e} objective_drop={:+.3e} beta_inf={:.3e} exit_unprojected_kkt_inf={:.3e} total_p={} total_n={} block_widths={:?} block_beta_inf={:?} block_grad_inf={:?} block_diag_hessian_default={} {}; rejecting this outer REML/LAML evaluation",
                    cycles_done,
                    initial_joint_objective,
                    lastobjective,
                    descent_total,
                    beta_inf_final,
                    exit_unprojected_kkt_inf,
                    total_p,
                    total_joint_n,
                    block_widths,
                    block_beta_inf,
                    block_grad_norms,
                    block_diag_default,
                    last_math_summary,
                );
                if coupled_exact_joint_required {
                    // Budget-exhaustion error MUST carry `block_residual_inf=…`
                    // so the carrying block survives the bubble through the
                    // outer optimiser. If no in-cycle cert refusal produced
                    // a structured report we build one here from the cached
                    // joint gradient + states. `joint_hessian_source` is
                    // per-cycle so the H_pen spectrum fields degrade to
                    // NaN/empty; per-block residual data is fully present.
                    let block_diag = if let Some(report) = last_kkt_refusal_report.as_ref() {
                        report.format_bubbled_error()
                    } else {
                        let block_constraints =
                            collect_block_linear_constraints(family, &states, specs)?;
                        let report = compute_kkt_refusal_report(
                            cycles_done,
                            &states,
                            specs,
                            &s_lambdas,
                            &ranges,
                            cached_joint_gradient.as_ref(),
                            &cached_active_sets,
                            &block_constraints,
                            None,
                            total_p,
                            ridge,
                            options.ridge_policy,
                            f64::NAN,
                            f64::NAN,
                            f64::NAN,
                            last_residual_tol,
                            f64::NAN,
                            f64::NAN,
                            f64::NAN,
                            exit_unprojected_kkt_inf,
                            last_joint_math.as_ref(),
                        );
                        report.format_bubbled_error()
                    };
                    return Err(format!(
                        "coupled exact-joint inner solve exhausted the joint Newton budget without KKT convergence after {cycles_done} cycle(s) — {block_diag}"
                    ));
                }
            }
            let penalty_value = total_quadratic_penalty(
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            let (block_logdet_h, block_logdet_s) = blockwise_logdet_terms_with_workspace(
                family,
                specs,
                &mut states,
                block_log_lambdas,
                options,
                cached_joint_workspace.clone(),
            )?;
            let active_constraints = {
                let local_ranges = block_param_ranges(specs);
                let local_total_p = local_ranges.last().map(|(_, end)| *end).unwrap_or(0);
                let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
                assemble_active_constraint_block(
                    &block_constraints,
                    &cached_active_sets,
                    &local_ranges,
                    local_total_p,
                )
                .map(std::sync::Arc::new)
            };
            return Ok(BlockwiseInnerResult {
                block_states: states,
                active_sets: normalize_active_sets(cached_active_sets),
                log_likelihood: current_log_likelihood,
                penalty_value,
                cycles: cycles_done,
                converged,
                block_logdet_h,
                block_logdet_s,
                s_lambdas,
                joint_workspace: cached_joint_workspace.clone(),
                kkt_residual: None,
                active_constraints,
            });
        }
        if coupled_exact_joint_required {
            // Bubble the structured KKT refusal report (per-block residual
            // breakdown + H_pen spectrum + diagnosis) so the cause of the
            // refusal survives serialization through the outer optimizer,
            // the seed-validation cascade, and gamfit. When the cert refused
            // inside the cycle loop we already computed a `KktRefusalReport`
            // at the refusing iterate; reuse it verbatim. If a different
            // early-exit path reaches this branch with no stored report, the
            // error carries a fixed "structured KKT refusal report
            // unavailable" message instead; unlike the budget-exhaustion
            // branch above, no report is rebuilt here.
            let block_diag = last_kkt_refusal_report
                .as_ref()
                .map(KktRefusalReport::format_bubbled_error)
                .unwrap_or_else(|| {
                    "structured KKT refusal report unavailable: no joint Newton math snapshot"
                        .to_string()
                });
            return Err(format!(
                "coupled exact-joint inner solve exited the joint Newton path before convergence — {block_diag}"
            ));
        }
        // Otherwise fall through to blockwise iteration below.
    }

    let mut cached_eval = match cached_eval {
        Some(eval) => eval,
        None => family.evaluate(&states)?,
    };
    lastobjective = -cached_eval.log_likelihood + current_penalty;

    // Divergence-detection state for the blockwise loop.
    //
    // Some family parameterizations (e.g. BernoulliMarginalSlopeFamily with
    // linkwiggle + scorewarp) carry a near-null direction in the joint
    // Hessian when the link-deviation basis's empirical anchor — fixed at
    // the rigid-pilot η₀ when the basis is constructed — drifts during
    // PIRLS as the location/spatial blocks update η₀. The Newton step
    // becomes dominated by that null direction and is clamped at
    // MAX_NEWTON_STEP every cycle while β grows linearly along it; the
    // log-likelihood stays frozen, only the penalty changes (slowly).
    // Without an early-exit the loop runs to inner_max_cycles producing
    // the same -loglik over and over, which at large scale (each cycle
    // ~0.5s) burns ~50s per ρ-cost call and stacks up to a 2400s timeout.
    //
    // Detect the pattern and bail with `converged = false` so the cost
    // call returns Err / +∞, BFGS κ-optim backs off the divergent ρ
    // region, and the outer loop progresses instead of grinding.

    // Per-block trust-region radius in the block's penalized-Hessian metric.
    // Updated each cycle by `update_joint_trust_region_radius` (the same
    // function the joint-Newton path uses) on a real model-vs-truth rho
    // computed from each block's penalized quadratic. Using the curvature
    // metric here avoids the same starvation mechanism fixed in the joint
    // path: one near-null coordinate in a block must not raw-rescale every
    // other coordinate in that block. The η-overflow safety half of the
    // previous static `MAX_NEWTON_STEP = 20.0` is owned by the family's
    // `max_feasible_step_size` barrier check, called by the line search below;
    // this variable handles only the algorithmic trust-region half. The
    // initial seed value is the family-declared safe step for a fresh fit; the
    // function then adapts it freely (clamped to [1e-12, 1e6] by the function
    // itself, same as the joint path).
    const BLOCK_NEWTON_STEP_INITIAL: f64 = 20.0;
    let mut block_max_step: Vec<f64> = vec![BLOCK_NEWTON_STEP_INITIAL; specs.len()];

    let mut prev_log_likelihood_for_divergence_check = cached_eval.log_likelihood;
    // Frozen-loglik streak rides the shared window discipline
    // (loop_guard::FlatStreak, #968); the frozen-loglik predicate and the
    // clamped-step side condition below stay local — they are policy about
    // what counts as flat, which this loop rightly owns.
    let mut frozen_loglik_streak =
        crate::solver::loop_guard::FlatStreak::new(DIVERGENCE_FROZEN_LOGLIK_CYCLES);
    // Coordinate descent visits each block in turn, so `max_proposed_step`
    // (the per-cycle max across blocks) only fires the cap on cycles where
    // the divergent block is the active one. On a near-null direction this
    // produces an alternation pattern (e.g. cap, cap, small, cap, small,
    // cap, …) and a strict "consecutive cycles where step is clamped"
    // requirement resets the counter every time another block's smaller
    // step dominates the per-cycle maximum. The frozen-loglik signal,
    // however, is a property of the joint state — it stays true across
    // every cycle of the alternation. Track frozen-loglik consecutively
    // and require that `step_clamped` was observed AT LEAST ONCE inside
    // the frozen run (rather than EVERY cycle).
    let mut clamped_step_in_frozen_run: bool = false;
    const DIVERGENCE_FROZEN_LOGLIK_CYCLES: usize = 8;

    let is_dynamic = family.block_geometry_is_dynamic();
    // EMA of per-cycle wall-clock for timing-driven adaptive early-exit (#289).
    // α = 0.3 gives a short memory (~3 cycles) so the EMA tracks recent cost.
    let mut ema_cycle_secs: Option<f64> = None;
    // Initial objective for the grad-ratio predicate.
    let initial_objective = lastobjective;
    for cycle in 0..inner_max_cycles {
        let cycle_start = std::time::Instant::now();
        // Fires at the top of each blockwise coordinate cycle so we can count
        // iterations from CI logs when a benchmark hangs inside the first
        // outer-eval. Emitted at info-level: same rationale as the joint-Newton
        // sibling above — silent-grind diagnosis without debug logs.
        log::info!(
            "[PIRLS/blockwise coord] cycle {:>3}/{} | -loglik {:.6e} | penalty {:.6e} | objective {:.6e}",
            cycle,
            inner_max_cycles,
            -cached_eval.log_likelihood,
            current_penalty,
            lastobjective,
        );
        let mut max_proposed_beta_step = 0.0_f64;
        let mut max_accepted_beta_step = 0.0_f64;
        let mut trust_boundary_hit_in_cycle = false;

        let mut objective_cycle_prev = lastobjective;
        // Reuse cached evaluation from end of previous cycle (or initial eval).
        // For dynamic families, the end-of-cycle evaluation is also reused here
        // instead of re-evaluating redundantly — the state hasn't changed since
        // the last cycle's final evaluate.
        let mut cycle_eval = std::mem::replace(
            &mut cached_eval,
            FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: Vec::new(),
            },
        );
        if cycle_eval.blockworking_sets.len() != specs.len() {
            return Err(format!(
                "family returned {} block working sets, expected {}",
                cycle_eval.blockworking_sets.len(),
                specs.len()
            ));
        }
        // Track whether any block was modified this cycle (for dynamic families,
        // we only need to re-evaluate before block b if a previous block changed).
        let mut any_block_modified = false;
        for b in 0..specs.len() {
            if is_dynamic && any_block_modified {
                // Only re-evaluate if a previous block in this cycle actually
                // modified coefficients. Until then (always for b=0) the
                // evaluation moved out of `cached_eval` into `cycle_eval` at
                // the top of the cycle is still valid, so the redundant
                // evaluate is skipped.
                refresh_all_block_etas(family, specs, &mut states)?;
                cycle_eval = family.evaluate(&states)?;
                if cycle_eval.blockworking_sets.len() != specs.len() {
                    return Err(format!(
                        "family returned {} block working sets, expected {}",
                        cycle_eval.blockworking_sets.len(),
                        specs.len()
                    ));
                }
            }

            let spec = &specs[b];
            let work = &cycle_eval.blockworking_sets[b];
            let linear_constraints = family.block_linear_constraints(&states, b, spec)?;
            let s_lambda = &s_lambdas[b];
            let updater = work.updater();
            let update = updater.compute_update_step(&BlockUpdateContext {
                family,
                states: &states,
                spec,
                block_idx: b,
                s_lambda,
                options,
                linear_constraints: linear_constraints.as_ref(),
                cached_active_set: cached_active_sets[b].as_deref(),
            })?;
            if let Some(active_set) = update.active_set {
                cached_active_sets[b] = Some(active_set);
            }
            let beta_new_raw = update.beta_new_raw;
            let beta_new = family.post_update_block_beta(&states, b, spec, beta_new_raw.clone())?;
            reject_constrained_post_update_repair(
                b,
                spec,
                &beta_new_raw,
                &beta_new,
                linear_constraints.as_ref(),
            )?;
            let beta_old = states[b].beta.clone();
            let raw_delta = &beta_new - &beta_old;
            // Per-block trust-region radius in the block's local
            // penalized-Hessian metric. The cap is the current value of
            // `block_max_step[b]`, updated below via
            // `update_joint_trust_region_radius` once we know rho.
            let block_cap = block_max_step[b];
            let (delta, step_metric_norm) = truncate_block_step_to_metric_radius(
                spec,
                work,
                s_lambda,
                raw_delta,
                block_cap,
                ridge,
                options.ridge_policy,
            )?;
            let step_hit_trust_boundary =
                joint_block_step_hit_trust_boundary(step_metric_norm, block_cap);
            trust_boundary_hit_in_cycle |= step_hit_trust_boundary;
            // Capture the objective at the start of this block update so
            // we can compute the true `actual_reduction` once the line
            // search has finished. `objective_cycle_prev` is the running
            // total: it advances inside the line search whenever a trial
            // is accepted, so we must snapshot it here.
            let obj_before_block = objective_cycle_prev;
            let old_block_penalty =
                block_quadratic_penalty(&beta_old, s_lambda, ridge, options.ridge_policy);
            let step_beta_inf = delta.iter().copied().map(f64::abs).fold(0.0, f64::max);
            max_proposed_beta_step = max_proposed_beta_step.max(step_beta_inf);
            if step_beta_inf <= inner_tol {
                continue;
            }

            // Damped update: require non-increasing penalized objective under dynamic geometry.
            // Precompute X * delta once so line-search eta updates are O(n) not O(np).
            // Reuse pre-allocated eta backup to avoid O(n) allocation per block per cycle.
            let eta_checkpoint = BlockEtaCheckpoint::capture_reuse(&states[b], &mut eta_backups[b]);
            let x_delta = if !is_dynamic {
                Some(spec.solver_design().matrixvectormultiply(&delta))
            } else {
                None
            };
            let mut accepted = false;
            // Barrier-aware step ceiling: families with natural log-barrier
            // terms (e.g. log(h') in transformation-normal) report the maximum
            // feasible step fraction so the line search never evaluates the
            // likelihood outside its domain.
            let barrier_ceiling = family
                .max_feasible_step_size(&states, b, &delta)?
                .unwrap_or(1.0);
            // Reuse trial_beta_buf to avoid allocation per backtracking trial.
            let mut trial_beta_buf = beta_old.clone();
            let mut accepted_bt: usize = usize::MAX;
            for bt in 0..8 {
                let alpha = (0.5f64.powi(bt)).min(barrier_ceiling);
                trial_beta_buf.assign(&beta_old);
                trial_beta_buf.scaled_add(alpha, &delta);
                let trial_beta =
                    family.post_update_block_beta(&states, b, spec, trial_beta_buf.clone())?;
                reject_constrained_post_update_repair(
                    b,
                    spec,
                    &trial_beta_buf,
                    &trial_beta,
                    linear_constraints.as_ref(),
                )?;
                states[b].beta = trial_beta;
                // Use precomputed X*delta when geometry is static and beta wasn't modified.
                if let Some(ref xd) = x_delta {
                    if states[b].beta == trial_beta_buf {
                        eta_checkpoint.restore_eta_with_step(&mut states[b], alpha, xd);
                    } else {
                        refresh_single_block_eta(family, specs, &mut states, b)?;
                    }
                } else {
                    refresh_single_block_eta(family, specs, &mut states, b)?;
                }
                let trial_block_penalty =
                    block_quadratic_penalty(&states[b].beta, s_lambda, ridge, options.ridge_policy);
                let trial_penalty = current_penalty - old_block_penalty + trial_block_penalty;
                let line_search_options = coefficient_line_search_options(
                    options,
                    objective_cycle_prev - trial_penalty + 1e-10,
                );
                let trial_ll =
                    match family.log_likelihood_only_with_options(&states, &line_search_options) {
                        Ok(value) => value,
                        Err(_) => {
                            states[b].beta.assign(&beta_old);
                            eta_checkpoint.restore_eta(&mut states[b]);
                            continue;
                        }
                    };
                let trialobjective = -trial_ll + trial_penalty;
                if trialobjective.is_finite() && trialobjective <= objective_cycle_prev + 1e-10 {
                    objective_cycle_prev = trialobjective;
                    current_penalty = trial_penalty;
                    accepted = true;
                    accepted_bt = bt as usize;
                    break;
                }
            }
            // Trust-region update for this block, using the same
            // `update_joint_trust_region_radius` strategy the
            // joint-Newton path uses. Predicted reduction is computed
            // from the per-block penalized quadratic model:
            //
            //   Q(β + αδ) ≈ Q(β) − α·rhs·δ + 0.5·α²·δ·H_pen·δ
            //   predicted_reduction(α) = α·(rhs·δ) − 0.5·α²·(δ·H_pen·δ)
            //
            // where `rhs = score − S·β (− ridge·β)` is the penalized
            // gradient (in maximize-direction) and `H_pen = H + S
            // (+ ridge·I)` is the penalized observed information.
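            // Actual reduction is the true penalized objective change
            // measured by the line search; rho = actual / predicted is
            // the standard model-vs-truth ratio that drives the same
            // 0.25 / 0.75 grow-shrink rules `update_joint_trust_region_radius`
            // already implements for the joint path.
            //
            // NOTE(review): `alpha_accepted` below is reconstructed as
            // 0.5^bt, but the line search actually tried
            // min(0.5^bt, barrier_ceiling). When the barrier ceiling binds,
            // the predicted reduction and step norm passed to the trust
            // update describe a longer step than the one accepted — confirm
            // whether the clamped alpha should be used instead.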
            let alpha_accepted = if accepted {
                0.5_f64.powi(accepted_bt as i32)
            } else {
                0.0
            };
            let (rhs_block, hpen_delta_full): (Array1<f64>, Array1<f64>) = match work {
                BlockWorkingSet::ExactNewton { gradient, .. } => {
                    let mut rhs = gradient - &s_lambda.dot(&beta_old);
                    if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                        rhs.scaled_add(-ridge, &beta_old);
                    }
                    let hpen = block_penalized_hessian_vector(
                        spec,
                        work,
                        s_lambda,
                        &delta,
                        ridge,
                        options.ridge_policy,
                    );
                    (rhs, hpen)
                }
                BlockWorkingSet::Diagonal {
                    working_response,
                    working_weights,
                } => {
                    // IRLS local-quadratic gradient and Hessian:
                    //   rhs = X^T W (z − Xβ) − Sβ (− ridge·β when the ridge
                    //         policy includes the quadratic penalty)
                    //   H_pen δ = X^T W X δ + Sδ
                    let solver_design = spec.solver_design();
                    let xb = solver_design.matrixvectormultiply(&beta_old);
                    let resid = working_response - &xb;
                    let w_resid = &resid * working_weights;
                    let mut rhs = solver_design.transpose_vector_multiply(&w_resid);
                    rhs -= &s_lambda.dot(&beta_old);
                    if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                        rhs.scaled_add(-ridge, &beta_old);
                    }
                    let hpen = block_penalized_hessian_vector(
                        spec,
                        work,
                        s_lambda,
                        &delta,
                        ridge,
                        options.ridge_policy,
                    );
                    (rhs, hpen)
                }
            };
            let rhs_dot_delta = rhs_block.dot(&delta);
            let delta_dot_hpen = delta.dot(&hpen_delta_full);
            let predicted_reduction = alpha_accepted * rhs_dot_delta
                - 0.5 * alpha_accepted * alpha_accepted * delta_dot_hpen;
            let actual_reduction = obj_before_block - objective_cycle_prev;
            let trust_update = update_joint_trust_region_radius(
                block_max_step[b],
                alpha_accepted * step_metric_norm,
                actual_reduction,
                predicted_reduction,
                obj_before_block,
            );
            block_max_step[b] = trust_update.radius;
            if !accepted {
                states[b].beta.assign(&beta_old);
                eta_checkpoint.restore_eta(&mut states[b]);
                if let BlockWorkingSet::ExactNewton { gradient, .. } = work {
                    let mut raw_descent = gradient - &s_lambda.dot(&beta_old);
                    if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                        raw_descent -= &beta_old.mapv(|v| ridge * v);
                    }
                    let (descent_dir, descent_metric_norm) = truncate_block_step_to_metric_radius(
                        spec,
                        work,
                        s_lambda,
                        raw_descent,
                        block_cap,
                        ridge,
                        options.ridge_policy,
                    )?;
                    trust_boundary_hit_in_cycle |=
                        joint_block_step_hit_trust_boundary(descent_metric_norm, block_cap);
                    let dir_norm = descent_dir.iter().fold(0.0_f64, |m, &v| m.max(v.abs()));
                    if dir_norm > inner_tol {
                        // Precompute X * descent_dir once for incremental eta updates.
                        let x_descent = if !is_dynamic {
                            Some(spec.solver_design().matrixvectormultiply(&descent_dir))
                        } else {
                            None
                        };
                        let descent_barrier_ceiling = family
                            .max_feasible_step_size(&states, b, &descent_dir)?
                            .unwrap_or(1.0);
                        for bt in 0..12 {
                            let alpha = (0.5f64.powi(bt)).min(descent_barrier_ceiling);
                            trial_beta_buf.assign(&beta_old);
                            trial_beta_buf.scaled_add(alpha, &descent_dir);
                            let trial_beta = family.post_update_block_beta(
                                &states,
                                b,
                                spec,
                                trial_beta_buf.clone(),
                            )?;
                            reject_constrained_post_update_repair(
                                b,
                                spec,
                                &trial_beta_buf,
                                &trial_beta,
                                linear_constraints.as_ref(),
                            )?;
                            states[b].beta = trial_beta;
                            if let Some(ref xd) = x_descent {
                                if states[b].beta == trial_beta_buf {
                                    eta_checkpoint.restore_eta_with_step(&mut states[b], alpha, xd);
                                } else {
                                    refresh_single_block_eta(family, specs, &mut states, b)?;
                                }
                            } else {
                                refresh_single_block_eta(family, specs, &mut states, b)?;
                            }
                            let trial_block_penalty = block_quadratic_penalty(
                                &states[b].beta,
                                s_lambda,
                                ridge,
                                options.ridge_policy,
                            );
                            let trial_penalty =
                                current_penalty - old_block_penalty + trial_block_penalty;
                            let line_search_options = coefficient_line_search_options(
                                options,
                                objective_cycle_prev - trial_penalty + 1e-10,
                            );
                            let trial_ll = match family
                                .log_likelihood_only_with_options(&states, &line_search_options)
                            {
                                Ok(value) => value,
                                Err(_) => {
                                    states[b].beta.assign(&beta_old);
                                    eta_checkpoint.restore_eta(&mut states[b]);
                                    continue;
                                }
                            };
                            let trialobjective = -trial_ll + trial_penalty;
                            if trialobjective.is_finite()
                                && trialobjective <= objective_cycle_prev + 1e-10
                            {
                                objective_cycle_prev = trialobjective;
                                current_penalty = trial_penalty;
                                accepted = true;
                                break;
                            }
                            states[b].beta.assign(&beta_old);
                            eta_checkpoint.restore_eta(&mut states[b]);
                        }
                    }
                }
            }
            if !accepted {
                states[b].beta.assign(&beta_old);
                eta_checkpoint.restore_eta(&mut states[b]);
            } else {
                let accepted_step = states[b]
                    .beta
                    .iter()
                    .zip(beta_old.iter())
                    .map(|(new, old)| (new - old).abs())
                    .fold(0.0_f64, f64::max);
                max_accepted_beta_step = max_accepted_beta_step.max(accepted_step);
                any_block_modified = true;
            }
            // Recycle the checkpoint's buffer back into the pre-allocated pool.
            eta_backups[b] = eta_checkpoint.into_buffer();
        }

        // For non-dynamic families, incremental eta updates within the block loop
        // maintain correct etas. Only refresh from scratch for dynamic-geometry families
        // where block interactions may require recomputation.
        if is_dynamic {
            refresh_all_block_etas(family, specs, &mut states)?;
        }
        cached_eval = family.evaluate(&states)?;
        current_penalty = total_quadratic_penalty(
            &states,
            &s_lambdas,
            ridge,
            options.ridge_policy,
            joint_bundle,
            Some(specs),
        );
        let objective = -cached_eval.log_likelihood + current_penalty;
        let objective_change = (objective - lastobjective).abs();
        lastobjective = objective;
        cycles_done = cycle + 1;

        // Divergence guard (mirrors the joint-Newton sibling, gam#554): a
        // non-finite objective / log-likelihood means a near-unidentified
        // penalized block has propagated NaN mass through the coordinate
        // descent. Every convergence and divergence-frozen exit below is a
        // finite `<=` comparison that NaN silently defeats, so without this
        // the loop grinds the full `inner_max_cycles` on every outer ρ-eval
        // and startup seed. Break unconverged so the outer optimizer rejects
        // this point immediately instead of burning the budget.
        if !objective.is_finite() || !cached_eval.log_likelihood.is_finite() {
            log::warn!(
                "[PIRLS/blockwise convergence] cycle {:>3} | divergence guard: non-finite inner state (objective={:.3e}, -loglik={:.3e}); returning unconverged so the outer optimizer rejects this ρ evaluation instead of running to inner_max_cycles.",
                cycle,
                objective,
                -cached_eval.log_likelihood,
            );
            converged = false;
            break;
        }

        // Scale-aware tolerances — see the matching joint-Newton path
        // above for the rationale. At large scale absolute step/residual
        // tolerances against `inner_tol = 1e-6` keep this loop spinning
        // long after the objective has gone flat.
        let beta_inf = states
            .iter()
            .flat_map(|s| s.beta.iter().copied())
            .map(f64::abs)
            .fold(0.0_f64, f64::max);
        let step_tol = inner_tol * (1.0 + beta_inf);
        let objective_tol = inner_tol * (1.0 + objective.abs());
        let residual_tol = objective_tol;
        // For single-block models the blockwise iteration IS the joint
        // iteration, so block-conditional convergence implies joint
        // convergence.  The exact_newton_joint_stationarity check can
        // stall at ~10x the tolerance due to numerical differences
        // between the block-conditional and joint gradient formulations,
        // causing 100s of wasted cycles on an already-converged solution.
        let exact_joint_stationarity_ok = if has_joint_exacthessian && specs.len() >= 2 {
            exact_newton_joint_stationarity_inf_norm(
                family,
                specs,
                &cached_eval,
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                None,
            )?
            .map(|residual| residual <= residual_tol)
            .unwrap_or(true)
        } else {
            true
        };
        log::info!(
            "[PIRLS/blockwise convergence] cycle {:>3} | max_proposed_step={:.3e} (tol={:.3e}) | max_accepted_step={:.3e} | obj_change={:.3e} (tol={:.3e}) | beta_inf={:.3e} | joint_stationarity_ok={}",
            cycle,
            max_proposed_beta_step,
            step_tol,
            max_accepted_beta_step,
            objective_change,
            objective_tol,
            beta_inf,
            exact_joint_stationarity_ok,
        );

        // Divergence early-exit. See the rationale block at the top of
        // this loop. We treat "log-likelihood unchanged + Newton step
        // pinned at the trust-region cap" as a near-null direction
        // signature and break out unconverged once it persists for
        // DIVERGENCE_FROZEN_LOGLIK_CYCLES consecutive iterations. Tracking
        // log-likelihood (not objective) is essential: when the null mode
        // dominates, only the penalty drifts cycle-to-cycle, so
        // `objective_change` stays above tol while -loglik is genuinely
        // frozen.
        let loglik_change_for_divergence_check =
            (cached_eval.log_likelihood - prev_log_likelihood_for_divergence_check).abs();
        let loglik_frozen_tol_for_divergence_check =
            inner_tol * (1.0 + cached_eval.log_likelihood.abs());
        let step_clamped_for_divergence_check = trust_boundary_hit_in_cycle;
        let loglik_frozen =
            loglik_change_for_divergence_check <= loglik_frozen_tol_for_divergence_check;
        let frozen_verdict = frozen_loglik_streak.note(loglik_frozen);
        if loglik_frozen {
            if step_clamped_for_divergence_check {
                clamped_step_in_frozen_run = true;
            }
        } else {
            clamped_step_in_frozen_run = false;
        }
        prev_log_likelihood_for_divergence_check = cached_eval.log_likelihood;
        if frozen_verdict == crate::solver::loop_guard::LoopVerdict::Plateaued
            && clamped_step_in_frozen_run
        {
            log::warn!(
                "[PIRLS/blockwise convergence] divergence early-exit at cycle {} | -loglik={:.6e} frozen for {} consecutive cycles | max_proposed_step={:.3e} (trust-boundary hit observed in frozen run) | step_tol={:.3e}; near-null Hessian direction detected — returning unconverged so the outer optimizer backs off this region instead of running to inner_max_cycles.",
                cycle,
                -cached_eval.log_likelihood,
                frozen_loglik_streak.streak(),
                max_proposed_beta_step,
                step_tol,
            );
            converged = false;
            break;
        }

        // ── Timing-driven adaptive early-exit (#289) ────────────────────────
        // Mirror the EMA predicate from the PIRLS LM loop: when iterations
        // become trivially cheap AND the objective/step are near-stationary,
        // accept convergence rather than spinning to inner_max_cycles.
        // Only fires after ≥2 data points so the EMA is meaningful.
        let cycle_secs = cycle_start.elapsed().as_secs_f64();
        let ema = match ema_cycle_secs {
            None => cycle_secs,
            Some(prev) => 0.3 * cycle_secs + 0.7 * prev,
        };
        ema_cycle_secs = Some(ema);
        if cycle >= 2 {
            let cycle_cheap = ema > 0.0 && cycle_secs < 0.25 * ema;
            let f_abs = lastobjective.abs().max(1.0);
            let deviance_ok = (objective_change / f_abs) < inner_tol * 10.0;
            let step_ok = if initial_objective.abs() > 0.0 && objective_change.is_finite() {
                (objective_change / initial_objective.abs().max(1.0)) < inner_tol * 10.0
            } else {
                false
            };
            if cycle_cheap && deviance_ok && step_ok {
                log::info!(
                    "[PIRLS/blockwise] cycle {} timing-driven adaptive early-exit: \
                     cycle={:.4}s ema={:.4}s obj_rel={:.3e}",
                    cycle,
                    cycle_secs,
                    ema,
                    objective_change / f_abs,
                );
                converged = true;
                break;
            }
        }
        // ── end timing-driven adaptive early-exit ────────────────────────────

        if max_accepted_beta_step <= step_tol && objective_change <= objective_tol {
            if exact_joint_stationarity_ok || max_proposed_beta_step <= step_tol {
                converged = true;
            }
            break;
        }
    }

    // ── Polishing joint Newton step ──
    //
    // For block-coupled multi-block families (e.g. GAMLSS wiggle), Gauss-Seidel
    // blockwise iteration can reach step_inf < inner_tol while the joint KKT
    // residual (||Sβ − grad_ℓ||_∞) remains at ~10× inner_tol. This is because
    // each block is solved conditionally on other blocks' current values —
    // block-conditional stationarity does not imply joint stationarity when
    // the likelihood couples blocks off-diagonally.
    //
    // Once blockwise has placed β near the true joint optimum, a single (or
    // a few) damped joint Newton steps can tighten the joint residual to the
    // floor set by β magnitudes. This polishing phase is essential for the
    // outer REML gradient formula (which assumes exact β̂ stationarity); a
    // non-converged β̂ produces large envelope-theorem violations in the
    // analytic outer gradient.
    if use_joint_newton && !converged {
        let ranges_joint: Vec<(usize, usize)> = {
            let mut offset = 0;
            specs
                .iter()
                .map(|s| {
                    let start = offset;
                    offset += s.design.ncols();
                    (start, offset)
                })
                .collect()
        };
        let total_p_joint: usize = ranges_joint.last().map_or(0, |r| r.1);
        let joint_mode_diagonal_ridge =
            if ridge > 0.0 && options.ridge_policy.include_quadratic_penalty {
                ridge
            } else {
                0.0
            };
        let trace_diagonal_ridge = joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE;

        // Allow up to a few polishing steps. The blockwise endpoint is close
        // to optimum, so step sizes should be small and line search should
        // accept full steps quickly.
        const POLISH_MAX_ITER: usize = 16;
        for _polish_iter in 0..POLISH_MAX_ITER {
            // Re-evaluate at current β to get the joint gradient and Hessian.
            refresh_all_block_etas(family, specs, &mut states)?;
            let eval_for_polish = family.evaluate(&states)?;
            let grad_full =
                match exact_newton_joint_gradient_from_eval(&eval_for_polish, specs, &states)? {
                    Some(g) => g,
                    None => break,
                };
            // Spec-aware joint Hessian: canonical coupled-curvature source
            // (see the joint-Newton availability gate). Families overriding
            // only `_with_specs` return `None` from the spec-less default.
            let h_joint_opt = family.exact_newton_joint_hessian_with_specs(&states, specs)?;
            let Some(h_joint) = h_joint_opt else { break };
            let mut h_dense = match symmetrized_square_matrix(
                h_joint,
                total_p_joint,
                "joint polish Hessian shape mismatch",
            ) {
                Ok(matrix) => matrix,
                Err(_) => break,
            };
            add_joint_penalty_to_matrix(
                &mut h_dense,
                &ranges_joint,
                &s_lambdas,
                trace_diagonal_ridge,
                joint_bundle,
            );

            let mut beta_joint = Array1::<f64>::zeros(total_p_joint);
            for b in 0..specs.len() {
                let (start, end) = ranges_joint[b];
                beta_joint
                    .slice_mut(ndarray::s![start..end])
                    .assign(&states[b].beta);
            }
            let penalty_beta = apply_joint_block_penalty(
                &ranges_joint,
                &s_lambdas,
                &beta_joint,
                joint_mode_diagonal_ridge,
                joint_bundle,
            );
            let rhs = &grad_full - &penalty_beta;

            // Respect constraints that block line search on the boundary.
            // Gauss-Seidel blockwise leaves the joint KKT residual at a floor
            // around |λ_k S_k β̂| for boundary-active components. The residual
            // magnitude on FREE components is a better measure of whether we
            // should keep polishing: if β_i is clipped at the boundary and
            // KKT multiplier μ_i > 0, then rhs[i] is the multiplier, not a
            // free-space gradient violation.
            let block_constraints_now = collect_block_linear_constraints(family, &states, specs)?;
            let joint_constraints_now = assemble_joint_linear_constraints(
                &block_constraints_now,
                &ranges_joint,
                total_p_joint,
            )?;
            let mut active_mask: Vec<bool> = vec![false; total_p_joint];
            if let Some(ref constraints) = joint_constraints_now
                && let Ok(Some(bounds)) = extract_simple_lower_bounds(constraints, total_p_joint)
            {
                for (idx, (bound, beta_val)) in bounds
                    .lower_bounds
                    .iter()
                    .zip(beta_joint.iter())
                    .enumerate()
                {
                    if *bound > f64::NEG_INFINITY && (*beta_val - *bound).abs() < 1e-12 {
                        active_mask[idx] = true;
                    }
                }
            }
            let res_inf_free = rhs
                .iter()
                .zip(active_mask.iter())
                .filter(|(_, active)| !**active)
                .map(|(v, _)| v.abs())
                .fold(0.0_f64, f64::max);
            // Scale-aware residual tolerance — the joint stationarity
            // residual ‖∇ℓ − Sβ‖_∞ scales with |obj| (≈ O(n) at large
            // scale), so the historical absolute `inner_tol = 1e-6` is
            // unachievable here even at the true minimum. Same rationale
            // as the joint-Newton convergence test above.
            let polish_obj = -cached_eval.log_likelihood + current_penalty;
            let polish_residual_tol = inner_tol * (1.0 + polish_obj.abs());
            if res_inf_free <= polish_residual_tol {
                converged = true;
                break;
            }

            // Solve constrained Newton system if simple bounds are present,
            // else unconstrained.
            let delta = if let Some(ref constraints) = joint_constraints_now {
                let warm = flatten_joint_active_set(&cached_active_sets, &block_constraints_now);
                let lower_bounds_opt = extract_simple_lower_bounds(constraints, total_p_joint)
                    .ok()
                    .flatten();
                if let Some(bounds) = lower_bounds_opt.as_ref() {
                    match solve_quadratic_with_simple_lower_bounds(
                        &h_dense,
                        &rhs,
                        &beta_joint,
                        bounds,
                        warm.as_deref(),
                    ) {
                        Ok((beta_new, _active)) => &beta_new - &beta_joint,
                        Err(_) => break,
                    }
                } else {
                    match solve_quadratic_with_linear_constraints(
                        &h_dense,
                        &rhs,
                        &beta_joint,
                        constraints,
                        warm.as_deref(),
                    ) {
                        Ok((beta_new, _active)) => &beta_new - &beta_joint,
                        Err(_) => break,
                    }
                }
            } else {
                let solver = crate::linalg::utils::StableSolver::new("joint polish");
                match solver.solvevectorwithridge_retries(
                    &h_dense,
                    &rhs,
                    JOINT_TRACE_STABILITY_RIDGE,
                ) {
                    Some(d) => d,
                    None => break,
                }
            };
            if !delta.iter().all(|v| v.is_finite()) {
                break;
            }
            // Keep polishing until the free-space joint residual is small; a
            // tiny delta alone is not a certificate of stationarity.
            // Damped line search with projection.
            let old_states: Vec<ParameterBlockState> = states.clone();
            let old_obj = -eval_for_polish.log_likelihood + current_penalty;
            let mut accepted_polish = false;
            for bt in 0..10 {
                let alpha = 0.5f64.powi(bt);
                for b in 0..specs.len() {
                    let (start, end) = ranges_joint[b];
                    let mut trial_beta = old_states[b].beta.clone();
                    trial_beta.scaled_add(alpha, &delta.slice(ndarray::s![start..end]));
                    let projected = family.post_update_block_beta(
                        &old_states,
                        b,
                        &specs[b],
                        trial_beta.clone(),
                    )?;
                    reject_constrained_post_update_repair(
                        b,
                        &specs[b],
                        &trial_beta,
                        &projected,
                        block_constraints_now[b].as_ref(),
                    )?;
                    states[b].beta.assign(&projected);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                let trial_ll = match family.log_likelihood_only(&states) {
                    Ok(v) => v,
                    Err(_) => {
                        for (b, s) in old_states.iter().enumerate() {
                            states[b] = s.clone();
                        }
                        refresh_all_block_etas(family, specs, &mut states)?;
                        continue;
                    }
                };
                let trial_penalty = total_quadratic_penalty(
                    &states,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    joint_bundle,
                    Some(specs),
                );
                let trial_obj = -trial_ll + trial_penalty;
                if trial_obj.is_finite() && trial_obj <= old_obj + 1e-12 {
                    current_penalty = trial_penalty;
                    cached_eval = family.evaluate(&states)?;
                    accepted_polish = true;
                    break;
                }
            }
            if !accepted_polish {
                // Restore and stop polishing.
                for (b, s) in old_states.iter().enumerate() {
                    states[b] = s.clone();
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                break;
            }
        }
    }

    // Reuse cached evaluation from the last cycle's end (or the initial eval if 0 cycles ran).
    let penalty_value = total_quadratic_penalty(
        &states,
        &s_lambdas,
        ridge,
        options.ridge_policy,
        joint_bundle,
        Some(specs),
    );

    let (block_logdet_h, block_logdet_s) =
        blockwise_logdet_terms(family, specs, &mut states, block_log_lambdas, options)?;
    let kkt_residual = if converged {
        match exact_newton_joint_gradient_from_eval(&cached_eval, specs, &states)? {
            Some(gradient) => {
                let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
                let local_total_p: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
                let active_set_rows_total: usize = cached_active_sets
                    .iter()
                    .map(|maybe| maybe.as_ref().map(|v| v.len()).unwrap_or(0))
                    .sum();
                let free_rank_at_cert = local_total_p.saturating_sub(active_set_rows_total);
                exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
                    &gradient,
                    specs,
                    &states,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    &block_constraints,
                    Some(cached_active_sets.as_slice()),
                )?
                .map(|r| r.with_metadata(last_residual_tol, free_rank_at_cert))
            }
            None => None,
        }
    } else {
        // Inner did not converge; no caller should trust an IFT correction
        // at a non-KKT iterate.
        None
    };

    let active_constraints = {
        let local_ranges = block_param_ranges(specs);
        let local_total_p = local_ranges.last().map(|(_, end)| *end).unwrap_or(0);
        let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
        assemble_active_constraint_block(
            &block_constraints,
            &cached_active_sets,
            &local_ranges,
            local_total_p,
        )
        .map(std::sync::Arc::new)
    };
    Ok(BlockwiseInnerResult {
        block_states: states,
        active_sets: normalize_active_sets(cached_active_sets),
        log_likelihood: cached_eval.log_likelihood,
        penalty_value,
        cycles: cycles_done,
        converged,
        block_logdet_h,
        block_logdet_s,
        s_lambdas,
        joint_workspace: None,
        kkt_residual,
        active_constraints,
    })
}

/// Borrowed derivative provider for joint models that wraps closures with
/// non-`'static` lifetimes.
///
/// The closures borrow data from the calling stack frame (family, synced states,
/// specs), so they carry a non-`'static` lifetime `'a`. This provider therefore
/// holds plain references to them and implements `HessianDerivativeProvider`
/// directly on top of those references.
///
/// # Sign convention
///
/// The unified evaluator passes `v_k = H⁻¹(A_k β̂)` to `hessian_derivative_correction`.
/// By the implicit function theorem, `dβ̂/dρ_k = −v_k`. The stored `compute_dh`
/// expects the actual perturbation direction `δβ`, so we negate `v_k` before calling it.
struct BorrowedJointDerivProvider<'a> {
    /// First-order drift callback. The trait impl calls it with the negated
    /// `v_k` (see the sign convention above).
    compute_dh: &'a DriftDerivFn<'a>,
    /// Optional batched form of `compute_dh`; when set, the batched trait
    /// method hands it all negated directions in one call instead of looping
    /// over `compute_dh`.
    compute_dh_many: Option<&'a DriftDerivManyFn<'a>>,
    /// Mixed second-derivative callback. The trait impl calls it with the
    /// negated pair `(−v_l, −v_k)`.
    compute_d2h: &'a DriftSecondDerivFn<'a>,
    /// Optional batched second-derivative callback. The unified evaluator's
    /// outer-Hessian ρ-ρ pair loop precomputes all K(K+1)/2 (v_k, v_l, u_kl)
    /// triples and calls this once per outer Hessian assembly when set, so
    /// families that fuse the per-row D²H walk across pairs (e.g. survival
    /// marginal-slope which scans n rows once per outer eval) replace
    /// K(K+1)/2 separate row-walks with one. The default `None` falls back
    /// to the per-pair `compute_d2h` dispatch and preserves the historical
    /// dispatch cost.
    compute_d2h_many: Option<&'a DriftSecondDerivManyFn<'a>>,
    /// Optional family-supplied outer-Hessian operator; returned (cloned) by
    /// the `family_outer_hessian_operator` trait accessor.
    family_outer_hessian_operator:
        Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>,
}

/// Builds the second-derivative correction shared by the borrowed and owned
/// joint derivative providers.
///
/// Two pieces are combined into one `CompositeHyperOperator`:
/// * the drift derivative obtained from `compute_dh(u_kl)`, and
/// * the mixed second derivative obtained from `compute_d2h(−v_l, −v_k)`.
///
/// `compute_dh` is evaluated first; if it yields `None` the function returns
/// `Ok(None)` without invoking `compute_d2h`. A `None` from `compute_d2h`
/// likewise produces `Ok(None)`. Callback errors are propagated unchanged.
fn joint_second_derivative_correction_result(
    compute_dh: &dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String>,
    compute_d2h: &dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>,
    v_k: &Array1<f64>,
    v_l: &Array1<f64>,
    u_kl: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
    let drift_term = match compute_dh(u_kl)? {
        Some(term) => term,
        None => return Ok(None),
    };

    // The second-order callback works in perturbation directions, i.e. the
    // negated `v` vectors, with the `l` direction passed first.
    let dir_k = -v_k;
    let dir_l = -v_l;
    let fused = compute_d2h(&dir_l, &dir_k)?.map(|mixed_term| {
        let composite = crate::solver::estimate::reml::unified::CompositeHyperOperator {
            dense: None,
            operators: vec![drift_term.into_operator(), mixed_term.into_operator()],
            dim_hint: u_kl.len(),
        };
        DriftDerivResult::Operator(Arc::new(composite))
    });
    Ok(fused)
}

impl HessianDerivativeProvider for BorrowedJointDerivProvider<'_> {
    /// Dense form of `hessian_derivative_correction_result`: the operator
    /// result is materialised with `to_dense()`.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_derivative_correction_result(v_k)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// First-order drift for one direction. `v_k` is negated before it is
    /// handed to `compute_dh`, which expects the perturbation `δβ = −v_k`.
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let neg_v = -v_k;
        (self.compute_dh)(&neg_v)
    }

    /// Batched first-order drift. All directions are negated once, then
    /// dispatched to `compute_dh_many` when available, otherwise to
    /// `compute_dh` one direction at a time (stopping at the first error).
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let neg_vs: Vec<Array1<f64>> = v_ks.iter().map(|v_k| -v_k).collect();
        if let Some(compute_dh_many) = self.compute_dh_many {
            compute_dh_many(&neg_vs)
        } else {
            neg_vs
                .iter()
                .map(|neg_v| (self.compute_dh)(neg_v))
                .collect()
        }
    }

    /// True when a batched first-order callback was supplied.
    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.compute_dh_many.is_some()
    }

    /// Dense form of `hessian_second_derivative_correction_result`.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// Second-order correction for one `(v_k, v_l, u_kl)` triple, assembled by
    /// the shared `joint_second_derivative_correction_result` helper.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        joint_second_derivative_correction_result(self.compute_dh, self.compute_d2h, v_k, v_l, u_kl)
    }

    /// Batched second-order corrections, one entry per input triple.
    ///
    /// Without a batched D²H callback this simply maps the per-triple method.
    /// With one, term1 is `compute_dh` evaluated at each `u_kl` (batched when
    /// `compute_dh_many` is set) and term2 is `compute_d2h_many` evaluated at
    /// each `(−v_l, −v_k)`; both are fused into a `CompositeHyperOperator`,
    /// matching what the per-triple hook produces. An entry is `None` when
    /// either term is absent.
    ///
    /// # Errors
    ///
    /// Propagates callback errors, and reports an error when a batched
    /// callback returns a result count different from `triples.len()`.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let Some(compute_d2h_many) = self.compute_d2h_many else {
            return triples
                .iter()
                .map(|(v_k, v_l, u_kl)| {
                    self.hessian_second_derivative_correction_result(v_k, v_l, u_kl)
                })
                .collect();
        };

        // term1 is evaluated at `u_kl` itself (no sign flip), so call the
        // drift callbacks directly rather than negating twice through
        // `hessian_derivative_corrections_result`.
        let term1s: Vec<Option<DriftDerivResult>> =
            if let Some(compute_dh_many) = self.compute_dh_many {
                let u_kls: Vec<Array1<f64>> =
                    triples.iter().map(|(_, _, u_kl)| u_kl.clone()).collect();
                compute_dh_many(&u_kls)?
            } else {
                triples
                    .iter()
                    .map(|(_, _, u_kl)| (self.compute_dh)(u_kl))
                    .collect::<Result<Vec<_>, String>>()?
            };
        let pairs: Vec<(Array1<f64>, Array1<f64>)> =
            triples.iter().map(|(v_k, v_l, _)| (-v_l, -v_k)).collect();
        let term2s = compute_d2h_many(&pairs)?;

        // A callback that returns the wrong number of results would otherwise
        // panic on indexing or silently mis-pair terms with triples.
        if term1s.len() != triples.len() || term2s.len() != triples.len() {
            return Err(format!(
                "joint derivative provider: batched callbacks returned {} first-order and {} second-order corrections for {} triples",
                term1s.len(),
                term2s.len(),
                triples.len(),
            ));
        }

        // Consume the term vectors so each result is moved, not cloned.
        Ok(triples
            .iter()
            .zip(term1s)
            .zip(term2s)
            .map(|(((_, _, u_kl), term1), term2)| match (term1, term2) {
                (Some(t1), Some(t2)) => {
                    let op = crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![t1.into_operator(), t2.into_operator()],
                        dim_hint: u_kl.len(),
                    };
                    Some(DriftDerivResult::Operator(Arc::new(op)))
                }
                _ => None,
            })
            .collect())
    }

    /// True when a batched second-order callback was supplied.
    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.compute_d2h_many.is_some()
    }

    /// This provider always reports that corrections are available.
    fn has_corrections(&self) -> bool {
        true
    }

    /// Returns a clone of the stored family outer-Hessian operator, if any.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.family_outer_hessian_operator.clone()
    }
}

/// Owned counterpart of `BorrowedJointDerivProvider`: the same set of
/// callbacks, but each one is held behind an `Arc` with `Send + Sync` bounds
/// rather than as a borrowed reference.
///
/// The trait impl follows the same sign convention as the borrowed provider:
/// `v_k` is negated before `compute_dh` is called, and `compute_d2h` receives
/// the negated pair `(−v_l, −v_k)`.
struct OwnedJointDerivProvider {
    /// First-order drift callback, called with the negated `v_k`.
    compute_dh: Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    /// Optional batched form of `compute_dh`, taking all negated directions
    /// in one call.
    compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    /// Mixed second-derivative callback, called with `(−v_l, −v_k)`.
    compute_d2h: Arc<
        dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
            + Send
            + Sync,
    >,
    /// Optional batched second-derivative callback. See the matching field on
    /// `BorrowedJointDerivProvider` for the dispatch contract.
    compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    /// Optional family-supplied outer-Hessian operator; returned (cloned) by
    /// the `family_outer_hessian_operator` trait accessor.
    family_outer_hessian_operator:
        Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>,
}

impl HessianDerivativeProvider for OwnedJointDerivProvider {
    /// Dense form of `hessian_derivative_correction_result`: the operator
    /// result is materialised with `to_dense()`.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_derivative_correction_result(v_k)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// First-order drift for one direction. `v_k` is negated before it is
    /// handed to `compute_dh`.
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let neg_v = -v_k;
        (self.compute_dh)(&neg_v)
    }

    /// Batched first-order drift. All directions are negated once, then
    /// dispatched to `compute_dh_many` when available, otherwise to
    /// `compute_dh` one direction at a time (stopping at the first error).
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let neg_vs: Vec<Array1<f64>> = v_ks.iter().map(|v_k| -v_k).collect();
        if let Some(compute_dh_many) = self.compute_dh_many.as_ref() {
            compute_dh_many(&neg_vs)
        } else {
            neg_vs
                .iter()
                .map(|neg_v| (self.compute_dh)(neg_v))
                .collect()
        }
    }

    /// True when a batched first-order callback was supplied.
    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.compute_dh_many.is_some()
    }

    /// Dense form of `hessian_second_derivative_correction_result`.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// Second-order correction for one `(v_k, v_l, u_kl)` triple, assembled by
    /// the shared `joint_second_derivative_correction_result` helper.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        joint_second_derivative_correction_result(
            &*self.compute_dh,
            &*self.compute_d2h,
            v_k,
            v_l,
            u_kl,
        )
    }

    /// Batched second-order corrections, one entry per input triple.
    ///
    /// Without a batched D²H callback this simply maps the per-triple method.
    /// With one, term1 is `compute_dh` evaluated at each `u_kl` (batched when
    /// `compute_dh_many` is set) and term2 is `compute_d2h_many` evaluated at
    /// each `(−v_l, −v_k)`; both are fused into a `CompositeHyperOperator`,
    /// matching what the per-triple hook produces. An entry is `None` when
    /// either term is absent.
    ///
    /// # Errors
    ///
    /// Propagates callback errors, and reports an error when a batched
    /// callback returns a result count different from `triples.len()`.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let Some(compute_d2h_many) = self.compute_d2h_many.as_ref() else {
            return triples
                .iter()
                .map(|(v_k, v_l, u_kl)| {
                    self.hessian_second_derivative_correction_result(v_k, v_l, u_kl)
                })
                .collect();
        };

        // term1 is evaluated at `u_kl` itself (no sign flip), so call the
        // drift callbacks directly rather than negating twice through
        // `hessian_derivative_corrections_result`.
        let term1s: Vec<Option<DriftDerivResult>> =
            if let Some(compute_dh_many) = self.compute_dh_many.as_ref() {
                let u_kls: Vec<Array1<f64>> =
                    triples.iter().map(|(_, _, u_kl)| u_kl.clone()).collect();
                compute_dh_many(&u_kls)?
            } else {
                triples
                    .iter()
                    .map(|(_, _, u_kl)| (self.compute_dh)(u_kl))
                    .collect::<Result<Vec<_>, String>>()?
            };
        let pairs: Vec<(Array1<f64>, Array1<f64>)> =
            triples.iter().map(|(v_k, v_l, _)| (-v_l, -v_k)).collect();
        let term2s = compute_d2h_many(&pairs)?;

        // A callback that returns the wrong number of results would otherwise
        // panic on indexing or silently mis-pair terms with triples.
        if term1s.len() != triples.len() || term2s.len() != triples.len() {
            return Err(format!(
                "joint derivative provider: batched callbacks returned {} first-order and {} second-order corrections for {} triples",
                term1s.len(),
                term2s.len(),
                triples.len(),
            ));
        }

        // Consume the term vectors so each result is moved, not cloned.
        Ok(triples
            .iter()
            .zip(term1s)
            .zip(term2s)
            .map(|(((_, _, u_kl), term1), term2)| match (term1, term2) {
                (Some(t1), Some(t2)) => {
                    let op = crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![t1.into_operator(), t2.into_operator()],
                        dim_hint: u_kl.len(),
                    };
                    Some(DriftDerivResult::Operator(Arc::new(op)))
                }
                _ => None,
            })
            .collect())
    }

    /// True when a batched second-order callback was supplied.
    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.compute_d2h_many.is_some()
    }

    /// This provider always reports that corrections are available.
    fn has_corrections(&self) -> bool {
        true
    }

    /// Exposes the owned first- and second-order callbacks as a `Callback`
    /// kernel by cloning the two `Arc` handles.
    fn outer_hessian_derivative_kernel(
        &self,
    ) -> Option<crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel> {
        Some(
            crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel::Callback {
                first: Arc::clone(&self.compute_dh),
                second: Arc::clone(&self.compute_d2h),
            },
        )
    }

    /// Returns a clone of the stored family outer-Hessian operator, if any.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.family_outer_hessian_operator.clone()
    }
}

/// Drift closure producing the Tier-B Jeffreys-curvature drift
/// `D_β H_Φ[δβ]` for a mode-response direction `δβ = dβ̂/dρ_k`.
///
/// The closure expects the actual perturbation direction `δβ` (NOT the raw
/// `v_k` the trait hands the provider); the wrapper negates `v_k → δβ = −v_k`
/// before calling, exactly mirroring `BorrowedJointDerivProvider`'s sign
/// convention and the inner `compute_dh` it composes with.
///
/// Return contract:
/// - `Ok(Some(matrix))`: the dense drift that the wrapper adds to the inner
///   provider's result.
/// - `Ok(None)`: the Jeffreys term is gated out or the family lacks the exact
///   derivatives, so the wrapper falls back to the inner provider's drift
///   unchanged.
/// - `Err(message)`: propagated to the caller by the wrapper.
type JeffreysHphiDriftFn =
    Arc<dyn Fn(&Array1<f64>) -> Result<Option<Array2<f64>>, String> + Send + Sync>;

/// Jeffreys-`H_Φ`-aware joint derivative provider.
///
/// Wraps an inner Tier-B joint provider (which supplies the likelihood-Hessian
/// drift `D_β H_L[v_k]`) and ADDS the Jeffreys-curvature drift `D_β H_Φ[v_k]` to
/// the first-order trace corrections. This closes the bug where the Tier-B outer
/// LAML gradient omitted `H_Φ`'s ρ-dependence (through β̂): the objective folds
/// `H_Φ` into `½ log|H + S_λ + H_Φ|`, so its exact gradient
///   `½ tr[(H+S_λ+H_Φ)⁻¹ (∂_ρ S_λ + D_β H_L[v_k] + D_β H_Φ[v_k])]`
/// MUST include the `D_β H_Φ[v_k]` term. It is the exact analogue of the Tier-A
/// `FirthAwareGlmDerivatives` (`unified.rs`) `−D(Hφ)[B_k]` first-order term, and
/// of `BarrierDerivativeProvider`'s additive-correction composition pattern.
///
/// SIGN. The trait passes `v_k = H⁻¹(A_kβ̂)`; the mode response is `δβ = −v_k`.
/// We negate before invoking the drift closure, so `corr = + D_β H_Φ[δβ]` is
/// added on top of the inner provider's already-correct likelihood drift.
struct JeffreysHphiAwareJointDerivatives<'a> {
    /// Wrapped provider; every trait method delegates to it, and the
    /// first-order methods add the Jeffreys drift on top of its result.
    inner: Box<dyn HessianDerivativeProvider + 'a>,
    /// Closure evaluating `D_β H_Φ[δβ]`; it is always called with the negated
    /// trait direction `δβ = −v_k`.
    drift: JeffreysHphiDriftFn,
    /// Dimension hint (`dim_hint`) given to the composite operator built when
    /// the inner result is operator-backed and a dense drift must be added.
    p: usize,
}

impl<'a> JeffreysHphiAwareJointDerivatives<'a> {
    /// Bundles the wrapped provider, the Jeffreys drift closure and the
    /// dimension hint `p` into a new wrapper; no work is done at construction.
    fn new(
        inner: Box<dyn HessianDerivativeProvider + 'a>,
        drift: JeffreysHphiDriftFn,
        p: usize,
    ) -> Self {
        Self { inner, drift, p }
    }

    /// Evaluates `D_β H_Φ[δβ]` for the trait direction `v_k`.
    ///
    /// The drift closure expects the mode response `δβ = −v_k`, so the direction
    /// is negated element-wise before the call. The closure's result (including
    /// `Ok(None)` and `Err`) is returned untouched.
    fn hphi_drift(&self, v_k: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
        let mode_response = -v_k;
        (self.drift)(&mode_response)
    }
}

impl HessianDerivativeProvider for JeffreysHphiAwareJointDerivatives<'_> {
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let inner = self.inner.hessian_derivative_correction(v_k)?;
        let drift = self.hphi_drift(v_k)?;
        Ok(match (inner, drift) {
            (Some(mut ic), Some(d)) => {
                ic += &d;
                Some(ic)
            }
            (Some(ic), None) => Some(ic),
            (None, Some(d)) => Some(d),
            (None, None) => None,
        })
    }

    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let inner = self.inner.hessian_derivative_correction_result(v_k)?;
        let drift = self.hphi_drift(v_k)?;
        Ok(match (inner, drift) {
            (Some(DriftDerivResult::Dense(mut dense)), Some(d)) => {
                dense += &d;
                Some(DriftDerivResult::Dense(dense))
            }
            (Some(DriftDerivResult::Operator(operator)), Some(d)) => {
                Some(DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: Some(d),
                        operators: vec![operator],
                        dim_hint: self.p,
                    },
                )))
            }
            (Some(other), None) => Some(other),
            (None, Some(d)) => Some(DriftDerivResult::Dense(d)),
            (None, None) => None,
        })
    }

    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        // Delegate the (possibly batched) inner walk, then fold the per-direction
        // H_Φ drift into each result so the batched path stays consistent with the
        // singular one.
        let inner = self.inner.hessian_derivative_corrections_result(v_ks)?;
        inner
            .into_iter()
            .zip(v_ks.iter())
            .map(|(inner_result, v_k)| {
                let drift = self.hphi_drift(v_k)?;
                Ok(match (inner_result, drift) {
                    (Some(DriftDerivResult::Dense(mut dense)), Some(d)) => {
                        dense += &d;
                        Some(DriftDerivResult::Dense(dense))
                    }
                    (Some(DriftDerivResult::Operator(operator)), Some(d)) => {
                        Some(DriftDerivResult::Operator(Arc::new(
                            crate::solver::estimate::reml::unified::CompositeHyperOperator {
                                dense: Some(d),
                                operators: vec![operator],
                                dim_hint: self.p,
                            },
                        )))
                    }
                    (Some(other), None) => Some(other),
                    (None, Some(d)) => Some(DriftDerivResult::Dense(d)),
                    (None, None) => None,
                })
            })
            .collect()
    }

    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.inner.has_batched_hessian_derivative_corrections()
    }

    // SECOND-ORDER (outer Hessian) RESIDUAL GAP. The full second-order Jeffreys
    // drift `D²_β H_Φ[v_k, v_l]` (the analogue of Tier-A's
    // `−D(Hφ)[B_{kl}] − D²(Hφ)[B_k, B_l]`) is NOT yet folded in here: the
    // second-derivative methods delegate to the inner likelihood drift only. This
    // leaves the OUTER HESSIAN's Jeffreys contribution first-order-incomplete, but
    // the FIRST-ORDER outer GRADIENT — the term the line search and KKT
    // certification actually consume — is now exact. ARC/Newton on the outer
    // problem still gets a consistent gradient; the Hessian is a (PD) curvature
    // surrogate as before.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.inner
            .hessian_second_derivative_correction(v_k, v_l, u_kl)
    }

    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        self.inner
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)
    }

    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        self.inner
            .hessian_second_derivative_corrections_result(triples)
    }

    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.inner
            .has_batched_hessian_second_derivative_corrections()
    }

    fn has_corrections(&self) -> bool {
        true
    }

    fn outer_hessian_derivative_kernel(
        &self,
    ) -> Option<crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel> {
        // Delegate to the inner provider so the matrix-free outer-HESSIAN route
        // (the `Callback { first, second }` kernel) is preserved. This kernel
        // feeds ONLY the outer Hessian, never the gradient (the gradient's
        // first-order trace flows through `hessian_derivative_correction_result`,
        // which IS wrapped above). The H_Φ SECOND-order drift is the documented
        // residual gap; routing the kernel unchanged keeps the Hessian a
        // consistent PD curvature surrogate without forcing dense assembly.
        self.inner.outer_hessian_derivative_kernel()
    }

    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.inner.family_outer_hessian_operator()
    }
}

/// Optional bundle of extended (ψ) hyperparameter coordinate data to attach
/// to an `InnerSolution` before calling the unified evaluator.
struct ExtCoordBundle {
    /// Extended hyperparameter coordinates; handed to the assembly as
    /// `ext_coords`, and their count is reported as the extended dimension.
    coords: Vec<HyperCoord>,
    /// Per-pair ψψ second-order callback, indexed by two coordinate indices;
    /// handed to the assembly as `ext_coord_pair_fn`.
    ext_ext_fn: Option<Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>>,
    /// Pair callback handed to the assembly as `rho_ext_pair_fn`
    /// (presumably the ρ–ψ cross pairs; confirm the index convention against
    /// the unified evaluator).
    rho_ext_fn: Option<Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>>,
    /// Drift-derivative callback taking an extended-coordinate index and a
    /// direction vector; handed to the assembly as `fixed_drift_deriv`.
    drift_fn: Option<FixedDriftDerivFn>,
    /// Direction-contracted ψψ second-order hook (#740). When `Some`, the
    /// outer-Hessian operator builder skips the `K²` per-pair ψψ assembly
    /// (`ext_ext_fn`) and applies this once per matvec. `ext_ext_fn` is still
    /// kept as the documented fallback for the dense `compute_outer_hessian`
    /// path and for outer evaluations that do not build the matrix-free
    /// operator.
    contracted_psi_fn: Option<ContractedPsiSecondOrderFn>,
}

/// A `HyperOperator` that represents `scale · inner` without materializing the
/// scaled operator up front; the factor is applied to each result on demand.
struct ScaledHyperOperator {
    /// The operator being scaled.
    inner: Arc<dyn HyperOperator>,
    /// Scalar factor multiplied into every product, bilinear form and dense
    /// entry produced by `inner`.
    scale: f64,
}

impl HyperOperator for ScaledHyperOperator {
    /// Same dimension as the wrapped operator.
    fn dim(&self) -> usize {
        self.inner.dim()
    }

    /// `scale · (inner · v)`, computed by scaling the wrapped product entrywise.
    fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
        let factor = self.scale;
        let unscaled = self.inner.mul_vec(v);
        unscaled.mapv(|entry| factor * entry)
    }

    /// `scale · inner.bilinear(v, u)`.
    fn bilinear(&self, v: &Array1<f64>, u: &Array1<f64>) -> f64 {
        let unscaled = self.inner.bilinear(v, u);
        self.scale * unscaled
    }

    /// Dense form of the wrapped operator with every entry multiplied by `scale`.
    fn to_dense(&self) -> Array2<f64> {
        let factor = self.scale;
        let unscaled = self.inner.to_dense();
        unscaled.mapv(|entry| factor * entry)
    }

    /// Always reports `false`, independent of the wrapped operator.
    // NOTE(review): the wrapped operator's own `is_implicit()` is not consulted;
    // confirm that callers never need that flag to survive scaling.
    fn is_implicit(&self) -> bool {
        false
    }
}

/// Multiplies every populated representation of a coordinate drift by `scale`.
///
/// The dense matrix and the block-local matrix are scaled in place; an operator
/// representation is wrapped in a `ScaledHyperOperator`. A `scale` of exactly
/// `1.0` returns the drift untouched.
fn scale_hypercoord_drift(mut drift: HyperCoordDrift, scale: f64) -> HyperCoordDrift {
    if scale != 1.0 {
        if let Some(dense) = drift.dense.as_mut() {
            *dense *= scale;
        }
        if let Some(block_local) = drift.block_local.as_mut() {
            block_local.local *= scale;
        }
        if let Some(unscaled) = drift.operator.take() {
            drift.operator = Some(Arc::new(ScaledHyperOperator {
                inner: unscaled,
                scale,
            }));
        }
    }
    drift
}

/// Multiplies a hyper-coordinate's scalable parts by `scale`.
///
/// Scaled parts: `g`, the optional `firth_g`, `tk_eta_fixed` and `tk_x_fixed`
/// entries, and the drift (via `scale_hypercoord_drift`). A `scale` of exactly
/// `1.0` returns the coordinate untouched.
fn scale_hypercoord(mut coord: HyperCoord, scale: f64) -> HyperCoord {
    if scale != 1.0 {
        coord.g *= scale;
        if let Some(firth_g) = &mut coord.firth_g {
            *firth_g *= scale;
        }
        if let Some(tk_eta_fixed) = &mut coord.tk_eta_fixed {
            *tk_eta_fixed *= scale;
        }
        if let Some(tk_x_fixed) = &mut coord.tk_x_fixed {
            *tk_x_fixed *= scale;
        }
        coord.drift = scale_hypercoord_drift(coord.drift, scale);
    }
    coord
}

/// Multiplies a hyper-coordinate pair's `g`, `b_mat` and optional `b_operator`
/// by `scale`.
///
/// The operator, when present, is moved into an `Arc` and wrapped in a
/// `ScaledHyperOperator`. A `scale` of exactly `1.0` returns the pair untouched.
fn scale_hypercoord_pair(mut pair: HyperCoordPair, scale: f64) -> HyperCoordPair {
    if scale != 1.0 {
        pair.g *= scale;
        pair.b_mat *= scale;
        if let Some(unscaled) = pair.b_operator.take() {
            pair.b_operator = Some(Box::new(ScaledHyperOperator {
                inner: Arc::from(unscaled),
                scale,
            }));
        }
    }
    pair
}

/// Multiplies a drift-derivative result by `scale`, preserving its
/// representation.
///
/// A dense matrix is scaled in place; an operator is wrapped in a
/// `ScaledHyperOperator`. A `scale` of exactly `1.0` returns the input as is.
fn scale_drift_deriv_result(result: DriftDerivResult, scale: f64) -> DriftDerivResult {
    match result {
        unchanged if scale == 1.0 => unchanged,
        DriftDerivResult::Dense(mut matrix) => {
            matrix *= scale;
            DriftDerivResult::Dense(matrix)
        }
        DriftDerivResult::Operator(unscaled) => {
            let scaled = ScaledHyperOperator {
                inner: unscaled,
                scale,
            };
            DriftDerivResult::Operator(Arc::new(scaled))
        }
    }
}

impl ExtCoordBundle {
    fn scaled(self, scale: f64) -> Self {
        if scale == 1.0 {
            return self;
        }
        let coords = self
            .coords
            .into_iter()
            .map(|coord| scale_hypercoord(coord, scale))
            .collect();
        let ext_ext_fn = self.ext_ext_fn.map(|callback| {
            Box::new(move |i: usize, j: usize| scale_hypercoord_pair(callback(i, j), scale))
                as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
        });
        let rho_ext_fn = self.rho_ext_fn.map(|callback| {
            Box::new(move |i: usize, j: usize| scale_hypercoord_pair(callback(i, j), scale))
                as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
        });
        let drift_fn = self.drift_fn.map(|callback| {
            Box::new(move |ext_idx: usize, direction: &Array1<f64>| {
                callback(ext_idx, direction).map(|result| scale_drift_deriv_result(result, scale))
            }) as FixedDriftDerivFn
        });
        // The contracted ψψ hook is a (scaled) linear functional of the same
        // family curvature `ext_ext_fn` reproduces, so the `rho_curvature_scale`
        // applies term-for-term: objective/score/ld_s by `scale`, and each
        // `hessian[i]` drift via `scale_drift_deriv_result` (matching how
        // `scale_hypercoord_pair` scales the per-pair `b_mat`/`b_operator`).
        let contracted_psi_fn = self.contracted_psi_fn.map(|callback| {
            Arc::new(move |alpha_psi: &[f64]| {
                callback(alpha_psi).map(|opt| {
                    opt.map(|contracted| ContractedPsiSecondOrder {
                        objective: contracted.objective.mapv(|v| scale * v),
                        score: contracted.score.mapv(|v| scale * v),
                        hessian: contracted
                            .hessian
                            .into_iter()
                            .map(|drift| scale_drift_deriv_result(drift, scale))
                            .collect(),
                        ld_s: contracted.ld_s.mapv(|v| scale * v),
                    })
                })
            }) as ContractedPsiSecondOrderFn
        });
        Self {
            coords,
            ext_ext_fn,
            rho_ext_fn,
            drift_fn,
            contracted_psi_fn,
        }
    }
}

/// Build the canonical unified REML/LAML assembly for a custom-family outer
/// evaluation.
///
/// Densifies every block penalty, derives the penalty coordinates and the
/// penalty log-determinant derivatives from them, unpacks the optional
/// extended-coordinate bundle, and packs everything into an `InnerAssembly`.
///
/// Returns the assembly together with the number of extended coordinates
/// (zero when `ext_bundle` is `None`). Errors from `penalty_coords_from_blocks`
/// and `compute_block_penalty_logdet_derivs` are propagated.
fn build_custom_family_inner_assembly<'dp>(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + 'dp>,
    ext_bundle: Option<ExtCoordBundle>,
    firth_value: Option<f64>,
) -> Result<(crate::estimate::reml::assembly::InnerAssembly<'dp>, usize), String> {
    use crate::estimate::reml::assembly::{
        InnerAssembly, PenaltyBlockDesc, penalty_coords_from_blocks,
    };

    // Densify each block's penalties up front (one task per block) so the
    // descriptors below can borrow matrices that outlive the assembler call.
    let per_block_penalties_dense: Vec<Vec<Array2<f64>>> = {
        use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
        specs
            .par_iter()
            .map(|spec| spec.penalties.iter().map(|p| p.to_dense()).collect())
            .collect()
    };

    // One descriptor per (block, penalty) pair, tagged with the block's
    // coefficient range.
    let block_descs: Vec<PenaltyBlockDesc> = per_block_penalties_dense
        .iter()
        .enumerate()
        .flat_map(|(block_idx, block_penalties)| {
            let (range_start, range_end) = ranges[block_idx];
            block_penalties.iter().map(move |matrix| PenaltyBlockDesc {
                matrix,
                range_start,
                range_end,
            })
        })
        .collect();
    let penalty_coords = penalty_coords_from_blocks(&block_descs, total)?;

    // Penalty log-determinant derivatives; the ridge only enters when the ridge
    // policy asks for it to be part of the penalty logdet.
    let per_block_penalties: Vec<&[Array2<f64>]> = per_block_penalties_dense
        .iter()
        .map(Vec::as_slice)
        .collect();
    let penalty_logdet_ridge = if options.ridge_policy.include_penalty_logdet {
        ridge
    } else {
        0.0
    };
    let penalty_logdet =
        compute_block_penalty_logdet_derivs(per_block, &per_block_penalties, penalty_logdet_ridge)?;

    // Observation count is read off the first block's linear predictor;
    // zero when there are no block states.
    let n_observations = inner
        .block_states
        .first()
        .map_or(0, |state| state.eta.len());

    // Split the optional ext-coord bundle into its assembly slots.
    let (ext_coords, ext_coord_pair_fn, rho_ext_pair_fn, fixed_drift_deriv, contracted_psi_fn) =
        match ext_bundle {
            Some(bundle) => (
                bundle.coords,
                bundle.ext_ext_fn,
                bundle.rho_ext_fn,
                bundle.drift_fn,
                bundle.contracted_psi_fn,
            ),
            None => (Vec::new(), None, None, None, None),
        };

    let ext_dim = ext_coords.len();

    let evaluator = InnerAssembly {
        log_likelihood: inner.log_likelihood,
        // inner.penalty_value includes the 0.5 factor (= 0.5 β̂ᵀSβ̂), but the
        // unified evaluator convention expects the FULL quadratic β̂ᵀSβ̂ and
        // applies 0.5 itself. Double to match the convention.
        penalty_quadratic: 2.0 * inner.penalty_value,
        beta: beta_flat.clone(),
        n_observations,
        hessian_op,
        penalty_coords,
        penalty_logdet,
        dispersion: DispersionHandling::Fixed {
            phi: 1.0,
            include_logdet_h,
            include_logdet_s,
        },
        rho_curvature_scale,
        rho_prior,
        hessian_logdet_correction,
        penalty_subspace_trace,
        deriv_provider: Some(deriv_provider),
        tk_correction: 0.0,
        tk_gradient: None,
        // Tier-B Firth fold (gam#979): the inner mode minimizes
        // `−ℓ + ½βᵀSβ − Φ`, so the LAML cost must subtract the same gated
        // `Φ(β̂)` or the envelope-based analytic outer gradient and the value
        // describe different criteria at every Firth-active mode.
        firth: firth_value.map(crate::estimate::reml::unified::ExactJeffreysTerm::value_only),
        nullspace_dim: None,
        barrier_config: None,
        ext_coords,
        ext_coord_pair_fn,
        rho_ext_pair_fn,
        fixed_drift_deriv,
        contracted_psi_second_order: contracted_psi_fn,
        kkt_residual: inner.kkt_residual.clone(),
        active_constraints: inner.active_constraints.clone(),
    };

    Ok((evaluator, ext_dim))
}

/// `HessianOperator` wrapper that answers a fixed number of first-order logdet
/// trace requests with `0.0` and otherwise delegates to the wrapped operator.
///
/// While the budget is non-zero the wrapper also hides the dense-spectral
/// views, refuses dense tangent-projection assembly, and reports that it does
/// not prefer stochastic trace estimation.
struct FirstOrderTraceSkipOperator {
    /// Operator that handles every call that is not skipped.
    inner: Arc<dyn HessianOperator>,
    /// Number of first-order logdet trace calls still to be answered with
    /// `0.0`; decremented atomically by each skipped call.
    remaining_first_order_traces: AtomicUsize,
}

impl FirstOrderTraceSkipOperator {
    /// Wraps `inner` with a budget of `skip_count` first-order traces to skip.
    fn new(inner: Arc<dyn HessianOperator>, skip_count: usize) -> Self {
        let remaining_first_order_traces = AtomicUsize::new(skip_count);
        Self {
            inner,
            remaining_first_order_traces,
        }
    }

    /// `true` while at least one first-order trace is still to be skipped.
    fn first_order_skip_active(&self) -> bool {
        self.remaining_first_order_traces.load(Ordering::Acquire) != 0
    }

    /// Atomically takes one unit from the skip budget.
    ///
    /// Returns `true` when a unit was available (the caller must skip its
    /// trace) and `false` once the budget is exhausted. The decrement never
    /// goes below zero, even under concurrent callers.
    fn consume_first_order_trace(&self) -> bool {
        self.remaining_first_order_traces
            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |remaining| {
                remaining.checked_sub(1)
            })
            .is_ok()
    }
}

impl HessianOperator for FirstOrderTraceSkipOperator {
    fn logdet(&self) -> f64 {
        self.inner.logdet()
    }

    fn trace_hinv_product(&self, a: &Array2<f64>) -> f64 {
        self.inner.trace_hinv_product(a)
    }

    fn as_exact_dense_spectral(&self) -> Option<&DenseSpectralOperator> {
        if self.first_order_skip_active() {
            None
        } else {
            self.inner.as_exact_dense_spectral()
        }
    }

    fn assemble_h_dense_for_tangent_projection(&self) -> Result<Array2<f64>, String> {
        if self.first_order_skip_active() {
            Err("backend does not support tangent projection".to_string())
        } else {
            self.inner.assemble_h_dense_for_tangent_projection()
        }
    }

    fn trace_hinv_operator(&self, op: &dyn HyperOperator) -> f64 {
        self.inner.trace_hinv_operator(op)
    }

    fn trace_hinv_h_k(
        &self,
        a_k: &Array2<f64>,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        self.inner.trace_hinv_h_k(a_k, third_deriv_correction)
    }

    fn solve(&self, rhs: &Array1<f64>) -> Array1<f64> {
        self.inner.solve(rhs)
    }

    fn solve_multi(&self, rhs: &Array2<f64>) -> Array2<f64> {
        self.inner.solve_multi(rhs)
    }

    fn stochastic_trace_solve(&self, rhs: &Array1<f64>, rel_tol: f64) -> Array1<f64> {
        self.inner.stochastic_trace_solve(rhs, rel_tol)
    }

    fn stochastic_trace_solve_for_probe(
        &self,
        rhs: &Array1<f64>,
        rel_tol: f64,
        probe_id: u64,
        trace_state: Option<&Arc<Mutex<StochasticTraceState>>>,
    ) -> Array1<f64> {
        self.inner
            .stochastic_trace_solve_for_probe(rhs, rel_tol, probe_id, trace_state)
    }

    fn stochastic_trace_solve_multi(&self, rhs: &Array2<f64>, rel_tol: f64) -> Array2<f64> {
        self.inner.stochastic_trace_solve_multi(rhs, rel_tol)
    }

    fn has_matrix_free_trace_cg_operator(&self) -> bool {
        self.inner.has_matrix_free_trace_cg_operator()
    }

    fn trace_hinv_product_cross(&self, a: &Array2<f64>, b: &Array2<f64>) -> f64 {
        self.inner.trace_hinv_product_cross(a, b)
    }

    fn trace_hinv_matrix_operator_cross(
        &self,
        matrix: &Array2<f64>,
        op: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_hinv_matrix_operator_cross(matrix, op)
    }

    fn trace_hinv_operator_cross(
        &self,
        left: &dyn HyperOperator,
        right: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_hinv_operator_cross(left, right)
    }

    fn trace_logdet_gradient(&self, a: &Array2<f64>) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner.trace_logdet_gradient(a)
        }
    }

    fn xt_logdet_kernel_x_diagonal(&self, x: &DesignMatrix) -> Array1<f64> {
        self.inner.xt_logdet_kernel_x_diagonal(x)
    }

    fn trace_logdet_operator(&self, op: &dyn HyperOperator) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner.trace_logdet_operator(op)
        }
    }

    fn trace_logdet_h_k(
        &self,
        a_k: &Array2<f64>,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner.trace_logdet_h_k(a_k, third_deriv_correction)
        }
    }

    fn trace_logdet_h_k_operator(
        &self,
        b_k: &dyn HyperOperator,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner
                .trace_logdet_h_k_operator(b_k, third_deriv_correction)
        }
    }

    fn trace_logdet_block_local(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner
                .trace_logdet_block_local(block, scale, start, end)
        }
    }

    fn trace_hinv_block_local(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        self.inner.trace_hinv_block_local(block, scale, start, end)
    }

    fn trace_hinv_block_local_cross(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        self.inner
            .trace_hinv_block_local_cross(block, scale, start, end)
    }

    fn trace_logdet_hessian_cross(&self, h_i: &Array2<f64>, h_j: &Array2<f64>) -> f64 {
        self.inner.trace_logdet_hessian_cross(h_i, h_j)
    }

    fn trace_logdet_hessian_cross_matrix_operator(
        &self,
        h_i: &Array2<f64>,
        h_j: &dyn HyperOperator,
    ) -> f64 {
        self.inner
            .trace_logdet_hessian_cross_matrix_operator(h_i, h_j)
    }

    fn trace_logdet_hessian_cross_operator(
        &self,
        h_i: &dyn HyperOperator,
        h_j: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_logdet_hessian_cross_operator(h_i, h_j)
    }

    fn trace_logdet_hessian_crosses(&self, matrices: &[&Array2<f64>]) -> Array2<f64> {
        self.inner.trace_logdet_hessian_crosses(matrices)
    }

    fn active_rank(&self) -> usize {
        self.inner.active_rank()
    }

    fn dim(&self) -> usize {
        self.inner.dim()
    }

    fn is_dense(&self) -> bool {
        self.inner.is_dense()
    }

    fn prefers_stochastic_trace_estimation(&self) -> bool {
        if self.first_order_skip_active() {
            false
        } else {
            self.inner.prefers_stochastic_trace_estimation()
        }
    }

    fn logdet_traces_match_hinv_kernel(&self) -> bool {
        self.inner.logdet_traces_match_hinv_kernel()
    }

    fn as_dense_spectral(&self) -> Option<&DenseSpectralOperator> {
        if self.first_order_skip_active() {
            None
        } else {
            self.inner.as_dense_spectral()
        }
    }
}

/// Build an `InnerSolution` from joint Hessian data and call the unified evaluator.
///
/// Bridge between the custom family's joint Hessian infrastructure and the
/// unified REML/LAML evaluator, routed through the canonical assembly module.
///
/// When `first_order_trace_skip` carries a non-empty vector, the Hessian
/// operator is wrapped so that its first `len()` first-order logdet traces
/// return zero, and `0.5 * trace` is passed to the evaluator as the gradient
/// part of a first-order trace correction (with a zero cost part).
///
/// Returns `(cost, gradient, hessian)`. If the evaluator produced no gradient,
/// a zero vector of length `rho.len() + ext_dim` is returned in its place.
///
/// # Errors
/// Propagates assembly and evaluation errors, and fails when `rho` is not
/// contiguous in memory.
fn unified_joint_cost_gradient(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + '_>,
    eval_mode: EvalMode,
    ext_bundle: Option<ExtCoordBundle>,
    first_order_trace_skip: Option<Array1<f64>>,
    // Gated Tier-B Jeffreys value `Φ(β̂)`, folded into the LAML cost
    // (`cost −= Φ`) so the outer criterion matches the Φ-augmented inner
    // objective (gam#979). `None` when the term is unavailable/gated to zero.
    firth_value: Option<f64>,
) -> Result<
    (
        f64,
        Array1<f64>,
        crate::solver::outer_strategy::HessianResult,
    ),
    String,
> {
    // Number of first-order traces supplied by the caller; zero means the
    // operator is used as is.
    let skip_count = first_order_trace_skip
        .as_ref()
        .map_or(0, |trace_values| trace_values.len());
    let hessian_op: Arc<dyn HessianOperator> = if skip_count > 0 {
        Arc::new(FirstOrderTraceSkipOperator::new(hessian_op, skip_count))
    } else {
        hessian_op
    };

    let (evaluator, ext_dim) = build_custom_family_inner_assembly(
        inner,
        specs,
        per_block,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        deriv_provider,
        ext_bundle,
        firth_value,
    )?;

    let rho_slice = rho
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;

    // Re-inject the skipped traces as an explicit gradient correction
    // (half of each supplied trace value, no cost shift).
    let first_order_trace_correction = first_order_trace_skip.map(|trace_values| {
        let gradient_correction = trace_values.mapv(|trace| 0.5 * trace);
        (0.0, gradient_correction, None)
    });
    let result = evaluator.evaluate(rho_slice, eval_mode, first_order_trace_correction)?;

    Ok((
        result.cost,
        result
            .gradient
            .unwrap_or_else(|| Array1::zeros(rho.len() + ext_dim)),
        result.hessian,
    ))
}

/// Evaluates the unified criterion for an EFS outer step.
///
/// Builds the canonical assembly (without the Tier-B Firth value), evaluates
/// cost and gradient, and converts the gradient into EFS steps. When any
/// extended coordinate is not penalty-like, the hybrid update is used and its
/// ψ gradient/indices are reported (as `None` when empty); otherwise the plain
/// EFS update is used and both ψ fields are `None`.
///
/// # Errors
/// Propagates assembly and evaluation errors, and fails when `rho` or the
/// returned gradient is not contiguous, or when no gradient was returned.
fn unified_joint_efs_eval(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + '_>,
    ext_bundle: Option<ExtCoordBundle>,
) -> Result<crate::solver::outer_strategy::EfsEval, String> {
    let (assembly, _) = build_custom_family_inner_assembly(
        inner,
        specs,
        per_block,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        deriv_provider,
        ext_bundle,
        // The EFS screening path evaluates the Φ-less criterion with an
        // unaugmented operator throughout; it stays self-consistent without
        // the Tier-B Firth fold.
        None,
    )?;
    let rho_slice = rho
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
    let solution = assembly.build();
    let has_psi = solution
        .ext_coords
        .iter()
        .any(|coord| !coord.is_penalty_like);

    // Always evaluate gradient: the universal-form EFS step
    // `Δρ = log(1 − 2·g_full / q_eff)` reads it directly from the cost
    // gradient slot, so out-of-band cost terms (TK, prior, Firth,
    // barrier, SAS log-δ ridge) shift the multiplicative target through
    // their gradient contribution without needing per-augmentation
    // post-corrections.
    let result = crate::estimate::reml::assembly::evaluate_solution(
        &solution,
        rho_slice,
        EvalMode::ValueAndGradient,
        None,
    )?;

    let gradient = result
        .gradient
        .as_ref()
        .ok_or_else(|| "EFS evaluation did not return the required gradient".to_string())?;
    let gradient_slice = gradient
        .as_slice()
        .ok_or_else(|| "outer gradient must be contiguous for EFS".to_string())?;

    // Shared by both update flavours, so computed once ahead of the split.
    let inner_hessian_scale = crate::estimate::reml::unified::hessian_operator_geometric_scale(
        solution.hessian_op.as_ref(),
    );

    let (steps, psi_gradient, psi_indices) = if has_psi {
        let hybrid = crate::estimate::reml::unified::compute_hybrid_efs_update(
            &solution,
            rho_slice,
            gradient_slice,
        );
        (
            hybrid.steps,
            // Empty ψ outputs are reported as absent rather than as empty
            // containers.
            Some(hybrid.psi_gradient)
                .filter(|values| !values.is_empty())
                .map(Array1::from_vec),
            Some(hybrid.psi_indices).filter(|indices| !indices.is_empty()),
        )
    } else {
        (
            crate::estimate::reml::unified::compute_efs_update(
                &solution,
                rho_slice,
                gradient_slice,
            ),
            None,
            None,
        )
    };

    Ok(crate::solver::outer_strategy::EfsEval {
        cost: result.cost,
        steps,
        beta: Some(solution.beta.clone()),
        psi_gradient,
        psi_indices,
        inner_hessian_scale,
        logdet_enclosure_gap: None,
    })
}

/// Computes the pseudo log-determinant of the penalized joint Hessian
/// `M = H + Sλ (+ H_Φ)` over its own positive eigenspace, together with the
/// spectral trace kernel `M⁺` built from the same eigendecomposition.
///
/// # Arguments
/// * `h_joint_unpen` – source of the unpenalized joint Hessian; it is
///   materialized densely by this function.
/// * `ranges`, `s_lambdas` – forwarded to `add_joint_penalty_to_matrix` to
///   assemble the joint penalty.
/// * `total` – dimension of the joint system.
/// * `hessian_diagonal_ridge` – ridge forwarded to
///   `add_joint_penalty_to_matrix` when assembling `M` (the structural-null
///   gate below assembles the penalty with a zero ridge).
/// * `scaled_jeffreys_hphi` – optional pre-scaled Jeffreys curvature added to
///   `M` before the eigendecomposition.
///
/// # Returns
/// `(logdet, Some(kernel))` where `kernel.u_s` holds the kept eigenvectors of
/// `M` as columns and `kernel.h_proj_inverse` is `diag(1/σ)` over the kept
/// eigenvalues. Returns `(0.0, None)` when `total == 0`, when the assembled
/// penalty has no eigenvalue above its positive threshold, or when `M` has no
/// eigenvalue above its positive threshold.
///
/// # Errors
/// Returns `Err` when the dense budget check or the Hessian materialization
/// fails, when either eigendecomposition fails, or when an eigenvalue array is
/// not contiguous (reported instead of panicking).
fn joint_penalty_subspace_trace_parts(
    h_joint_unpen: &JointHessianSource,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    total: usize,
    hessian_diagonal_ridge: f64,
    // Pre-scaled outer-REML Jeffreys curvature (already multiplied by
    // `rho_curvature_scale` to live in the same scaled space as `s_lambdas`).
    // Folded into `M = H + Sλ (+ H_Φ)` so the projected logdet AND its trace
    // kernel `(H+Sλ+H_Φ)⁺` match the Jeffreys-augmented operator the LAML score
    // runs on. `None` ⇒ byte-identical released projected logdet.
    scaled_jeffreys_hphi: Option<&Array2<f64>>,
) -> Result<(f64, Option<PenaltySubspaceTrace>), String> {
    if total == 0 {
        return Ok((0.0, None));
    }

    // Structural-null gate: with no positive penalty eigenvalue there is no
    // `log|Sλ|₊` term in the LAML ratio, hence no Hessian-side correction to
    // pair with it — the caller keeps the operator's own logdet untouched.
    // (The kernel itself no longer uses the Sλ eigenvectors: since #901 it is
    // the full spectral `M⁺`, built from M's own eigendecomposition below.)
    let mut s_lambda = Array2::<f64>::zeros((total, total));
    add_joint_penalty_to_matrix(&mut s_lambda, ranges, s_lambdas, 0.0, None);
    let s_evals = s_lambda
        .eigh(Side::Lower)
        .map_err(|e| format!("joint penalty subspace eigendecomposition failed: {e}"))?
        .0;
    // Surface a non-contiguous eigenvalue array as an error rather than a
    // panic: this function already reports every other failure via `Err`.
    let s_eval_slice = s_evals
        .as_slice()
        .ok_or_else(|| "joint penalty subspace eigenvalues must be contiguous".to_string())?;
    let s_threshold = positive_eigenvalue_threshold(s_eval_slice);
    // Only "is the rank zero?" matters here, so stop at the first positive
    // eigenvalue instead of counting all of them.
    if !s_eval_slice.iter().any(|&value| value > s_threshold) {
        return Ok((0.0, None));
    }

    // ── REML log|H + Sλ|₊ and its trace kernel over the FULL identifiable
    //    subspace range(H + Sλ) ──────────────────────────────────────────────
    //
    // The REML penalty-determinant term is `½ log|H + Sλ|₊`, and its ρ-gradient
    // is the trace `½ tr((H + Sλ)⁻¹ ∂Sλ/∂ρ)`. BOTH must be taken over
    // range(H + Sλ) — the full identifiable subspace — not over range(Sλ).
    //
    // The previous code projected onto range(Sλ): it computed
    // `log|U_Sᵀ(H+Sλ)U_S| = log|M_rr|` and the kernel `M_rr⁻¹`. That DROPS the
    // determinant of the penalty-null block `M_kk = U_kᵀ H U_k` (on ker(Sλ), Sλ
    // vanishes, so this is pure likelihood curvature) and the Schur coupling
    // between the two. `M_kk` is the unpenalized polynomial trend; on a
    // near-collinear design (admixture-cline PCs at small n) its curvature is
    // large and GROWS as the smooth part is shrunk. Omitting it from
    // `log|H+Sλ|` while `½ log|Sλ|₊` is correctly taken over range(Sλ) makes
    // the ρ-derivative of the REML criterion inconsistent in the marginal
    // block: the outer optimizer drives that block's λ → ∞ chasing a
    // flat-increasing profile (gh#752), the coupled inner joint-Newton can no
    // longer certify stationarity on the now-ill-conditioned trend, and the
    // envelope-theorem outer gradient — valid only at a stationary β̂ — diverges
    // on the coupled (logslope) block while the objective stalls, so ARC never
    // reaches a KKT point.
    //
    // The correct generalized determinant (mgcv's treatment) takes both terms
    // over range(H + Sλ): identical to the ordinary log-det / inverse when
    // H + Sλ is non-singular (the well-posed case), and dropping only the truly
    // unidentified directions ker(H) ∩ ker(Sλ) when it is singular — exactly the
    // directions `½ log|Sλ|₊` also omits, keeping value and gradient consistent.
    //
    // To preserve value/gradient consistency the trace kernel must be the
    // FULL pseudo-inverse `M⁺ = (H+Sλ)⁺` itself, carried in spectral form
    // `(U_M, diag(1/σ_a))` over the kept eigenpairs (#901; supersedes the
    // intermediate #752 realization that reduced `M⁺` to its range(Sλ)
    // block). For penalty-supported drifts `∂Sλ/∂ρ` the two coincide:
    //   tr(M⁺ ∂Sλ) = tr(U_Sᵀ M⁺ U_S · U_Sᵀ ∂Sλ U_S) = ∂_ρ log|H+Sλ|₊.
    // But the joint adaptive/ψ hyper-coordinates trace drifts with
    // null(Sλ) support (basis κ-derivatives, the GLM cubic correction
    // `D_β H[v]` through the intercept column), for which the range(Sλ)
    // reduction silently discards the leaked component while the FD of
    // `log|M|₊` keeps it. `tr(M⁺ Ḣ)` is the exact pseudo-logdet derivative
    // for EVERY drift on a constant-rank stratum (first-order eigenvector
    // motion cancels), so one spectral object serves the whole θ-vector.
    // Value and kernel come from the same eigendecomposition of the same
    // materialized `M` so they cannot drift apart.
    //
    // The #752 fix requires the full identifiable-subspace determinant. There
    // is no lower-dimensional fallback that preserves that objective: the old
    // range(Sλ) reduction is exactly the bug, because it drops the penalty-null
    // likelihood determinant. If the dense path is over budget, fail loudly so
    // the caller can choose a different Hessian representation instead of
    // optimizing a different REML surface.
    ensure_exact_joint_hessian_dense_budget(total, "joint penalty subspace logdet")?;
    let mut m =
        materialize_joint_hessian_source(h_joint_unpen, total, "joint penalty subspace logdet")?;
    add_joint_penalty_to_matrix(&mut m, ranges, s_lambdas, hessian_diagonal_ridge, None);
    if let Some(hphi) = scaled_jeffreys_hphi {
        m += hphi;
    }
    symmetrize_dense_in_place(&mut m);
    let (m_evals, m_evecs) = m.eigh(Side::Lower).map_err(|e| {
        format!("joint penalty subspace full Hessian eigendecomposition failed: {e}")
    })?;
    let m_eval_slice = m_evals.as_slice().ok_or_else(|| {
        "joint penalty subspace full Hessian eigenvalues must be contiguous".to_string()
    })?;
    let m_threshold = positive_eigenvalue_threshold(m_eval_slice);
    let logdet = exact_pseudo_logdet(m_eval_slice, m_threshold);
    // Full Moore–Penrose pseudo-inverse `M⁺` (drop ker(H+Sλ)) in spectral
    // form: kept eigenvectors as the kernel basis, diag(1/σ) as the reduced
    // kernel. In this basis `h_proj_inverse = (U_Mᵀ M U_M)⁻¹ = diag(1/σ)`
    // exactly, so every `PenaltySubspaceTrace` consumer evaluates the one
    // true `tr(M⁺ ·)` / `M⁺`-bilinear — exact for penalty-supported AND
    // null(Sλ)-leaking drifts alike (#901).
    let kept: Vec<usize> = (0..total)
        .filter(|&eig_idx| m_evals[eig_idx] > m_threshold)
        .collect();
    if kept.is_empty() {
        return Ok((0.0, None));
    }
    let r_kept = kept.len();
    let mut u_m = Array2::<f64>::zeros((total, r_kept));
    let mut h_proj_inverse = Array2::<f64>::zeros((r_kept, r_kept));
    for (out_col, &src_col) in kept.iter().enumerate() {
        // Copy the kept eigenvector as a whole column.
        u_m.column_mut(out_col).assign(&m_evecs.column(src_col));
        h_proj_inverse[[out_col, out_col]] = 1.0 / m_evals[src_col];
    }

    Ok((
        logdet,
        Some(PenaltySubspaceTrace {
            u_s: u_m,
            h_proj_inverse,
        }),
    ))
}

/// Shared implementation for the joint exact-Newton and surrogate outer paths.
///
/// Both paths differ only in:
/// - how the joint Hessian source is obtained (exact vs surrogate family methods)
/// - the closure for computing D_β H_L[v] (`compute_dh`)
/// - the closure for computing D²_β H_L[u, v] (`compute_d2h`)
/// - whether a tangent-basis projection is applied to the mode inverse
///
/// This function encapsulates all shared logic: penalty assembly, mode inverse
/// computation, precomputation of joint corrections + second-order traces, and
/// routing through `unified_joint_cost_gradient`.
///
/// # Returns
/// The objective, gradient and outer Hessian produced by
/// `unified_joint_cost_gradient`, bundled with a warm start built from `inner`
/// and the inner convergence flag.
///
/// # Errors
/// Propagates failures from Hessian materialization, operator construction,
/// the projected-logdet computation and `unified_joint_cost_gradient`. After
/// evaluation it also rejects a non-finite objective, gradient or analytic
/// Hessian (`CustomFamilyError::NumericalFailure`) and any gradient / Hessian
/// whose dimension differs from `rho.len()` plus the number of external
/// coordinates (`CustomFamilyError::DimensionMismatch`).
fn joint_outer_evaluate(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    h_joint_unpen: JointHessianSource,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    moderidge: f64,
    extra_logdet_ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    include_logdet_h: bool,
    include_logdet_s: bool,
    strict_spd: bool,
    project_hessian_logdet: bool,
    eval_mode: EvalMode,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    pseudo_logdet_mode: PseudoLogdetMode,
    compute_dh: &DriftDerivFn<'_>,
    compute_dh_many: Option<&DriftDerivManyFn<'_>>,
    compute_d2h: &DriftSecondDerivFn<'_>,
    compute_d2h_many: Option<&DriftSecondDerivManyFn<'_>>,
    owned_compute_dh: Option<
        Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    >,
    owned_compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    owned_compute_d2h: Option<
        Arc<
            dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
                + Send
                + Sync,
        >,
    >,
    owned_compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    ext_bundle: Option<ExtCoordBundle>,
    first_order_trace_skip: Option<Array1<f64>>,
    batched_outer_hessian_operator: Option<
        Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
    >,
    // Universal under-identification robustness (always armed when the family can
    // expose an exact joint Hessian). The
    // outer REML logdet AND its trace derivatives must run on the same
    // Jeffreys-augmented Hessian `H + S_λ + H_Φ` the inner Newton converged on,
    // or the LAML value and its analytic gradient describe different objectives.
    // Folding `H_Φ` into the operator's matvec augments the inverse/logdet, but is
    // NOT by itself sufficient: `H_Φ` depends on ρ THROUGH β̂, so the trace
    // contraction also needs its mode-response drift `D_β H_Φ[v_k]` — supplied
    // separately via `jeffreys_hphi_drift` and folded into the first-order trace
    // by `JeffreysHphiAwareJointDerivatives`. `None` means this evaluation has
    // no active Jeffreys curvature (empty system, unavailable exact derivatives,
    // or the conditioning gate proved the term zero), not a user-selected
    // robustness-off mode.
    //
    // Gated Jeffreys VALUE `Φ(β̂)` paired with the curvature `H_Φ` from the same
    // term evaluation. The value is folded into the LAML cost (`cost −= Φ`) so
    // the outer criterion is the Laplace approximation of the SAME
    // Firth-augmented objective the inner Newton converged on (gam#979).
    robust_jeffreys_phi_hphi: Option<(f64, Array2<f64>)>,
    // Companion mode-response drift `D_β H_Φ[δβ]` for the outer gradient's trace
    // identity. `Some` exactly when `robust_jeffreys_phi_hphi` is `Some` (same
    // under-identified span); installing it wraps the derivative provider so the
    // first-order trace gains the `½ tr[(H+S_λ+H_Φ)⁻¹ D_β H_Φ[v_k]]` term that
    // makes the analytic gradient match the augmented objective. `None` ⇒ the
    // provider is used unwrapped.
    jeffreys_hphi_drift: Option<JeffreysHphiDriftFn>,
) -> Result<OuterObjectiveEvalResult, String> {
    let joint_trace_diagonal_ridge = moderidge + if !strict_spd { extra_logdet_ridge } else { 0.0 };
    let scaled_joint_trace_diagonal_ridge = rho_curvature_scale * joint_trace_diagonal_ridge;

    let (robust_jeffreys_phi, robust_jeffreys_hphi): (Option<f64>, Option<Array2<f64>>) =
        match robust_jeffreys_phi_hphi {
            Some((phi, hphi)) => (Some(phi), Some(hphi)),
            None => (None, None),
        };
    // Pre-scale the outer-REML Jeffreys curvature into the same rescaled space as
    // the penalties so the projected-logdet path and the operator agree. `None`
    // (flag OFF / no under-identified span) keeps the released outer REML exact.
    let scaled_robust_jeffreys_hphi: Option<Array2<f64>> = robust_jeffreys_hphi
        .as_ref()
        .map(|hphi| hphi.mapv(|value| rho_curvature_scale * value));

    // Build derivative provider from the caller-supplied closures.
    let base_provider_box: Box<dyn HessianDerivativeProvider + '_> =
        if let (Some(owned_dh), Some(owned_d2h)) = (owned_compute_dh, owned_compute_d2h) {
            Box::new(OwnedJointDerivProvider {
                compute_dh: owned_dh,
                compute_dh_many: owned_compute_dh_many,
                compute_d2h: owned_d2h,
                compute_d2h_many: owned_compute_d2h_many,
                family_outer_hessian_operator: batched_outer_hessian_operator.clone(),
            })
        } else {
            Box::new(BorrowedJointDerivProvider {
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                family_outer_hessian_operator: batched_outer_hessian_operator.clone(),
            })
        };

    // Install the Jeffreys-`H_Φ` mode-response drift on top of the likelihood
    // drift whenever the Jeffreys term is active. This is the term that makes the
    // analytic outer gradient match the augmented objective `½ log|H+S_λ+H_Φ|`;
    // without it the gradient omits `D_β H_Φ[v_k]` and the line search / KKT
    // certification drifts in exactly the near-separating regime this machinery
    // exists for. `None` ⇒ provider used unwrapped (byte-identical released path).
    let provider_box: Box<dyn HessianDerivativeProvider + '_> = match jeffreys_hphi_drift {
        Some(drift) => Box::new(JeffreysHphiAwareJointDerivatives::new(
            base_provider_box,
            drift,
            total,
        )),
        None => base_provider_box,
    };

    let scaled_s_lambdas: Vec<Array2<f64>> = inner
        .s_lambdas
        .iter()
        .map(|matrix| {
            if rho_curvature_scale == 1.0 {
                matrix.clone()
            } else {
                matrix.mapv(|value| rho_curvature_scale * value)
            }
        })
        .collect();

    let hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator> =
        if use_joint_matrix_free_path(total, joint_observation_count(&inner.block_states)) {
            let ranges_vec = ranges.to_vec();
            let s_lambdas = Arc::new(scaled_s_lambdas.clone());
            // The matrix-free operator carries an extra stability ridge on top
            // of the scaled trace ridge used by the dense path below.
            let trace_diagonal_ridge = scaled_joint_trace_diagonal_ridge
                + rho_curvature_scale * JOINT_TRACE_STABILITY_RIDGE;
            match &h_joint_unpen {
                JointHessianSource::Dense(h_joint) => {
                    let apply_h = Arc::new(h_joint.clone());
                    let apply_ranges = ranges_vec.clone();
                    let apply_s = Arc::clone(&s_lambdas);
                    let apply_hphi = robust_jeffreys_hphi.clone();
                    let hphi_scale = rho_curvature_scale;
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut out = apply_h.dot(v);
                            let penalty = apply_joint_block_penalty(
                                &apply_ranges,
                                apply_s.as_ref(),
                                v,
                                trace_diagonal_ridge,
                                None,
                            );
                            out += &penalty;
                            if let Some(hphi) = apply_hphi.as_ref() {
                                let jeffreys = hphi.dot(v);
                                out.scaled_add(hphi_scale, &jeffreys);
                            }
                            out
                        },
                        pseudo_logdet_mode,
                    ))
                }
                JointHessianSource::Operator { apply, .. } => {
                    let apply_h = Arc::clone(apply);
                    let apply_ranges = ranges_vec.clone();
                    let apply_s = Arc::clone(&s_lambdas);
                    let apply_hphi = robust_jeffreys_hphi.clone();
                    let hphi_scale = rho_curvature_scale;
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            // A failed matvec is logged and replaced by an
                            // all-NaN vector; the NaN then flows into whatever
                            // consumes this operator's output.
                            let mut out = match apply_h(v) {
                                Ok(out) => out,
                                Err(error) => {
                                    log::warn!(
                                        "joint exact-newton operator matvec failed during outer trace construction: {error}"
                                    );
                                    Array1::<f64>::from_elem(total, f64::NAN)
                                }
                            };
                            let penalty = apply_joint_block_penalty(
                                &apply_ranges,
                                apply_s.as_ref(),
                                v,
                                trace_diagonal_ridge,
                                None,
                            );
                            out += &penalty;
                            if let Some(hphi) = apply_hphi.as_ref() {
                                let jeffreys = hphi.dot(v);
                                out.scaled_add(hphi_scale, &jeffreys);
                            }
                            out
                        },
                        pseudo_logdet_mode,
                    ))
                }
            }
        } else {
            let mut j_for_traces = materialize_joint_hessian_source(
                &h_joint_unpen,
                total,
                "joint exact-newton Hessian materialization",
            )?;
            add_joint_penalty_to_matrix(
                &mut j_for_traces,
                ranges,
                &scaled_s_lambdas,
                scaled_joint_trace_diagonal_ridge,
                None,
            );
            if let Some(hphi) = robust_jeffreys_hphi.as_ref() {
                j_for_traces.scaled_add(rho_curvature_scale, hphi);
            }
            Arc::new(
                BlockCoupledOperator::from_joint_hessian_with_mode(
                    &j_for_traces,
                    pseudo_logdet_mode,
                )
                .map_err(|e| format!("BlockCoupledOperator from joint Hessian: {e}"))?,
            )
        };

    let (projected_logdet_correction, penalty_subspace_trace) = if project_hessian_logdet
        && include_logdet_h
        && include_logdet_s
        && pseudo_logdet_mode == PseudoLogdetMode::Smooth
    {
        let (projected_logdet, kernel) = joint_penalty_subspace_trace_parts(
            &h_joint_unpen,
            ranges,
            &scaled_s_lambdas,
            total,
            scaled_joint_trace_diagonal_ridge,
            scaled_robust_jeffreys_hphi.as_ref(),
        )?;
        // Expressed as a correction relative to the operator's own logdet.
        let correction = projected_logdet - hessian_op.logdet();
        if kernel.is_some() {
            log::debug!(
                "[OUTER hessian-route] joint penalty subspace trace installed correction={:.6e}",
                correction
            );
        }
        (correction, kernel.map(Arc::new))
    } else {
        (0.0, None)
    };
    let hessian_logdet_correction = hessian_logdet_correction + projected_logdet_correction;

    // Captured before `ext_bundle` is consumed by the call below.
    let expected_theta_dim = rho.len()
        + ext_bundle
            .as_ref()
            .map_or(0, |bundle| bundle.coords.len());

    // Option C: when the caller already has the batched first-order
    // logdet traces, let the unified VGH path keep all mode-response,
    // second-order, and Hessian work, but short-circuit only the
    // soon-discarded first-order trace calls. The projected-subspace
    // trace path is left untouched because the Hessian shares that
    // kernel and it is not routed through HessianOperator trace methods.
    // The gate is resolved exactly once, here, before
    // `penalty_subspace_trace` is moved into the call below.
    let first_order_trace_skip = if penalty_subspace_trace.is_some() {
        None
    } else {
        first_order_trace_skip
    };
    let (objective, grad, outer_hessian) = unified_joint_cost_gradient(
        inner,
        specs,
        per_block,
        rho,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        provider_box,
        eval_mode,
        ext_bundle.map(|bundle| bundle.scaled(rho_curvature_scale)),
        first_order_trace_skip,
        robust_jeffreys_phi,
    )?;
    if !objective.is_finite() {
        log::warn!(
            "joint outer evaluation produced non-finite objective: log_likelihood={} penalty_value={} block_logdet_h={} block_logdet_s={} include_logdet_h={} include_logdet_s={} rho_curvature_scale={}",
            inner.log_likelihood,
            inner.penalty_value,
            inner.block_logdet_h,
            inner.block_logdet_s,
            include_logdet_h,
            include_logdet_s,
            rho_curvature_scale,
        );
        return Err(CustomFamilyError::NumericalFailure {
            reason: "joint outer evaluation produced a non-finite objective".to_string(),
        }
        .into());
    }
    if grad.iter().any(|value| !value.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: "joint outer evaluation produced a non-finite gradient".to_string(),
        }
        .into());
    }
    if grad.len() != expected_theta_dim {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "joint outer evaluation returned gradient length {}, expected {}",
                grad.len(),
                expected_theta_dim
            ),
        }
        .into());
    }
    match &outer_hessian {
        crate::solver::outer_strategy::HessianResult::Analytic(hessian) => {
            if hessian.iter().any(|value| !value.is_finite()) {
                return Err(CustomFamilyError::NumericalFailure {
                    reason: "joint outer evaluation produced a non-finite Hessian".to_string(),
                }
                .into());
            }
            if hessian.nrows() != expected_theta_dim || hessian.ncols() != expected_theta_dim {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "joint outer evaluation returned Hessian shape {}x{}, expected {}x{}",
                        hessian.nrows(),
                        hessian.ncols(),
                        expected_theta_dim,
                        expected_theta_dim
                    ),
                }
                .into());
            }
        }
        crate::solver::outer_strategy::HessianResult::Operator(op) => {
            // Reported through the same typed error as the gradient and
            // analytic-Hessian dimension checks above.
            if op.dim() != expected_theta_dim {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "joint outer evaluation returned operator Hessian dim {}, expected {}",
                        op.dim(),
                        expected_theta_dim
                    ),
                }
                .into());
            }
        }
        crate::solver::outer_strategy::HessianResult::Unavailable => {}
    }

    let warm = ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta: inner
            .block_states
            .iter()
            .map(|st| st.beta.clone())
            .collect(),
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(inner)),
    };

    Ok(OuterObjectiveEvalResult {
        objective,
        gradient: grad,
        outer_hessian,
        warm_start: warm,
        inner_converged: inner.converged,
    })
}

/// Fixed-point (EFS) variant of the joint outer evaluation.
///
/// Builds the derivative provider from the supplied closures, rescales the
/// block penalties by `rho_curvature_scale`, constructs the penalized joint
/// Hessian operator (matrix-free or dense, as chosen by
/// `use_joint_matrix_free_path`), optionally computes the projected-logdet
/// correction and its trace kernel, and hands everything to
/// `unified_joint_efs_eval`.
///
/// No Jeffreys curvature is involved on this path: the projected logdet is
/// requested with `None` for that term and the operator applies only the
/// Hessian plus penalty.
///
/// # Errors
/// Propagates failures from Hessian materialization, operator construction,
/// the projected-logdet computation and `unified_joint_efs_eval`.
fn joint_outer_evaluate_efs(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    h_joint_unpen: JointHessianSource,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    moderidge: f64,
    extra_logdet_ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    include_logdet_h: bool,
    include_logdet_s: bool,
    strict_spd: bool,
    project_hessian_logdet: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    pseudo_logdet_mode: PseudoLogdetMode,
    compute_dh: &DriftDerivFn<'_>,
    compute_dh_many: Option<&DriftDerivManyFn<'_>>,
    compute_d2h: &DriftSecondDerivFn<'_>,
    compute_d2h_many: Option<&DriftSecondDerivManyFn<'_>>,
    owned_compute_dh: Option<
        Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    >,
    owned_compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    owned_compute_d2h: Option<
        Arc<
            dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
                + Send
                + Sync,
        >,
    >,
    owned_compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    ext_bundle: Option<ExtCoordBundle>,
) -> Result<crate::solver::outer_strategy::EfsEval, String> {
    // The extra logdet ridge only participates when strict SPD is off.
    let logdet_ridge_share = if strict_spd { 0.0 } else { extra_logdet_ridge };
    let scaled_trace_ridge = rho_curvature_scale * (moderidge + logdet_ridge_share);

    // Prefer the owned closures when both first- and second-order versions
    // were supplied; otherwise fall back to the borrowed ones.
    let derivative_provider: Box<dyn HessianDerivativeProvider + '_> =
        match (owned_compute_dh, owned_compute_d2h) {
            (Some(owned_dh), Some(owned_d2h)) => Box::new(OwnedJointDerivProvider {
                compute_dh: owned_dh,
                compute_dh_many: owned_compute_dh_many,
                compute_d2h: owned_d2h,
                compute_d2h_many: owned_compute_d2h_many,
                family_outer_hessian_operator: None,
            }),
            _ => Box::new(BorrowedJointDerivProvider {
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                family_outer_hessian_operator: None,
            }),
        };

    // Penalties expressed in the rescaled curvature space; a unit scale is a
    // plain copy.
    let needs_rescale = rho_curvature_scale != 1.0;
    let scaled_s_lambdas: Vec<Array2<f64>> = inner
        .s_lambdas
        .iter()
        .map(|penalty| {
            if needs_rescale {
                penalty.mapv(|entry| rho_curvature_scale * entry)
            } else {
                penalty.clone()
            }
        })
        .collect();

    let matrix_free =
        use_joint_matrix_free_path(total, joint_observation_count(&inner.block_states));
    let hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator> =
        if matrix_free {
            let block_ranges = ranges.to_vec();
            let shared_penalties = Arc::new(scaled_s_lambdas.clone());
            // Matrix-free operators add a stability ridge on top of the scaled
            // trace ridge.
            let operator_ridge =
                scaled_trace_ridge + rho_curvature_scale * JOINT_TRACE_STABILITY_RIDGE;
            match &h_joint_unpen {
                JointHessianSource::Dense(dense_hessian) => {
                    let dense_hessian = Arc::new(dense_hessian.clone());
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut product = dense_hessian.dot(v);
                            product += &apply_joint_block_penalty(
                                &block_ranges,
                                shared_penalties.as_ref(),
                                v,
                                operator_ridge,
                                None,
                            );
                            product
                        },
                        pseudo_logdet_mode,
                    ))
                }
                JointHessianSource::Operator { apply, .. } => {
                    let hessian_matvec = Arc::clone(apply);
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            // A failed matvec is logged and replaced by NaNs.
                            let mut product = hessian_matvec(v).unwrap_or_else(|error| {
                                log::warn!(
                                    "joint exact-newton operator matvec failed during fixed-point trace construction: {error}"
                                );
                                Array1::<f64>::from_elem(total, f64::NAN)
                            });
                            product += &apply_joint_block_penalty(
                                &block_ranges,
                                shared_penalties.as_ref(),
                                v,
                                operator_ridge,
                                None,
                            );
                            product
                        },
                        pseudo_logdet_mode,
                    ))
                }
            }
        } else {
            let mut penalized_hessian = materialize_joint_hessian_source(
                &h_joint_unpen,
                total,
                "joint exact-newton Hessian materialization for fixed-point evaluation",
            )?;
            add_joint_penalty_to_matrix(
                &mut penalized_hessian,
                ranges,
                &scaled_s_lambdas,
                scaled_trace_ridge,
                None,
            );
            let dense_operator = BlockCoupledOperator::from_joint_hessian_with_mode(
                &penalized_hessian,
                pseudo_logdet_mode,
            )
            .map_err(|e| format!("BlockCoupledOperator from joint Hessian: {e}"))?;
            Arc::new(dense_operator)
        };

    // The projected logdet is only meaningful when both logdet terms are part
    // of the criterion and the smooth pseudo-logdet mode is selected.
    let wants_projected_logdet = project_hessian_logdet
        && include_logdet_h
        && include_logdet_s
        && pseudo_logdet_mode == PseudoLogdetMode::Smooth;
    let (projected_logdet_correction, penalty_subspace_trace) = if wants_projected_logdet {
        let (projected_logdet, trace_kernel) = joint_penalty_subspace_trace_parts(
            &h_joint_unpen,
            ranges,
            &scaled_s_lambdas,
            total,
            scaled_trace_ridge,
            None,
        )?;
        // Stored as the difference from the operator's own logdet.
        let correction = projected_logdet - hessian_op.logdet();
        if trace_kernel.is_some() {
            log::debug!(
                "[OUTER hessian-route] joint EFS penalty subspace trace installed correction={:.6e}",
                correction
            );
        }
        (correction, trace_kernel.map(Arc::new))
    } else {
        (0.0, None)
    };
    let total_logdet_correction = hessian_logdet_correction + projected_logdet_correction;
    let scaled_ext_bundle = ext_bundle.map(|bundle| bundle.scaled(rho_curvature_scale));

    unified_joint_efs_eval(
        inner,
        specs,
        per_block,
        rho,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        total_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        derivative_provider,
        scaled_ext_bundle,
    )
}

/// Evaluate the custom-family outer objective over the smoothing vector `rho`
/// only.
///
/// One empty ψ-derivative list is created per entry of `specs`, so no external
/// ψ coordinates are attached, and every other argument is forwarded unchanged
/// to `evaluate_custom_family_hyper_internal`.
///
/// # Errors
///
/// Any error returned by `evaluate_custom_family_hyper_internal` is converted
/// to a `String` with `String::from` and returned.
fn outerobjectivegradienthessian_internal<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    // The rho-only surface: each parameter block contributes zero ψ axes.
    let no_psi_axes: Vec<Vec<CustomFamilyBlockPsiDerivative>> =
        std::iter::repeat_with(Vec::new).take(specs.len()).collect();

    let outcome = evaluate_custom_family_hyper_internal(
        family,
        specs,
        options,
        penalty_counts,
        rho,
        &no_psi_axes,
        warm_start,
        rho_prior,
        eval_mode,
    );

    match outcome {
        Ok(evaluation) => Ok(evaluation),
        Err(err) => Err(String::from(err)),
    }
}

/// Run the inner blockwise fit at `rho` and assemble the EFS (fixed-point)
/// outer evaluation for a custom family.
///
/// Steps, in order:
///
/// 1. `rho` is split per block with `split_log_lambdas` and the inner solve is
///    run with `inner_blockwise_fit` (optionally warm-started).
/// 2. If the inner solve did not converge, a warning is logged and the result
///    of `nonconverged_outer_efs_result` is returned immediately; no derivative
///    assembly is attempted.
/// 3. Otherwise the block linear predictors are refreshed and the evaluation is
///    delegated to `joint_outer_evaluate_efs`, either
///    * with the closures from `build_joint_hessian_closures` when that returns
///      `Some` (joint path), or
///    * with locally built `compute_dh` / `compute_d2h` closures for a single
///      block (generic fallback).
///
/// On success the tuple holds the EFS evaluation, a `ConstrainedWarmStart`
/// built from the inner solution (per-block `beta`, active sets and the cached
/// inner mode), and `inner.converged`. That flag is `true` whenever the
/// converged path is taken, because the non-converged case returns earlier.
///
/// # Errors
///
/// Propagates errors from the inner fit, eta refresh, family callbacks and
/// `joint_outer_evaluate_efs`. The generic fallback additionally fails when
/// the family reports `requires_joint_outer_hyper_path()`, when there is more
/// than one block, on Hessian / weight-derivative shape mismatches, or when a
/// required derivative callback returns `None`.
fn outerobjectiveefs<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
) -> Result<
    (
        crate::solver::outer_strategy::EfsEval,
        ConstrainedWarmStart,
        bool,
    ),
    String,
> {
    let include_logdet_h = include_exact_newton_logdet_h(family, options);
    let include_logdet_s = include_exact_newton_logdet_s(family, options);
    let strict_spd = use_exact_newton_strict_spd(family);
    let per_block = split_log_lambdas(rho, penalty_counts)?;
    let mut inner = inner_blockwise_fit(family, specs, &per_block, options, warm_start)?;
    if !inner.converged {
        // Derivatives at a non-converged inner point are not assembled; hand the
        // situation to the dedicated non-converged result builder instead.
        log::warn!(
            "[OUTER] custom-family EFS inner solve did not converge after {} cycle(s); \
             skipping EFS derivative assembly for theta_dim={}",
            inner.cycles,
            rho.len(),
        );
        return nonconverged_outer_efs_result(
            &inner,
            rho,
            rho.len(),
            include_logdet_h,
            include_logdet_s,
            "custom-family EFS non-converged inner solve",
        );
    }
    let ridge = effective_solverridge(options.ridge_floor);
    // Ridge that participates in the quadratic penalty (zero when the policy
    // excludes it).
    let moderidge = if options.ridge_policy.include_quadratic_penalty {
        ridge
    } else {
        0.0
    };
    // Ridge that enters only the penalty log-determinant: used when the policy
    // includes it there but not in the quadratic penalty.
    let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
        && !options.ridge_policy.include_quadratic_penalty
    {
        ridge
    } else {
        0.0
    };

    refresh_all_block_etas(family, specs, &mut inner.block_states)?;
    let ranges = block_param_ranges(specs);
    // Total flattened coefficient count = end of the last block range (0 if
    // there are no blocks).
    let total = ranges.last().map(|(_, end)| *end).unwrap_or(0);

    let efs_eval = {
        if let Some(joint_bundle) = build_joint_hessian_closures(
            family,
            &inner.block_states,
            specs,
            total,
            options,
            inner.joint_workspace.clone(),
        )? {
            // Joint path: the family supplied a joint Hessian source together
            // with its directional-derivative closures.
            let JointHessianBundle {
                source: h_joint_unpen,
                beta_flat,
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                owned_compute_dh,
                owned_compute_dh_many,
                owned_compute_d2h,
                owned_compute_d2h_many,
                rho_curvature_scale,
                hessian_logdet_correction,
            } = joint_bundle;
            joint_outer_evaluate_efs(
                &inner,
                specs,
                &per_block,
                rho,
                &beta_flat,
                h_joint_unpen,
                &ranges,
                total,
                ridge,
                moderidge,
                extra_logdet_ridge,
                rho_curvature_scale,
                hessian_logdet_correction,
                include_logdet_h,
                include_logdet_s,
                strict_spd,
                family.use_projected_penalty_logdet(),
                options,
                // `rho_prior` is owned and used exactly once per branch, so it
                // is moved rather than cloned.
                rho_prior,
                family.pseudo_logdet_mode(),
                compute_dh.as_ref(),
                compute_dh_many.as_deref(),
                compute_d2h.as_ref(),
                compute_d2h_many.as_deref(),
                owned_compute_dh,
                owned_compute_dh_many,
                owned_compute_d2h,
                owned_compute_d2h_many,
                None,
            )
        } else {
            // Generic fallback: only valid for a single block whose family does
            // not insist on the joint path.
            if family.requires_joint_outer_hyper_path() {
                return Err(
                        "outer hyper fixed-point evaluation requires a joint exact path for this family"
                            .to_string(),
                    );
            }
            if specs.len() != 1 {
                return Err(
                        "generic fixed-point outer fallback is only valid for single-block families; multi-block families must provide a joint outer path"
                            .to_string(),
                    );
            }

            let eval = family.evaluate(&inner.block_states)?;
            let block_idx = 0;
            let spec = &specs[block_idx];
            let work = &eval.blockworking_sets[block_idx];
            let p = spec.design.ncols();
            // Filled in by the `Diagonal` arm below so the derivative closures
            // can reuse the design that was actually used for the Hessian.
            let mut diagonal_design = None::<DesignMatrix>;
            let h_joint_unpen = match work {
                BlockWorkingSet::Diagonal {
                    working_response: _,
                    working_weights,
                } => with_block_geometry(
                    family,
                    &inner.block_states,
                    spec,
                    block_idx,
                    |x_dyn, _| {
                        // Unpenalized Hessian from the weighted normal
                        // equations with floored working weights.
                        let w = floor_positiveworking_weights(working_weights, options.minweight);
                        let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
                        diagonal_design = Some(x_dyn.clone());
                        Ok(xtwx)
                    },
                )?,
                BlockWorkingSet::ExactNewton {
                    gradient: _,
                    hessian,
                } => {
                    if hessian.nrows() != p || hessian.ncols() != p {
                        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                            "block {block_idx} exact-newton Hessian shape mismatch in fixed-point outer evaluation: got {}x{}, expected {}x{}",
                            hessian.nrows(),
                            hessian.ncols(),
                            p,
                            p
                        ) }.into());
                    }
                    hessian.to_dense()
                }
            };
            let beta_flat = inner.block_states[block_idx].beta.clone();
            // First directional derivative of the unpenalized Hessian along
            // `direction`. Returns `Ok(None)` when the log|H| term is excluded.
            let compute_dh = |direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
                if !include_logdet_h {
                    return Ok(None);
                }
                match work {
                    BlockWorkingSet::ExactNewton { .. } => {
                        match family.exact_newton_hessian_directional_derivative(
                            &inner.block_states,
                            block_idx,
                            direction,
                        )? {
                            Some(h_exact) => {
                                Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                                    h_exact,
                                    p,
                                    &format!(
                                        "block {block_idx} exact-newton dH shape mismatch in fixed-point outer evaluation"
                                    ),
                                )?)))
                            }
                            None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                "missing exact-newton dH callback for block {block_idx} while fixed-point evaluation requires H_beta term"
                            ) }.into()),
                        }
                    }
                    BlockWorkingSet::Diagonal {
                        working_response: _,
                        working_weights,
                    } => {
                        let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
                                    format!(
                                        "missing dynamic design for block {block_idx} diagonal fixed-point correction"
                                    )
                                })?;
                        let wwork =
                            floor_positiveworking_weights(working_weights, options.minweight);
                        let x_dense = x_dyn.to_dense();
                        let n = x_dense.nrows();

                        // Directional change of the linear predictor; geometry
                        // contributions (offset / design motion) are added below.
                        let mut d_eta = x_dyn.matrixvectormultiply(direction);
                        let geom = family.block_geometry_directional_derivative(
                            &inner.block_states,
                            block_idx,
                            spec,
                            direction,
                        )?;
                        let mut correction_mat = Array2::<f64>::zeros((p, p));

                        if let Some(geom_dir) = geom {
                            d_eta += &geom_dir.d_offset;
                            if let Some(dx) = geom_dir.d_design {
                                d_eta += &fast_av(&dx, &beta_flat);
                                // Row-scale X and dX by the working weights, then
                                // add the two design-motion cross terms
                                // (presumably dXᵀ·W·X + Xᵀ·W·dX, assuming
                                // `fast_atb(a, b)` forms aᵀb — TODO confirm).
                                let mut wx = x_dense.clone();
                                let mut wdx = dx.clone();
                                ndarray::Zip::from(wx.rows_mut())
                                    .and(wdx.rows_mut())
                                    .and(wwork.view())
                                    .par_for_each(|mut wxr, mut wdxr, &wi| {
                                        // Unit weights leave the row unchanged.
                                        if wi != 1.0 {
                                            wxr.mapv_inplace(|v| v * wi);
                                            wdxr.mapv_inplace(|v| v * wi);
                                        }
                                    });
                                correction_mat += &fast_atb(&dx, &wx);
                                correction_mat += &fast_atb(&x_dense, &wdx);
                            }
                        }

                        let dw = family
                                    .diagonalworking_weights_directional_derivative(
                                        &inner.block_states,
                                        block_idx,
                                        &d_eta,
                                    )?
                                    .ok_or_else(|| {
                                        format!(
                                            "missing diagonal dW callback for block {block_idx} while fixed-point evaluation requires H_beta term"
                                        )
                                    })?;
                        if dw.len() != n {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "block {block_idx} diagonal dW length mismatch in fixed-point outer evaluation: got {}, expected {}",
                                dw.len(),
                                n
                            ) }.into());
                        }
                        // Weight-derivative term: X against X with rows scaled
                        // by dW.
                        let mut scaled_x = x_dense.clone();
                        ndarray::Zip::from(scaled_x.rows_mut())
                            .and(&dw)
                            .par_for_each(|mut sr, &dwi| sr.mapv_inplace(|v| v * dwi));
                        correction_mat += &fast_atb(&x_dense, &scaled_x);

                        Ok(Some(DriftDerivResult::Dense(correction_mat)))
                    }
                }
            };
            // Second directional derivative of the unpenalized Hessian along
            // `(u, v)`. Returns `Ok(None)` when the log|H| term is excluded.
            let compute_d2h = |u: &Array1<f64>,
                               v: &Array1<f64>|
             -> Result<Option<DriftDerivResult>, String> {
                if !include_logdet_h {
                    return Ok(None);
                }
                match work {
                    BlockWorkingSet::ExactNewton { .. } => {
                        match family.exact_newton_hessian_second_directional_derivative(
                            &inner.block_states,
                            block_idx,
                            u,
                            v,
                        )? {
                            Some(h_exact) => {
                                Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                                    h_exact,
                                    p,
                                    &format!(
                                        "block {block_idx} exact-newton d2H shape mismatch in fixed-point outer evaluation"
                                    ),
                                )?)))
                            }
                            None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                "missing exact-newton d2H callback for block {block_idx} while fixed-point evaluation requires H_beta_beta term"
                            ) }.into()),
                        }
                    }
                    BlockWorkingSet::Diagonal { .. } => {
                        let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
                            format!(
                                "missing dynamic design for block {block_idx} diagonal fixed-point second correction"
                            )
                        })?;
                        let x_dense = x_dyn.to_dense();
                        let n = x_dense.nrows();
                        // This path has no second-order geometry terms, so any
                        // direction with a design derivative or a non-zero
                        // offset derivative is rejected rather than silently
                        // dropped.
                        let reject_second_order_geometry =
                            |label: &str,
                             geom: Option<BlockGeometryDirectionalDerivative>|
                             -> Result<(), String> {
                                if let Some(geom_dir) = geom {
                                    let has_offset =
                                        geom_dir.d_offset.iter().any(|value| *value != 0.0);
                                    if geom_dir.d_design.is_some() || has_offset {
                                        return Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                            "block {block_idx} diagonal d2H requires second-order block-geometry derivatives for {label}; use an exact-newton or joint outer path"
                                        ) }.into());
                                    }
                                }
                                Ok(())
                            };
                        reject_second_order_geometry(
                            "first direction",
                            family.block_geometry_directional_derivative(
                                &inner.block_states,
                                block_idx,
                                spec,
                                u,
                            )?,
                        )?;
                        reject_second_order_geometry(
                            "second direction",
                            family.block_geometry_directional_derivative(
                                &inner.block_states,
                                block_idx,
                                spec,
                                v,
                            )?,
                        )?;
                        let d_eta_u = x_dyn.matrixvectormultiply(u);
                        let d_eta_v = x_dyn.matrixvectormultiply(v);
                        let d2w = family
                            .diagonalworking_weights_second_directional_derivative(
                                &inner.block_states,
                                block_idx,
                                &d_eta_u,
                                &d_eta_v,
                            )?
                            .ok_or_else(|| {
                                format!(
                                    "missing diagonal d2W callback for block {block_idx} while fixed-point evaluation requires H_beta_beta term"
                                )
                            })?;
                        if d2w.len() != n {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "block {block_idx} diagonal d2W length mismatch in fixed-point outer evaluation: got {}, expected {}",
                                d2w.len(),
                                n
                            ) }.into());
                        }
                        // X against X with rows scaled by the second weight
                        // derivative.
                        let mut scaled_x = x_dense.clone();
                        ndarray::Zip::from(scaled_x.rows_mut())
                            .and(&d2w)
                            .par_for_each(|mut sr, &d2wi| sr.mapv_inplace(|value| value * d2wi));
                        Ok(Some(DriftDerivResult::Dense(fast_atb(&x_dense, &scaled_x))))
                    }
                }
            };
            // Fallback call: unit curvature scale, zero Hessian log-det
            // correction, and no batched / owned derivative closures.
            joint_outer_evaluate_efs(
                &inner,
                specs,
                &per_block,
                rho,
                &beta_flat,
                JointHessianSource::Dense(h_joint_unpen),
                &ranges,
                total,
                ridge,
                moderidge,
                extra_logdet_ridge,
                1.0,
                0.0,
                include_logdet_h,
                include_logdet_s,
                strict_spd,
                family.use_projected_penalty_logdet(),
                options,
                // Moved, not cloned: this is the only use on this branch.
                rho_prior,
                family.pseudo_logdet_mode(),
                &compute_dh,
                None,
                &compute_d2h,
                None,
                None,
                None,
                None,
                None,
                None,
            )
        }
    }?;

    // Snapshot the converged inner solution so the next outer evaluation can
    // warm-start from it.
    let warm = ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta: inner
            .block_states
            .iter()
            .map(|state| state.beta.clone())
            .collect(),
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(&inner)),
    };

    Ok((efs_eval, warm, inner.converged))
}

/// Strip the custom-family "invalid input" wrappers from an error message so
/// summaries show the underlying reason once.
///
/// A `String` that round-trips through `CustomFamilyError::From<String>` is
/// re-wrapped as `InvalidInput { context: "custom-family string boundary", … }`
/// and `Display`s with the "string boundary" prefix below. That prefix is
/// removed first (at most once), then the historical bare
/// `custom-family invalid input: ` prefix (at most once). Messages carrying
/// neither prefix are returned unchanged.
fn normalize_outer_eval_error_detail(error: &str) -> &str {
    // Order matters: the boundary wrapper is the outer layer, the bare form is
    // what may remain underneath it.
    const WRAPPER_PREFIXES: [&str; 2] = [
        "custom-family invalid input in custom-family string boundary: ",
        "custom-family invalid input: ",
    ];

    WRAPPER_PREFIXES.iter().fold(error, |detail, prefix| {
        match detail.strip_prefix(*prefix) {
            Some(inner_reason) => inner_reason,
            None => detail,
        }
    })
}

// ═══════════════════════════════════════════════════════════════════════════
//  Section: joint outer hyper surface — unified calculus for [rho, psi]
// ═══════════════════════════════════════════════════════════════════════════
//
// The callers have already applied the current spatial coordinates `psi` when
// constructing `family`, `specs`, and `derivative_blocks`, so the explicit
// input into the section below is still only the smoothing vector
// `rho_current`. Mathematically, however, the surface being differentiated
// is the full joint profiled/Laplace objective in
//
//     theta = [rho, psi].
//
// The exact outer calculus is unified across all hypercoordinates:
//
//     J(theta)
//     = V(beta^(theta), theta)
//       + 0.5 log|H(beta^(theta), theta)|
//       - 0.5 log|S(theta)|_+,
//
// with stationarity and joint curvature
//
//     F(beta, theta) := V_beta(beta, theta) = 0,
//     H(beta, theta) := V_beta_beta(beta, theta).
//
// For each theta_i we need the fixed-beta objects
//
//     V_i, g_i := F_i, H_i,
//
// and for each pair (i, j)
//
//     V_ij, g_ij, H_ij,
//
// together with the beta-curvature contractions
//
//     D_beta H[u], D_beta^2 H[u, v], T_i[u] := D_beta H_i[u].
//
// These determine the exact joint mode responses
//
//     beta_i  = -H^{-1} g_i,
//     beta_ij = -H^{-1}(g_ij + H_i beta_j + H_j beta_i + D_beta H[beta_i] beta_j),
//
// and the total Hessian drifts
//
//     dot H_i
//     = H_i + D_beta H[beta_i],
//
//     ddot H_ij
//     = H_ij
//       + T_i[beta_j]
//       + T_j[beta_i]
//       + D_beta H[beta_ij]
//       + D_beta^2 H[beta_i, beta_j].
//
// Therefore the exact joint outer derivatives are
//
//     J_i
//     = V_i
//       + 0.5 tr(H^{-1} dot H_i)
//       - 0.5 partial_i log|S(theta)|_+,
//
//     J_ij
//     = (V_ij - g_i^T H^{-1} g_j)
//       + 0.5 [ tr(H^{-1} ddot H_ij)
//               - tr(H^{-1} dot H_j H^{-1} dot H_i) ]
//       - 0.5 partial^2_{ij} log|S(theta)|_+.
//
// In this unified view rho and psi differ only in the likelihood-side
// fixed-beta derivative objects contributed by the family. The generic exact
// assembler always adds realized penalty motion through `S(theta)` for every
// hypercoordinate:
//
// - `rho` coordinates usually have zero likelihood-side objects and pick up
//   their fixed-beta derivatives entirely from `S_rho` / `S_{rho rho}`
// - `psi` coordinates contribute likelihood-side objects from the family's
//   joint exact psi hooks and may also pick up extra penalty terms through
//   `S_psi`, `S_{rho psi}`, and `S_{psi psi}` when realized penalties move
//   with `psi`
//
// The implementation below follows this unified calculus directly. Once a
// family supplies the joint fixed-beta psi objects and the mixed
// `D_beta H_psi[u]` contraction, exact joint hyper evaluation treats `rho`
// and `psi` identically and returns the full profiled/Laplace Hessian over
// `theta = [rho, psi]`.
//
// ═══════════════════════════════════════════════════════════════════════════
//  Unified HyperCoord builders for ψ coordinates
// ═══════════════════════════════════════════════════════════════════════════

/// Assemble the penalty derivative matrix S_ψ = Σ_k exp(ρ_k) ∂S_k/∂ψ
/// in the *block-local* coefficient space (p_block × p_block).
///
/// The first representation present on `deriv` is used, checked in this order:
///
/// 1. `s_psi_penalty_components`: every `(penalty_idx, s_part)` pair is
///    accumulated into a zero `p_block × p_block` matrix through
///    `s_part.add_scaled_to(exp(ρ[penalty_idx]), …)`.
/// 2. `s_psi_components`: every `(penalty_idx, s_part)` pair is accumulated
///    with `scaled_add` using the same `exp(ρ[penalty_idx])` weight.
/// 3. `penalty_index`: the stored `deriv.s_psi` is scaled by that penalty's
///    current `exp(ρ)`. The result has the shape of `deriv.s_psi`.
/// 4. None of the above: a zero `p_block × p_block` matrix (the ψ coordinate
///    does not move any realized penalty).
///
/// # Panics
///
/// Panics if a penalty index is out of bounds for `per_block_rho`.
fn assemble_block_local_s_psi(
    deriv: &CustomFamilyBlockPsiDerivative,
    per_block_rho: &Array1<f64>,
    p_block: usize,
) -> Array2<f64> {
    if let Some(ref components) = deriv.s_psi_penalty_components {
        let mut s = Array2::<f64>::zeros((p_block, p_block));
        for (penalty_idx, s_part) in components {
            s_part.add_scaled_to(per_block_rho[*penalty_idx].exp(), &mut s);
        }
        return s;
    }

    if let Some(ref components) = deriv.s_psi_components {
        let mut s = Array2::<f64>::zeros((p_block, p_block));
        for (penalty_idx, s_part) in components {
            s.scaled_add(per_block_rho[*penalty_idx].exp(), s_part);
        }
        return s;
    }

    if let Some(penalty_idx) = deriv.penalty_index {
        // λ is constant across the matrix: evaluate the bounds-checked lookup
        // and `exp` once instead of once per element inside `mapv`.
        let lambda = per_block_rho[penalty_idx].exp();
        return deriv.s_psi.mapv(|v| lambda * v);
    }

    Array2::<f64>::zeros((p_block, p_block))
}

/// Assemble the second penalty derivative matrix S_{ψ_i ψ_j} in block-local
/// coefficient space.
///
/// This mirrors the psi/psi branch of `joint_theta_penaltysecond_matrix` but
/// returns the block-local matrix directly instead of embedding it into the
/// full flattened coefficient space.
///
/// `local_j` selects the partner ψ axis inside the per-axis collections stored
/// on `deriv_i`. The first representation present on `deriv_i` is used:
///
/// 1. `s_psi_psi_penalty_components`: the `(penalty_idx, s_part)` pairs at
///    `local_j` are accumulated via `add_scaled_to` with weight
///    `exp(ρ[penalty_idx])`; a missing `local_j` entry yields zeros.
/// 2. `s_psi_psi_components`: same, accumulated with `scaled_add`.
/// 3. `s_psi_psi`: the matrix at `local_j` scaled by `exp(ρ[penalty_index])`
///    when both that entry and `deriv_i.penalty_index` exist; zeros otherwise.
/// 4. None of the above: a zero `p_block × p_block` matrix.
///
/// # Panics
///
/// Panics if a penalty index is out of bounds for `per_block_rho`.
fn assemble_block_local_s_psi_psi(
    deriv_i: &CustomFamilyBlockPsiDerivative,
    local_j: usize,
    per_block_rho: &Array1<f64>,
    p_block: usize,
) -> Array2<f64> {
    if let Some(ref parts) = deriv_i.s_psi_psi_penalty_components {
        let mut s = Array2::<f64>::zeros((p_block, p_block));
        if let Some(pair_parts) = parts.get(local_j) {
            for (penalty_idx, s_part) in pair_parts {
                s_part.add_scaled_to(per_block_rho[*penalty_idx].exp(), &mut s);
            }
        }
        return s;
    }

    if let Some(ref parts) = deriv_i.s_psi_psi_components {
        let mut s = Array2::<f64>::zeros((p_block, p_block));
        if let Some(pair_parts) = parts.get(local_j) {
            for (penalty_idx, s_part) in pair_parts {
                s.scaled_add(per_block_rho[*penalty_idx].exp(), s_part);
            }
        }
        return s;
    }

    if let Some(ref parts) = deriv_i.s_psi_psi {
        if let (Some(s_part), Some(penalty_index)) = (parts.get(local_j), deriv_i.penalty_index) {
            // λ is constant across the matrix: evaluate the bounds-checked
            // lookup and `exp` once instead of once per element inside `mapv`.
            let lambda = per_block_rho[penalty_index].exp();
            return s_part.mapv(|v| lambda * v);
        }
    }

    Array2::<f64>::zeros((p_block, p_block))
}

/// Build `HyperCoord` objects for ψ (custom family) hyperparameters.
///
/// Converts family-provided (a^ℓ, q, L) objects and penalty derivatives
/// into the unified (a, g, B, ld_s) format. Each ψ coordinate produces
/// one `HyperCoord` in the flattened joint coefficient space.
///
/// The mapping from family objects to HyperCoord is:
///
///   a    = a^ℓ_ψ + 0.5 β̂^T S_ψ β̂
///   g    = q_ψ + S_ψ β̂
///   B    = L_ψ + S_ψ
///   ld_s = tr(S₊⁻¹ S_ψ)
///
/// where S_ψ is the penalty derivative, assembled block-locally by
/// `assemble_block_local_s_psi` and embedded into the block's coefficient
/// range of the joint space only where a full-length object is required.
///
/// ψ axes are numbered globally in iteration order over `derivative_blocks`
/// (outer: block, inner: derivative within the block). The likelihood-side
/// terms for an axis come from, in order of preference: the workspace's batched
/// `first_order_terms_all()`, the workspace's per-axis `first_order_terms`, the
/// family's `exact_newton_joint_psi_terms`, and finally
/// `ExactNewtonJointPsiTerms::zeros(total)`.
///
/// `ld_s` is taken from `s_logdet_blocks[block_idx].tau_gradient_component`
/// when `s_logdet_blocks` is provided and is `0.0` otherwise.
///
/// # Errors
///
/// Propagates errors from `split_log_lambdas`, the ψ workspace, the family
/// callbacks and the Jeffreys helpers. Also returns an error when the batched
/// workspace terms do not contain an entry for a requested ψ axis.
///
/// # Panics
///
/// `ranges`, the per-block split of `rho`, and `s_logdet_blocks` (when given)
/// are indexed by the position of each entry of `derivative_blocks`, so they
/// must cover every derivative block.
pub fn build_psi_hyper_coords<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    beta_flat: &Array1<f64>,
    rho: &[f64],
    penalty_counts: &[usize],
    s_logdet_blocks: Option<&[PenaltyPseudologdet]>,
    hessian_beta_independent: bool,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Result<Vec<HyperCoord>, String> {
    let ranges = block_param_ranges(specs);
    let total = beta_flat.len();
    let per_block = split_log_lambdas(&Array1::from_vec(rho.to_vec()), penalty_counts)?;

    let mut coords = Vec::new();
    // Running index of the ψ axis across all blocks.
    let mut psi_global = 0usize;

    let build_psi_hyper_coords_start = std::time::Instant::now();
    let total_axes: usize = derivative_blocks.iter().map(|b| b.len()).sum();

    // Ask the workspace (if any) for all first-order terms in one call; `None`
    // means the per-axis paths below are used instead.
    let batched_terms: Option<Vec<ExactNewtonJointPsiTerms>> = match psi_workspace.as_ref() {
        Some(workspace) => workspace.first_order_terms_all()?,
        None => None,
    };

    // EXPLICIT ∂_ρ H_Φ context (gam#854). The joint-Jeffreys curvature `H_Φ` is
    // built from the JOINT Hessian `H_joint(β, ρ)`, so for a family whose
    // `H_joint` depends on a ψ hyperparameter (the adaptive penalty's `λ_m`/`ε_m`,
    // or any penalty folded into `H_joint`) it depends on ρ EXPLICITLY, not only
    // through β̂. The augmented-LAML score `½ tr[(H+S_λ+H_Φ)⁻¹ ∂_ρ(H+S_λ+H_Φ)]` then
    // needs the explicit term `∂_ρ_i H_Φ|_β` added to each ψ coord's drift (the
    // mode-response part `D_β H_Φ[v_k]` is already folded in elsewhere). We form it
    // from the SAME pieces the value path uses — the full identifiable Jeffreys span
    // `Z_J` and the snapshot joint Hessian `H_joint(β̂)` — once per evaluation, and
    // contract it per coord with `∂_ρ_i H_joint|_β` (the coord drift `dense_b`) and
    // `∂_ρ_i Hdot[e_a]|_β` (the family's ψ-Hessian directional derivative). `None`
    // unless the family uses the Jeffreys term and exposes a dense joint Hessian, so
    // every non-Jeffreys / operator-only family is byte-unchanged.
    let jeffreys_hphi_ctx: Option<(Array2<f64>, Array2<f64>)> = if family
        .joint_jeffreys_term_required()
        && derivative_blocks.iter().any(|block| !block.is_empty())
    {
        match (
            build_joint_jeffreys_subspace(specs, &ranges)?,
            family.exact_newton_joint_hessian_with_specs(synced_states, specs)?,
        ) {
            // Only use the context when both pieces match the joint dimension.
            (Some(z), Some(h))
                if z.nrows() == total && h.nrows() == total && h.ncols() == total =>
            {
                Some((z, h))
            }
            _ => None,
        }
    } else {
        None
    };

    for (block_idx, block_derivs) in derivative_blocks.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        let p_block = end - start;

        for deriv in block_derivs.iter() {
            // 1. Get family-provided likelihood objects (joint flattened space).
            let psi_terms = if let Some(batched) = batched_terms.as_ref() {
                // Checked lookup: a workspace that returns fewer batched terms
                // than there are ψ axes is reported as an error, not a panic.
                batched.get(psi_global).cloned().ok_or_else(|| {
                    format!(
                        "psi workspace returned {} batched first-order term(s) but psi axis {} was requested ({} axes total)",
                        batched.len(),
                        psi_global,
                        total_axes
                    )
                })?
            } else if let Some(workspace) = psi_workspace.as_ref() {
                if let Some(terms) = workspace.first_order_terms(psi_global)? {
                    terms
                } else {
                    family
                        .exact_newton_joint_psi_terms(
                            synced_states,
                            specs,
                            derivative_blocks,
                            psi_global,
                        )?
                        .unwrap_or_else(|| ExactNewtonJointPsiTerms::zeros(total))
                }
            } else {
                family
                    .exact_newton_joint_psi_terms(
                        synced_states,
                        specs,
                        derivative_blocks,
                        psi_global,
                    )?
                    .unwrap_or_else(|| ExactNewtonJointPsiTerms::zeros(total))
            };

            // 2. Assemble S_ψ from penalty derivatives (block-local, not embedded).
            let s_psi_local = assemble_block_local_s_psi(deriv, &per_block[block_idx], p_block);

            // 3. Build HyperCoord using block-local S_ψ (avoids full p×p materialization).
            let beta_block = beta_flat.slice(ndarray::s![start..end]);
            let s_psi_beta_local = s_psi_local.dot(&beta_block);
            let a = psi_terms.objective_psi + 0.5 * beta_block.dot(&s_psi_beta_local);
            // Embed s_psi_beta into full p-vector for the score.
            let mut s_psi_beta = Array1::zeros(total);
            s_psi_beta
                .slice_mut(ndarray::s![start..end])
                .assign(&s_psi_beta_local);
            let g = &psi_terms.score_psi + &s_psi_beta;
            let ld_s = if let Some(blocks) = s_logdet_blocks {
                blocks[block_idx].tau_gradient_component(&s_psi_local)
            } else {
                0.0
            };

            // Build drift: use block-local representation when possible to avoid
            // materializing full p×p dense matrices.
            let drift = if psi_terms.hessian_psi_operator.is_some() {
                // The family supplied an operator for the likelihood part: pair
                // it with the block-local penalty. The dense `hessian_psi` is
                // not read on this path.
                HyperCoordDrift::from_block_local_and_operator(
                    s_psi_local,
                    start,
                    end,
                    total,
                    psi_terms.hessian_psi_operator,
                )
            } else {
                // Dense Hessian term exists (e.g., from non-implicit family).
                // Must add block-local penalty into the dense matrix.
                let mut dense_b = psi_terms.hessian_psi;
                dense_b
                    .slice_mut(ndarray::s![start..end, start..end])
                    .scaled_add(1.0, &s_psi_local);
                // `dense_b` is now `∂_ρ_i H_joint|_β`. Add the explicit Jeffreys term
                // `∂_ρ_i H_Φ|_β` (gam#854) using it as the H_joint perturbation, the
                // family's base directional Hessian derivative `Hdot[e_a]`, and the
                // ψ-Hessian directional derivative `∂_ρ_i Hdot[e_a]|_β`. The helper
                // returns zeros when the conditioning gate skips the term or the
                // family lacks the exact directional derivatives, so a clean /
                // well-conditioned fit is byte-unchanged.
                if let Some((z_j, h_joint)) = jeffreys_hphi_ctx.as_ref() {
                    let explicit_hphi =
                        crate::estimate::reml::jeffreys_subspace::joint_jeffreys_hphi_explicit_param_derivative(
                            h_joint.view(),
                            z_j.view(),
                            &dense_b,
                            |dir: &Array1<f64>| {
                                family.exact_newton_joint_hessian_directional_derivative_with_specs(
                                    synced_states,
                                    specs,
                                    dir,
                                )
                            },
                            |dir: &Array1<f64>| {
                                family.exact_newton_joint_psihessian_directional_derivative(
                                    synced_states,
                                    specs,
                                    derivative_blocks,
                                    psi_global,
                                    dir,
                                )
                            },
                        )?;
                    dense_b += &explicit_hphi;
                }
                // The operator is `None` on this branch by the condition above.
                HyperCoordDrift::from_parts(Some(dense_b), psi_terms.hessian_psi_operator)
            };

            coords.push(HyperCoord {
                a,
                g,
                drift,
                ld_s,
                b_depends_on_beta: !hessian_beta_independent,
                is_penalty_like: false,
                firth_g: None,
                tk_eta_fixed: None,
                tk_x_fixed: None,
            });

            psi_global += 1;
        }
    }

    log::info!(
        "[STAGE] build_psi_hyper_coords axis_count={} workspace_present={} elapsed={:.3}s",
        total_axes,
        psi_workspace.is_some(),
        build_psi_hyper_coords_start.elapsed().as_secs_f64(),
    );

    Ok(coords)
}

/// Build the direction-contracted ψψ second-order hook for the profiled θ-HVP
/// (#740).
///
/// Returns `Some(hook)` only when the family's psi workspace supplies a
/// combined-direction likelihood kernel (`second_order_terms_contracted`);
/// otherwise `None`, which keeps the outer-Hessian operator on the exact
/// per-pair `ext_ext_fn` assembly.
///
/// The hook produces, for the ψ-direction weights `α_ψ`, the
/// [`ContractedPsiSecondOrder`] ψψ-block contraction: it sums the family
/// likelihood contraction (from the workspace) with the generic ψψ penalty
/// motion, mirroring exactly the `α`-contraction of the per-pair `ext_ext`
/// callback's penalty terms (`½βᵀS_{ψiψj}β` into `objective`, `S_{ψiψj}β` into
/// `score`, `S_{ψiψj}` as a `BlockLocalDrift` into `hessian`, and the
/// `tau_hessian_component` into `ld_s`). Same-block-only, matching `ext_ext`.
///
/// `pub(crate)` so the #740 in-crate gate
/// `bernoulli_contracted_psi_hook_matches_per_pair_with_penalty` can assert the
/// generic penalty fold here equals `Σ_j α_j · build_psi_pair_callbacks().ext_ext(i, j)`.
pub(crate) fn build_contracted_psi_hook(
    specs: &[ParameterBlockSpec],
    derivative_blocks: SharedDerivativeBlocks,
    beta_flat: &Array1<f64>,
    rho: &[f64],
    penalty_counts: &[usize],
    s_logdet_blocks: Option<&[PenaltyPseudologdet]>,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Result<Option<ContractedPsiSecondOrderFn>, String> {
    // The contraction is a representation/cost choice for the family likelihood
    // ψψ second-order; without a contracted family kernel there is nothing to
    // accelerate, so decline (the per-pair `ext_ext_fn` path stays).
    let Some(workspace) = psi_workspace else {
        return Ok(None);
    };

    let total = beta_flat.len();
    let ranges = block_param_ranges(specs);
    let per_block = Arc::new(split_log_lambdas(
        &Array1::from_vec(rho.to_vec()),
        penalty_counts,
    )?);
    let beta_arc = Arc::new(beta_flat.clone());
    let ranges_arc = Arc::new(ranges);
    let s_logdet_block_cache = Arc::new(s_logdet_blocks.map(|blocks| blocks.to_vec()));

    // ψ → (block, local) location and block-local S_ψ for every ψ axis, built
    // once. `s_local` (block-local S_ψ) is reused for the τ-Hessian and as the
    // first leg of the bilinear `tr(S⁺ S_ψi S⁺ S_ψj)` penalty-logdet term.
    struct PsiAxis {
        block: usize,
        local: usize,
        start: usize,
        end: usize,
        s_psi_local: Array2<f64>,
    }
    let mut axes: Vec<PsiAxis> = Vec::new();
    for (block_idx, block_derivs) in derivative_blocks.iter().enumerate() {
        let (start, end) = ranges_arc[block_idx];
        let p_block = end - start;
        for (local_idx, deriv) in block_derivs.iter().enumerate() {
            let s_psi_local = assemble_block_local_s_psi(deriv, &per_block[block_idx], p_block);
            axes.push(PsiAxis {
                block: block_idx,
                local: local_idx,
                start,
                end,
                s_psi_local,
            });
        }
    }
    let axes = Arc::new(axes);
    let psi_dim = axes.len();
    if psi_dim == 0 {
        return Ok(None);
    }

    let derivative_blocks = Arc::clone(&derivative_blocks);

    let hook = move |alpha_psi: &[f64]| -> Result<Option<ContractedPsiSecondOrder>, String> {
        if alpha_psi.len() != psi_dim {
            return Err(format!(
                "contracted ψψ hook: alpha_psi length {} != psi_dim {psi_dim}",
                alpha_psi.len()
            ));
        }
        // Family likelihood ψψ contraction (one combined-direction row pass).
        // Declining here (e.g. a σ-aux axis carried weight) declines the whole
        // hook so the operator builder keeps the per-pair assembly.
        let Some(likelihood) = workspace.second_order_terms_contracted(alpha_psi)? else {
            return Ok(None);
        };
        let mut objective = likelihood.objective;
        let mut score = likelihood.score;
        let mut ld_s = Array1::<f64>::zeros(psi_dim);
        // Per-output-row penalty drift `Σ_j α_j S_{ψi ψj}` (block-local),
        // composed onto the likelihood `hessian[i]` operator below.
        let mut hessian: Vec<DriftDerivResult> = likelihood.hessian;
        if objective.len() != psi_dim || score.nrows() != psi_dim || hessian.len() != psi_dim {
            return Err(format!(
                "contracted ψψ hook: family kernel shape mismatch (objective={}, score_rows={}, hessian={}, psi_dim={psi_dim})",
                objective.len(),
                score.nrows(),
                hessian.len(),
            ));
        }

        for (i, axis_i) in axes.iter().enumerate() {
            let p_block = axis_i.end - axis_i.start;
            let beta_block = beta_arc.slice(s![axis_i.start..axis_i.end]).to_owned();
            // Combined same-block penalty second derivative
            //   S_{ψi ψ(α)}_local = Σ_{j: block_j == block_i} α_j S_{ψi ψj}_local,
            // and the combined first-leg penalty derivative
            //   S_ψ(α)_local = Σ_{j: block_j == block_i} α_j S_ψj_local
            // (the second leg of the bilinear penalty-logdet cross term).
            let mut s_psi_psi_alpha = Array2::<f64>::zeros((p_block, p_block));
            let mut s_psi_alpha = Array2::<f64>::zeros((p_block, p_block));
            for (j, axis_j) in axes.iter().enumerate() {
                let aj = alpha_psi[j];
                if aj == 0.0 || axis_j.block != axis_i.block {
                    continue;
                }
                let deriv_i = &derivative_blocks[axis_i.block][axis_i.local];
                let s_ij = assemble_block_local_s_psi_psi(
                    deriv_i,
                    axis_j.local,
                    &per_block[axis_i.block],
                    p_block,
                );
                s_psi_psi_alpha.scaled_add(aj, &s_ij);
                s_psi_alpha.scaled_add(aj, &axis_j.s_psi_local);
            }

            // objective += 0.5 βᵀ S_{ψi ψ(α)} β  (matches ext_ext `a`).
            let s_beta = s_psi_psi_alpha.dot(&beta_block);
            objective[i] += 0.5 * beta_block.dot(&s_beta);
            // score[i] (block-local slice) += S_{ψi ψ(α)} β  (matches ext_ext `g`).
            {
                let mut score_local = score.row_mut(i);
                let mut slot = score_local.slice_mut(s![axis_i.start..axis_i.end]);
                slot += &s_beta;
            }
            // hessian[i] += S_{ψi ψ(α)} as a block-local drift (matches the
            // ext_ext `b_operator` BlockLocalDrift composite).
            let block_drift: Arc<dyn HyperOperator> =
                Arc::new(crate::solver::estimate::reml::unified::BlockLocalDrift {
                    local: s_psi_psi_alpha.clone(),
                    start: axis_i.start,
                    end: axis_i.end,
                    total_dim: total,
                });
            let combined = match std::mem::replace(
                &mut hessian[i],
                DriftDerivResult::Operator(Arc::clone(&block_drift)),
            ) {
                DriftDerivResult::Operator(existing) => DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![existing, block_drift],
                        dim_hint: total,
                    },
                )),
                DriftDerivResult::Dense(dense) => DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: Some(dense),
                        operators: vec![block_drift],
                        dim_hint: total,
                    },
                )),
            };
            hessian[i] = combined;

            // ld_s[i] += Σ_j α_j tau_hessian_component(S_ψi, S_ψj, S_{ψiψj})
            //         = tau_hessian_component(S_ψi, S_ψ(α), S_{ψi ψ(α)})
            // by the (linearity in the second leg + bilinearity of the cross)
            // of the τ-Hessian; matches the ext_ext `ld_s` contraction.
            if let Some(ref logdet_blocks) = *s_logdet_block_cache {
                let pld = &logdet_blocks[axis_i.block];
                ld_s[i] = pld.tau_hessian_component(
                    &axis_i.s_psi_local,
                    &s_psi_alpha,
                    Some(&s_psi_psi_alpha),
                );
            }
        }

        Ok(Some(ContractedPsiSecondOrder {
            objective,
            score,
            hessian,
            ld_s,
        }))
    };

    Ok(Some(Arc::new(hook) as ContractedPsiSecondOrderFn))
}

/// Build pair callbacks for ψ-ψ and ρ-ψ Hessian entries.
///
/// Returns two closures:
///
/// 1. **ext-ext** `(psi_i, psi_j) -> HyperCoordPair`: second-order
///    fixed-β objects for a pair of ψ coordinates. The likelihood part comes
///    from the psi workspace when one is supplied, otherwise from the family
///    hook; the penalty part `S_{ψi ψj}` is added only when both coordinates
///    live in the same block.
///
/// 2. **rho-ext** `(rho_k, psi_j) -> HyperCoordPair`: mixed second-order
///    fixed-β objects for a ρ-ψ pair, i.e. `λ_k ∂S_k/∂ψ_j` on the shared
///    block (zero across blocks).
///
/// Both closures return `HyperCoordPair::zero()` for out-of-range indices.
/// An error (or `None`) from the workspace / family second-order hook is
/// treated as "no likelihood contribution": the callback signature cannot
/// carry an error, so the likelihood terms are zero-filled in that case.
///
/// The closures capture (via `Arc`) shared references to penalty derivatives,
/// family state, and the penalty pseudo-logdet blocks needed for logdet terms.
///
/// # Arguments
///
/// * `family` - The custom family instance (must be `Send + Sync + 'static`).
/// * `synced_states` - Synchronized block states at the current inner mode.
/// * `specs` - Parameter block specifications.
/// * `derivative_blocks` - Per-block ψ derivative payloads.
/// * `beta_flat` - Flattened joint coefficient vector at the inner mode.
/// * `rho` - Current log-smoothing parameters (flat).
/// * `penalty_counts` - Number of penalties per block.
/// * `s_logdet_blocks` - Optional exact block-local pseudologdet eigenspaces.
///   When `None`, every returned `ld_s` is `0.0`.
/// * `psi_workspace` - Optional workspace whose `second_order_terms` replaces
///   the family hook for the ψψ likelihood terms.
///
/// # Errors
///
/// Returns the `split_log_lambdas` error when `rho` cannot be split according
/// to `penalty_counts`.
pub fn build_psi_pair_callbacks<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: SharedDerivativeBlocks,
    beta_flat: &Array1<f64>,
    rho: &[f64],
    penalty_counts: &[usize],
    s_logdet_blocks: Option<&[PenaltyPseudologdet]>,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Result<
    (
        Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>,
        Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>,
    ),
    String,
> {
    // Precompute shared data into Arc-wrapped clones for the closures.
    let ranges = block_param_ranges(specs);
    let total = beta_flat.len();
    let per_block = Arc::new(split_log_lambdas(
        &Array1::from_vec(rho.to_vec()),
        penalty_counts,
    )?);
    let specs_arc = Arc::new(specs.to_vec());
    let beta_arc = Arc::new(beta_flat.clone());
    let synced_arc = Arc::new(synced_states.to_vec());
    let family_arc = Arc::new(family.clone());

    let s_logdet_block_cache = Arc::new(s_logdet_blocks.map(|blocks| blocks.to_vec()));
    // The cached matrices below are read only inside the penalty-logdet
    // branches of the callbacks, so they are built only when logdet is active.
    let logdet_active = s_logdet_block_cache.is_some();

    struct PsiPenaltyCacheEntry {
        block_idx: usize,
        local_idx: usize,
        start: usize,
        end: usize,
        /// Block-local S_ψ matrix for `PenaltyPseudologdet` methods; `Some`
        /// exactly when penalty logdet is active.
        s_local: Option<Array2<f64>>,
    }

    struct RhoPenaltyCacheEntry {
        block_idx: usize,
        penalty_idx: usize,
        start: usize,
        end: usize,
        /// Unscaled penalty matrix S_k for
        /// `PenaltyPseudologdet::rho_tau_hessian_component`; `Some` exactly
        /// when penalty logdet is active.
        s_k_unscaled: Option<Array2<f64>>,
    }

    // Build the psi coordinate cache once. The block-local S_psi matrices are
    // reused by the ψψ and ρψ callbacks, avoiding repeated assembly inside the
    // O(q²) ext-ext loop.
    let mut psi_penalty_cache: Vec<PsiPenaltyCacheEntry> = Vec::new();
    for (block_idx, block_derivs) in derivative_blocks.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        let p_block = end - start;
        for (local_idx, deriv) in block_derivs.iter().enumerate() {
            // PenaltyPseudologdet methods handle pseudoinverse and leakage
            // internally; only the raw block-local S_ψ is stored here.
            let s_local = logdet_active
                .then(|| assemble_block_local_s_psi(deriv, &per_block[block_idx], p_block));
            psi_penalty_cache.push(PsiPenaltyCacheEntry {
                block_idx,
                local_idx,
                start,
                end,
                s_local,
            });
        }
    }
    let psi_penalty_cache = Arc::new(psi_penalty_cache);

    let mut rho_penalty_cache: Vec<RhoPenaltyCacheEntry> = Vec::new();
    for (block_idx, &count) in penalty_counts.iter().enumerate() {
        let (start, end) = ranges[block_idx];
        for penalty_idx in 0..count {
            let s_k_unscaled =
                logdet_active.then(|| specs[block_idx].penalties[penalty_idx].to_dense());
            rho_penalty_cache.push(RhoPenaltyCacheEntry {
                block_idx,
                penalty_idx,
                start,
                end,
                s_k_unscaled,
            });
        }
    }
    let rho_penalty_cache = Arc::new(rho_penalty_cache);

    // ψ-ψ pair callback
    let ext_ext = {
        // Handles shared with the ρ-ψ callback are cloned; everything else is
        // moved straight into the closure.
        let per_block = Arc::clone(&per_block);
        let derivative_blocks = Arc::clone(&derivative_blocks);
        let beta_arc = Arc::clone(&beta_arc);
        let s_logdet_block_cache = Arc::clone(&s_logdet_block_cache);
        let psi_penalty_cache = Arc::clone(&psi_penalty_cache);

        Box::new(move |psi_i: usize, psi_j: usize| -> HyperCoordPair {
            // Defensive bounds check: callers in the unified outer solver only ever
            // pass indices in `0..psi_penalty_cache.len()`, but treating an OOB
            // request as a documented zero-pair sentinel keeps integration code
            // (which may probe spurious coordinate pairs while building joint
            // Hessian sparsity patterns) panic-free.
            let (Some(cache_i), Some(cache_j)) =
                (psi_penalty_cache.get(psi_i), psi_penalty_cache.get(psi_j))
            else {
                return HyperCoordPair::zero();
            };

            // Family-provided second-order likelihood terms. Errors are
            // deliberately folded into "no contribution" (see function docs).
            let psi2 = match psi_workspace.as_ref() {
                Some(workspace) => workspace.second_order_terms(psi_i, psi_j).ok().flatten(),
                None => family_arc
                    .exact_newton_joint_psisecond_order_terms(
                        &synced_arc,
                        &specs_arc,
                        &derivative_blocks,
                        psi_i,
                        psi_j,
                    )
                    .ok()
                    .flatten(),
            };

            let (mut a, mut g, mut b_mat, mut b_operator) = match psi2 {
                Some(t) => (
                    t.objective_psi_psi,
                    t.score_psi_psi,
                    t.hessian_psi_psi,
                    t.hessian_psi_psi_operator,
                ),
                None => (
                    0.0,
                    Array1::zeros(total),
                    Array2::zeros((total, total)),
                    None,
                ),
            };

            // Assemble S_{ψ_i ψ_j} only on the touched block.
            let ld_s = if cache_i.block_idx == cache_j.block_idx {
                let (start, end) = (cache_i.start, cache_i.end);
                let p_block = end - start;
                let deriv_i = &derivative_blocks[cache_i.block_idx][cache_i.local_idx];
                let s_local = assemble_block_local_s_psi_psi(
                    deriv_i,
                    cache_j.local_idx,
                    &per_block[cache_i.block_idx],
                    p_block,
                );

                let beta_block = beta_arc.slice(s![start..end]);
                let s_ij_beta_local = s_local.dot(&beta_block);
                a += 0.5 * beta_block.dot(&s_ij_beta_local);
                {
                    let mut g_local = g.slice_mut(s![start..end]);
                    g_local += &s_ij_beta_local;
                }

                // Penalty logdet term, evaluated while `s_local` is still
                // borrowable so the operator path below can take ownership of
                // it without cloning.
                let ld = match &*s_logdet_block_cache {
                    Some(logdet_blocks) => {
                        let pld = &logdet_blocks[cache_i.block_idx];
                        let s_psi_i = cache_i
                            .s_local
                            .as_ref()
                            .expect("psi cache should include S_psi when penalty logdet is active");
                        let s_psi_j = cache_j
                            .s_local
                            .as_ref()
                            .expect("psi cache should include S_psi when penalty logdet is active");
                        // τ-Hessian: tr(S⁺ S_{ψi ψj}) − tr(S⁺ S_ψi S⁺ S_ψj) + 2 tr(Σ₊⁻² L_i L_j^T)
                        pld.tau_hessian_component(s_psi_i, s_psi_j, Some(&s_local))
                    }
                    None => 0.0,
                };

                // The S_{ψ_i ψ_j} block contribution attaches to the dense
                // Hessian when the family returned a dense `b_mat`, and to
                // the operator-backed Hessian (via a `BlockLocalDrift`
                // composite) when the family returned `hessian_psi_psi`
                // empty alongside an operator. Slicing into a `(0, 0)`
                // dense matrix would otherwise panic in the matrix-free
                // path that survival-marginal-slope and other operator-
                // backed families use.
                if b_mat.nrows() > 0 {
                    let mut b_local = b_mat.slice_mut(s![start..end, start..end]);
                    b_local += &s_local;
                } else {
                    let drift = crate::solver::estimate::reml::unified::BlockLocalDrift {
                        local: s_local,
                        start,
                        end,
                        total_dim: total,
                    };
                    b_operator = Some(match b_operator.take() {
                        Some(existing) => {
                            let existing_arc: Arc<dyn HyperOperator> = Arc::from(existing);
                            let drift_arc: Arc<dyn HyperOperator> = Arc::new(drift);
                            Box::new(
                                crate::solver::estimate::reml::unified::CompositeHyperOperator {
                                    dense: None,
                                    operators: vec![existing_arc, drift_arc],
                                    dim_hint: total,
                                },
                            ) as Box<dyn HyperOperator>
                        }
                        None => Box::new(drift) as Box<dyn HyperOperator>,
                    });
                }

                ld
            } else {
                0.0
            };

            HyperCoordPair {
                a,
                g,
                b_mat,
                b_operator,
                ld_s,
            }
        }) as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
    };

    // ρ-ψ pair callback (last consumer of the shared handles, so they are
    // moved rather than cloned).
    let rho_ext = {
        Box::new(move |rho_k: usize, psi_j: usize| -> HyperCoordPair {
            let (Some(rho_cache), Some(psi_cache)) =
                (rho_penalty_cache.get(rho_k), psi_penalty_cache.get(psi_j))
            else {
                return HyperCoordPair::zero();
            };
            let mut a = 0.0;
            let mut g = Array1::<f64>::zeros(total);
            let mut b_mat = Array2::<f64>::zeros((total, total));

            // S_{ρ_k, ψ_j} = λ_k ∂S_k/∂ψ_j.
            // Only nonzero when both coordinates share the same block and the
            // ψ derivative touches the k-th penalty.
            let ld_s = if rho_cache.block_idx == psi_cache.block_idx {
                let (start, end) = (rho_cache.start, rho_cache.end);
                let p_block = end - start;
                let deriv = &derivative_blocks[psi_cache.block_idx][psi_cache.local_idx];
                let lambda_k = per_block[rho_cache.block_idx][rho_cache.penalty_idx].exp();
                // Payload precedence: per-penalty operator components, then
                // per-penalty dense components, then the single dense S_ψ
                // tagged with its penalty index.
                let local = if let Some(ref components) = deriv.s_psi_penalty_components {
                    let mut m = Array2::<f64>::zeros((p_block, p_block));
                    for (penalty_idx, s_part) in components {
                        if *penalty_idx == rho_cache.penalty_idx {
                            s_part.add_scaled_to(lambda_k, &mut m);
                        }
                    }
                    m
                } else if let Some(ref components) = deriv.s_psi_components {
                    let mut m = Array2::<f64>::zeros((p_block, p_block));
                    for (penalty_idx, s_part) in components {
                        if *penalty_idx == rho_cache.penalty_idx {
                            m.scaled_add(lambda_k, s_part);
                        }
                    }
                    m
                } else if deriv.penalty_index == Some(rho_cache.penalty_idx) {
                    deriv.s_psi.mapv(|v| lambda_k * v)
                } else {
                    Array2::<f64>::zeros((p_block, p_block))
                };

                let beta_block = beta_arc.slice(s![start..end]);
                let s_kj_beta_local = local.dot(&beta_block);
                a = 0.5 * beta_block.dot(&s_kj_beta_local);
                {
                    let mut g_local = g.slice_mut(s![start..end]);
                    g_local += &s_kj_beta_local;
                }
                {
                    let mut b_local = b_mat.slice_mut(s![start..end, start..end]);
                    b_local += &local;
                }

                match &*s_logdet_block_cache {
                    Some(logdet_blocks) => {
                        let pld = &logdet_blocks[rho_cache.block_idx];
                        let s_psi_j = psi_cache
                            .s_local
                            .as_ref()
                            .expect("psi cache should include S_psi when penalty logdet is active");
                        let s_k_unscaled = rho_cache
                            .s_k_unscaled
                            .as_ref()
                            .expect("rho cache should include S_k when penalty logdet is active");
                        // ∂S_k/∂ψ_j (unscaled): extract from local by dividing out λ_k.
                        // Skipped when λ_k has underflowed to (near) zero.
                        let ds_k_dpsi = if lambda_k.abs() > 1e-300 {
                            Some(local.mapv(|v| v / lambda_k))
                        } else {
                            None
                        };
                        // Mixed ρ×τ Hessian: λ_k [tr(S⁺ ∂S_k/∂ψ_j) − tr(S⁺ S_k S⁺ S_ψj)]
                        pld.rho_tau_hessian_component(
                            s_k_unscaled,
                            lambda_k,
                            s_psi_j,
                            ds_k_dpsi.as_ref(),
                        )
                    }
                    None => 0.0,
                }
            } else {
                0.0
            };

            HyperCoordPair {
                a,
                g,
                b_mat,
                b_operator: None,
                ld_s,
            }
        }) as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
    };

    Ok((ext_ext, rho_ext))
}

/// Build the M_i[u] = D_β B_i[u] callback for ψ coordinates.
///
/// Wraps the ψ-Hessian directional derivative into the unified
/// `FixedDriftDerivFn` signature. For each external (ψ) coordinate index
/// `ext_idx`, calling `f(ext_idx, &direction)` returns `Some(D_β H_ψ[u])`
/// when the provider supplies it, or `None` otherwise. The provider is:
///
/// * the psi workspace's `hessian_directional_derivative` when
///   `psi_workspace` is `Some`, returned as-is;
/// * otherwise the family's
///   `exact_newton_joint_psihessian_directional_derivative`, wrapped as
///   `DriftDerivResult::Dense`.
///
/// An `Err` from either provider is mapped to `None`; the callback signature
/// has no error channel.
///
/// No penalty-side term is added here: penalty matrices S_ψ do not depend on
/// β, so `D_β S_ψ[u] = 0` and the callback delegates entirely to the
/// likelihood-side provider.
///
/// The family, block states and specs are cloned into the closure only on the
/// family-hook path; the workspace path does not read them.
///
/// # Returns
///
/// `Some(callback)` when the family potentially provides the drift term.
/// `None` when `hessian_beta_independent` is set (e.g. Gaussian: B_i is
/// β-independent for all coordinates, so M_i ≡ 0).
pub fn build_psi_drift_deriv_callback<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    synced_states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks_arc: SharedDerivativeBlocks,
    hessian_beta_independent: bool,
    psi_workspace: Option<Arc<dyn ExactNewtonJointPsiWorkspace>>,
) -> Option<FixedDriftDerivFn> {
    if hessian_beta_independent {
        // Likelihood Hessian is β-independent; M_i ≡ 0.
        return None;
    }

    // Both providers take a psi index (0-based within ψ coordinates) and a
    // flattened coefficient direction.
    let callback: FixedDriftDerivFn = match psi_workspace {
        Some(workspace) => Box::new(
            move |ext_idx: usize, direction: &Array1<f64>| -> Option<DriftDerivResult> {
                workspace
                    .hessian_directional_derivative(ext_idx, direction)
                    .ok()
                    .flatten()
            },
        ),
        None => {
            let synced_arc = Arc::new(synced_states.to_vec());
            let specs_arc = Arc::new(specs.to_vec());
            let family_arc = Arc::new(family.clone());
            Box::new(
                move |ext_idx: usize, direction: &Array1<f64>| -> Option<DriftDerivResult> {
                    family_arc
                        .exact_newton_joint_psihessian_directional_derivative(
                            &synced_arc,
                            &specs_arc,
                            &derivative_blocks_arc,
                            ext_idx,
                            direction,
                        )
                        .ok()
                        .flatten()
                        .map(DriftDerivResult::Dense)
                },
            )
        }
    };

    Some(callback)
}

/// Slice-based entry point for the joint hyper-parameter evaluation.
///
/// Copies `derivative_blocks` into a freshly allocated shared (`Arc`) handle
/// and forwards every argument unchanged to
/// `evaluate_custom_family_hyper_internal_shared`, whose result is returned
/// as-is.
fn evaluate_custom_family_hyper_internal<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho_current: &Array1<f64>,
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, CustomFamilyError> {
    // The shared variant takes ownership of an Arc so the blocks can be
    // handed to callbacks; the borrowed slice is copied into one here.
    let shared_blocks = Arc::new(derivative_blocks.to_vec());
    evaluate_custom_family_hyper_internal_shared(
        family,
        specs,
        options,
        penalty_counts,
        rho_current,
        shared_blocks,
        warm_start,
        rho_prior,
        eval_mode,
    )
}

fn evaluate_custom_family_hyper_internal_shared<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho_current: &Array1<f64>,
    derivative_blocks: SharedDerivativeBlocks,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, CustomFamilyError> {
    if derivative_blocks.len() != specs.len() {
        crate::bail_dim_custom!(
            "joint hyper derivative block count mismatch: got {}, expected {}",
            derivative_blocks.len(),
            specs.len()
        );
    }

    if penalty_counts.len() != specs.len() {
        crate::bail_dim_custom!(
            "joint hyper penalty-count block mismatch: got {}, expected {}",
            penalty_counts.len(),
            specs.len()
        );
    }
    let rho_dim = penalty_counts.iter().sum::<usize>();
    let psi_dim = derivative_blocks.iter().map(Vec::len).sum::<usize>();
    if rho_current.len() != rho_dim {
        crate::bail_dim_custom!(
            "joint hyper rho dimension mismatch: got {}, expected {} (psi={})",
            rho_current.len(),
            rho_dim,
            psi_dim
        );
    }

    // ── Common setup: inner solve, ridge, refresh, ranges ──
    let include_logdet_h = include_exact_newton_logdet_h(family, options);
    let include_logdet_s = include_exact_newton_logdet_s(family, options);
    let strict_spd = use_exact_newton_strict_spd(family);
    let per_block = split_log_lambdas(rho_current, penalty_counts)?;
    let psi_safe_warm_start =
        warm_start_without_cached_inner_for_psi_derivatives(warm_start, psi_dim > 0);
    let mut inner = inner_blockwise_fit(
        family,
        specs,
        &per_block,
        options,
        psi_safe_warm_start.as_ref().or(warm_start),
    )?;
    if !inner.converged {
        let theta_dim = rho_dim + psi_dim;
        return Err(CustomFamilyError::UnsupportedConfiguration {
            reason: format!(
                "custom-family inner solve did not converge after {} cycle(s); \
             refusing to expose profile objective derivatives for theta_dim={} \
             (rho_dim={}, psi_dim={}). The analytic outer gradient/Hessian \
             require the inner KKT equation F_beta(beta, theta)=0; returning \
             a value with zero or shape-only derivatives is mathematically \
             inconsistent.",
                inner.cycles, theta_dim, rho_dim, psi_dim
            ),
        });
    }
    let ridge = effective_solverridge(options.ridge_floor);
    let moderidge = if options.ridge_policy.include_quadratic_penalty {
        ridge
    } else {
        0.0
    };
    let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
        && !options.ridge_policy.include_quadratic_penalty
    {
        ridge
    } else {
        0.0
    };

    refresh_all_block_etas(family, specs, &mut inner.block_states)?;
    let ranges = block_param_ranges(specs);
    let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);

    // ── Try to obtain a joint Hessian and route through the unified evaluator ──
    //
    // When psi_dim > 0, exact Newton is required because the ψ derivative
    // callbacks use exact Newton trait methods. When psi_dim == 0,
    // build_joint_hessian_closures handles both exact Newton and surrogate.
    let cthf_internal_psi_branch_start = std::time::Instant::now();
    if psi_dim > 0 {
        log::info!(
            "[STAGE] cthf_internal psi_dim={} eval_mode={:?} pre_unified elapsed={:.3}s",
            psi_dim,
            eval_mode,
            cthf_internal_psi_branch_start.elapsed().as_secs_f64(),
        );
        // ψ coordinates present: require exact Newton Hessian for consistency
        // with the psi derivative callbacks.
        let beta_flat = flatten_state_betas(&inner.block_states, specs);
        let synced_joint_states = Arc::new(synchronized_states_from_flat_beta(
            family,
            specs,
            &inner.block_states,
            &beta_flat,
        )?);
        let hessian_workspace = match inner.joint_workspace.clone() {
            Some(workspace) => Some(workspace),
            None => family.exact_newton_joint_hessian_workspace_with_options(
                synced_joint_states.as_ref(),
                specs,
                options,
            )?,
        };
        // Outer-eval entry: prime per-row jet caches before the ext-coord
        // par_iter — see `warm_up_outer_caches` doc.
        if let Some(workspace) = hessian_workspace.as_ref() {
            workspace.warm_up_outer_caches()?;
        }
        let (
            h_joint_unpen,
            rho_curvature_scale,
            hessian_logdet_correction,
            use_outer_curvature_derivatives,
        ) = if let Some(curvature) = family.exact_newton_outer_curvature(&inner.block_states)? {
            (
                JointHessianSource::Dense(symmetrized_square_matrix(
                    curvature.hessian,
                    total,
                    "joint exact-newton Hessian shape mismatch in joint hyper evaluator (rescaled)",
                )?),
                curvature.rho_curvature_scale,
                curvature.hessian_logdet_correction,
                true,
            )
        } else {
            let h_joint_unpen = if let Some(workspace) = hessian_workspace.as_ref() {
                exact_newton_joint_hessian_source_from_workspace(
                    workspace,
                    total,
                    MaterializationIntent::OuterEvaluation,
                    "joint exact-newton operator mismatch in joint hyper evaluator",
                )?
            } else {
                None
            };
            (
                match h_joint_unpen {
                    Some(source) => Some(source),
                    None => exact_newton_joint_hessian_symmetrized(
                        family,
                        &inner.block_states,
                        specs,
                        total,
                        "joint exact-newton Hessian shape mismatch in joint hyper evaluator",
                    )
                    .map(|source| source.map(JointHessianSource::Dense))?,
                }
                .ok_or_else(|| -> CustomFamilyError {
                    "joint exact-newton Hessian unavailable for full [rho, psi] outer calculus"
                        .to_string()
                        .into()
                })?,
                1.0,
                0.0,
                false,
            )
        };

        // Build the exact pseudologdet eigenspace for each penalty block so
        // the value, ψ gradient, ψψ Hessian, and ρψ mixed block all
        // differentiate the same log|S|_+ objective.
        let s_logdet_blocks = if include_logdet_s {
            use rayon::iter::{IntoParallelIterator, ParallelIterator};
            let block_results: Vec<Result<PenaltyPseudologdet, String>> = (0..specs.len())
                .into_par_iter()
                .map(|b| {
                    let spec = &specs[b];
                    let p = spec.design.ncols();
                    let lambdas = per_block[b].mapv(f64::exp);
                    let mut s_lambda = Array2::<f64>::zeros((p, p));
                    for (k, s) in spec.penalties.iter().enumerate() {
                        s.add_scaled_to(lambdas[k], &mut s_lambda);
                    }
                    let ridge_hint = if options.ridge_policy.include_penalty_logdet {
                        for d in 0..p {
                            s_lambda[[d, d]] += ridge;
                        }
                        Some(ridge)
                    } else {
                        None
                    };
                    // No metadata-based structural-nullity hint: the
                    // PenaltyPseudologdet classifier derives the positive
                    // eigenspace from the assembled spectrum alone (issues
                    // #192/#318).
                    PenaltyPseudologdet::from_assembled(s_lambda, ridge_hint)
                })
                .collect();
            let blocks: Result<Vec<_>, _> = block_results.into_iter().collect();
            Some(blocks?)
        } else {
            None
        };

        // Build ψ HyperCoords, pair callbacks, and drift derivative callback.
        let hessian_beta_independent = !family.exact_newton_joint_hessian_beta_dependent();
        let psi_workspace = if eval_mode != EvalMode::ValueOnly
            && (eval_mode == EvalMode::ValueGradientHessian
                || family.exact_newton_joint_psi_workspace_for_first_order_terms())
        {
            family.exact_newton_joint_psi_workspace_with_options(
                synced_joint_states.as_ref(),
                specs,
                derivative_blocks.as_ref(),
                options,
            )?
        } else {
            None
        };

        let rho_slice = rho_current
            .as_slice()
            .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
        let ext_bundle = if eval_mode == EvalMode::ValueOnly {
            None
        } else {
            let psi_coords = build_psi_hyper_coords(
                family,
                synced_joint_states.as_ref(),
                specs,
                derivative_blocks.as_ref(),
                &beta_flat,
                rho_slice,
                penalty_counts,
                s_logdet_blocks.as_deref(),
                hessian_beta_independent,
                psi_workspace.clone(),
            )?;

            let (ext_ext_fn, rho_ext_fn, drift_fn, contracted_psi_fn) =
                if eval_mode == EvalMode::ValueGradientHessian {
                    let (ext_ext_fn, rho_ext_fn) = build_psi_pair_callbacks(
                        family,
                        synced_joint_states.as_ref(),
                        specs,
                        Arc::clone(&derivative_blocks),
                        &beta_flat,
                        rho_slice,
                        penalty_counts,
                        s_logdet_blocks.as_deref(),
                        psi_workspace.clone(),
                    )?;
                    // #740: build the direction-contracted ψψ hook from the same psi
                    // workspace + penalty data the per-pair `ext_ext_fn` uses, so the
                    // matrix-free outer-Hessian operator collapses the `K²` per-pair
                    // ψψ assembly to one combined-direction family row pass per
                    // matvec. `None` (no contracted family kernel) keeps the exact
                    // per-pair `ext_ext_fn` path. Built before the drift callback
                    // moves `psi_workspace`.
                    let contracted_psi_fn = build_contracted_psi_hook(
                        specs,
                        Arc::clone(&derivative_blocks),
                        &beta_flat,
                        rho_slice,
                        penalty_counts,
                        s_logdet_blocks.as_deref(),
                        psi_workspace.clone(),
                    )?;
                    let drift_fn = build_psi_drift_deriv_callback(
                        family,
                        synced_joint_states.as_ref(),
                        specs,
                        Arc::clone(&derivative_blocks),
                        hessian_beta_independent,
                        psi_workspace,
                    );
                    (
                        Some(ext_ext_fn),
                        Some(rho_ext_fn),
                        drift_fn,
                        contracted_psi_fn,
                    )
                } else {
                    (None, None, None, None)
                };

            Some(ExtCoordBundle {
                coords: psi_coords,
                ext_ext_fn,
                rho_ext_fn,
                drift_fn,
                contracted_psi_fn,
            })
        };

        // Build derivative provider for the ρ coordinates (D_β H[v]).
        let compute_dh = exact_newton_dh_closure(
            family,
            Arc::clone(&synced_joint_states),
            specs,
            total,
            use_outer_curvature_derivatives,
            if use_outer_curvature_derivatives {
                1.0
            } else {
                rho_curvature_scale
            },
            hessian_workspace.clone(),
        );
        let compute_dh_many = if use_outer_curvature_derivatives {
            None
        } else {
            exact_newton_dh_many_closure(rho_curvature_scale, hessian_workspace.clone())
        };
        let compute_d2h = exact_newton_d2h_closure(
            family,
            Arc::clone(&synced_joint_states),
            specs,
            total,
            use_outer_curvature_derivatives,
            if use_outer_curvature_derivatives {
                1.0
            } else {
                rho_curvature_scale
            },
            hessian_workspace.clone(),
        );
        let owned_compute_dh = exact_newton_dh_closure_owned(
            family.clone(),
            Arc::clone(&synced_joint_states),
            specs.to_vec(),
            total,
            use_outer_curvature_derivatives,
            if use_outer_curvature_derivatives {
                1.0
            } else {
                rho_curvature_scale
            },
            hessian_workspace.clone(),
        );
        let owned_compute_dh_many = if use_outer_curvature_derivatives {
            None
        } else {
            exact_newton_dh_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
        };
        let owned_compute_d2h = exact_newton_d2h_closure_owned(
            family.clone(),
            Arc::clone(&synced_joint_states),
            specs.to_vec(),
            total,
            use_outer_curvature_derivatives,
            if use_outer_curvature_derivatives {
                1.0
            } else {
                rho_curvature_scale
            },
            hessian_workspace.clone(),
        );
        let compute_d2h_many = if use_outer_curvature_derivatives {
            None
        } else {
            exact_newton_d2h_many_closure(rho_curvature_scale, hessian_workspace.clone())
        };
        let owned_compute_d2h_many = if use_outer_curvature_derivatives {
            None
        } else {
            exact_newton_d2h_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
        };

        // Route through the unified path (joint_outer_evaluate → reml_laml_evaluate).
        let eval_result = joint_outer_evaluate(
            &inner,
            specs,
            &per_block,
            rho_current,
            &beta_flat,
            h_joint_unpen,
            &ranges,
            total,
            ridge,
            moderidge,
            extra_logdet_ridge,
            rho_curvature_scale,
            hessian_logdet_correction,
            include_logdet_h,
            include_logdet_s,
            strict_spd,
            // ψ-bearing path (matern/duchon marginal-slope kernel length-scales):
            // use the projected #752 generalized determinant for value AND
            // gradient AND Hessian — all produced by this single call, so they are
            // consistent by construction. This is the route the clustered-PC
            // matern bernoulli/survival marginal-slope fits take, where the
            // range(Sλ)-only determinant dropped the penalty-null trend likelihood
            // determinant and froze the outer gradient (gam#808/#787). No batched
            // override is possible here (it is gated to psi_dim==0).
            family.use_projected_penalty_logdet(),
            eval_mode,
            options,
            rho_prior.clone(),
            family.pseudo_logdet_mode(),
            &compute_dh,
            compute_dh_many.as_deref(),
            &compute_d2h,
            compute_d2h_many.as_deref(),
            Some(owned_compute_dh),
            owned_compute_dh_many,
            Some(owned_compute_d2h),
            owned_compute_d2h_many,
            ext_bundle,
            None,
            custom_family_batched_outer_hessian_operator(
                family,
                synced_joint_states.as_ref(),
                specs,
                derivative_blocks.as_ref(),
                rho_current,
                hessian_workspace.clone(),
                eval_mode,
            )?,
            custom_family_outer_jeffreys_hphi(family, &inner.block_states, specs, &ranges)?,
            custom_family_outer_jeffreys_hphi_drift(family, &inner.block_states, specs, &ranges)?,
        )?;

        // The unified evaluator produces gradient/Hessian of size (rho_dim + psi_dim),
        // with ρ coordinates first and ψ coordinates appended — matching the expected
        // output order of CustomFamilyJointHyperResult.
        log::info!(
            "[STAGE] cthf_internal psi_dim={} eval_mode={:?} post_unified elapsed={:.3}s",
            psi_dim,
            eval_mode,
            cthf_internal_psi_branch_start.elapsed().as_secs_f64(),
        );
        return Ok(eval_result);
    }

    // ── ρ-only path (psi_dim == 0): route through unified evaluator ──
    //
    // Batched fast-path: if the family overrides `batched_outer_gradient_terms`,
    // factor H once at the family level and amortize all K trace computations in
    // a single streaming pass. Runs in both `ValueAndGradient` and
    // `ValueGradientHessian` modes; in VGH the Hessian still flows through the
    // standard joint_outer_evaluate path below and only the gradient is
    // replaced. See `BatchedOuterGradientTerms`. The replacement is permitted
    // only when it differentiates the same objective: if robust Jeffreys
    // curvature is nonzero, the unified H_phi-aware evaluator owns the gradient.
    let has_configured_rho_prior = !matches!(rho_prior, crate::types::RhoPrior::Flat);
    let robust_jeffreys_hphi =
        custom_family_outer_jeffreys_hphi(family, &inner.block_states, specs, &ranges)?;
    let batched_gradient_contract_allows_override = batched_outer_gradient_contract_allows_override(
        robust_jeffreys_hphi.as_ref().map(|(_phi, hphi)| hphi),
    );
    let mut batched_gradient_override: Option<Array1<f64>> = None;
    if !has_configured_rho_prior
        && batched_gradient_contract_allows_override
        && (eval_mode == EvalMode::ValueAndGradient || eval_mode == EvalMode::ValueGradientHessian)
    {
        let beta_flat_for_batch = flatten_state_betas(&inner.block_states, specs);
        let synced_states_for_batch = synchronized_states_from_flat_beta(
            family,
            specs,
            &inner.block_states,
            &beta_flat_for_batch,
        )?;
        let workspace_for_batch = match inner.joint_workspace.clone() {
            Some(workspace) => Some(workspace),
            None => family
                .exact_newton_joint_hessian_workspace_with_options(
                    &synced_states_for_batch,
                    specs,
                    options,
                )
                .ok()
                .flatten(),
        };
        let derivative_blocks_for_batch =
            vec![Vec::<CustomFamilyBlockPsiDerivative>::new(); specs.len()];
        if let Ok(Some(batch)) = family.batched_outer_gradient_terms(
            &synced_states_for_batch,
            specs,
            &derivative_blocks_for_batch,
            rho_current,
            options,
            workspace_for_batch.clone(),
        ) {
            // Sanity check: batched output must match (rho_dim + psi_dim).
            let expected = rho_dim + psi_dim;
            if batch.objective_theta.len() == expected
                && batch.trace_h_inv_hdot.len() == expected
                && batch.trace_s_pinv_sdot.len() == expected
                && let Some(joint_bundle_value_only) = build_joint_hessian_closures(
                    family,
                    &inner.block_states,
                    specs,
                    total,
                    options,
                    inner.joint_workspace.clone(),
                )?
            {
                let mut gradient = Array1::<f64>::zeros(expected);
                for j in 0..expected {
                    let trace_term = if include_logdet_h {
                        0.5 * batch.trace_h_inv_hdot[j]
                    } else {
                        0.0
                    };
                    let det_term = if include_logdet_s {
                        0.5 * batch.trace_s_pinv_sdot[j]
                    } else {
                        0.0
                    };
                    gradient[j] = batch.objective_theta[j] + trace_term - det_term;
                }
                if eval_mode == EvalMode::ValueGradientHessian {
                    batched_gradient_override = Some(gradient);
                } else {
                    let JointHessianBundle {
                        source: h_joint_unpen,
                        beta_flat,
                        compute_dh,
                        compute_dh_many,
                        compute_d2h,
                        compute_d2h_many,
                        owned_compute_dh: _,
                        owned_compute_dh_many: _,
                        owned_compute_d2h: _,
                        owned_compute_d2h_many: _,
                        rho_curvature_scale,
                        hessian_logdet_correction,
                    } = joint_bundle_value_only;
                    let value_only = joint_outer_evaluate(
                        &inner,
                        specs,
                        &per_block,
                        rho_current,
                        &beta_flat,
                        h_joint_unpen,
                        &ranges,
                        total,
                        ridge,
                        moderidge,
                        extra_logdet_ridge,
                        rho_curvature_scale,
                        hessian_logdet_correction,
                        include_logdet_h,
                        include_logdet_s,
                        strict_spd,
                        // VALUE/GRADIENT CONSISTENCY: this `value_only` is paired
                        // with the family's BATCHED gradient (computed just above),
                        // which evaluates the logdet derivative through the
                        // family's `pseudo_logdet_mode` spectral operator (Smooth
                        // `r_ε` for BMS) — an internally exact antiderivative pair
                        // (value `log r_ε`, gradient `φ'=r_ε'/r_ε`). The value must
                        // therefore use the SAME spectral convention, NOT the
                        // projected #752 generalized determinant, or value and the
                        // batched gradient would describe different objectives under
                        // rank deficiency. The projected determinant is used on the
                        // non-batched path (the ψ-bearing matern marginal-slope
                        // route, gam#808/#787), where joint_outer_evaluate produces
                        // a matched projected value AND gradient in one call.
                        false,
                        EvalMode::ValueOnly,
                        options,
                        crate::types::RhoPrior::Flat,
                        family.pseudo_logdet_mode(),
                        compute_dh.as_ref(),
                        compute_dh_many.as_deref(),
                        compute_d2h.as_ref(),
                        compute_d2h_many.as_deref(),
                        None,
                        None,
                        None,
                        None,
                        None,
                        None,
                        None,
                        robust_jeffreys_hphi.clone(),
                        // ValueOnly: the gradient is supplied separately below, so
                        // the H_Φ mode-response drift (a gradient-only term) is not
                        // needed here.
                        None,
                    )?;
                    return Ok(OuterObjectiveEvalResult {
                        objective: value_only.objective,
                        gradient,
                        outer_hessian: crate::solver::outer_strategy::HessianResult::Unavailable,
                        warm_start: value_only.warm_start,
                        inner_converged: inner.converged,
                    });
                }
            }
        }
    }

    // Try build_joint_hessian_closures which handles both exact Newton and
    // surrogate Hessian sources, then call joint_outer_evaluate with no
    // extended coordinates.
    if let Some(joint_bundle) = build_joint_hessian_closures(
        family,
        &inner.block_states,
        specs,
        total,
        options,
        inner.joint_workspace.clone(),
    )? {
        let JointHessianBundle {
            source: h_joint_unpen,
            beta_flat,
            compute_dh,
            compute_dh_many,
            compute_d2h,
            compute_d2h_many,
            owned_compute_dh,
            owned_compute_dh_many,
            owned_compute_d2h,
            owned_compute_d2h_many,
            rho_curvature_scale,
            hessian_logdet_correction,
        } = joint_bundle;
        let eval_result = joint_outer_evaluate(
            &inner,
            specs,
            &per_block,
            rho_current,
            &beta_flat,
            h_joint_unpen,
            &ranges,
            total,
            ridge,
            moderidge,
            extra_logdet_ridge,
            rho_curvature_scale,
            hessian_logdet_correction,
            include_logdet_h,
            include_logdet_s,
            strict_spd,
            // VALUE/GRADIENT CONSISTENCY: when a batched (Smooth-mode) gradient
            // override is pending, it will replace `eval_result.gradient` below,
            // so the value (and outer Hessian) here must use the SAME spectral
            // convention as that gradient — the family's `pseudo_logdet_mode`
            // (Smooth `r_ε`), NOT the projected #752 generalized determinant. The
            // projected determinant is used only when no batched override is
            // active (the ψ-bearing matern marginal-slope route, gam#808/#787),
            // where this call produces a matched projected value+gradient+Hessian.
            if batched_gradient_override.is_some() {
                false
            } else {
                family.use_projected_penalty_logdet()
            },
            eval_mode,
            options,
            rho_prior.clone(),
            family.pseudo_logdet_mode(),
            compute_dh.as_ref(),
            compute_dh_many.as_deref(),
            compute_d2h.as_ref(),
            compute_d2h_many.as_deref(),
            owned_compute_dh,
            owned_compute_dh_many,
            owned_compute_d2h,
            owned_compute_d2h_many,
            None, // no ext_coords when psi_dim == 0
            None,
            custom_family_batched_outer_hessian_operator(
                family,
                &inner.block_states,
                specs,
                derivative_blocks.as_ref(),
                rho_current,
                inner.joint_workspace.clone(),
                eval_mode,
            )?,
            custom_family_outer_jeffreys_hphi(family, &inner.block_states, specs, &ranges)?,
            custom_family_outer_jeffreys_hphi_drift(family, &inner.block_states, specs, &ranges)?,
        )?;

        let mut eval_result = eval_result;
        if let Some(batched_grad) = batched_gradient_override.take()
            && batched_grad.len() == eval_result.gradient.len()
        {
            eval_result.gradient = batched_grad;
        }
        return Ok(eval_result);
    }

    // Joint Hessian unavailable via either exact Newton or surrogate.
    // The generic fallback is only mathematically defensible for single-block
    // families — multi-block families with coupled likelihood curvature require
    // the joint path.
    if family.requires_joint_outer_hyper_path() {
        return Err(
            "outer hyper-derivative evaluation requires a joint exact path for this family"
                .to_string()
                .into(),
        );
    }

    // Generic fallback: single-block only. Extract the per-block Hessian and
    // route through joint_outer_evaluate with the single block as the "joint"
    // system.
    if specs.len() != 1 {
        return Err(
            "generic outer fallback is only valid for single-block families; multi-block families must provide a joint outer path"
                .to_string()
                .into(),
        );
    }
    let eval = family.evaluate(&inner.block_states)?;
    let b = 0;
    let spec = &specs[b];
    let work = &eval.blockworking_sets[b];
    let p = spec.design.ncols();
    let mut diagonal_design = None::<DesignMatrix>;
    let h_joint_unpen = match work {
        BlockWorkingSet::Diagonal {
            working_response: _,
            working_weights,
        } => with_block_geometry(family, &inner.block_states, spec, b, |x_dyn, _| {
            let w = floor_positiveworking_weights(working_weights, options.minweight);
            let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
            diagonal_design = Some(x_dyn.clone());
            Ok(xtwx)
        })?,
        BlockWorkingSet::ExactNewton {
            gradient: _,
            hessian,
        } => {
            if hessian.nrows() != p || hessian.ncols() != p {
                crate::bail_dim_custom!(
                    "block {b} exact-newton Hessian shape mismatch in outer gradient: got {}x{}, expected {}x{}",
                    hessian.nrows(),
                    hessian.ncols(),
                    p,
                    p
                );
            }
            hessian.to_dense()
        }
    };

    let beta_flat = inner.block_states[b].beta.clone();

    // Build a derivative provider that computes D_β H_L[direction] on demand.
    let compute_dh = |direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
        if !include_logdet_h {
            return Ok(None);
        }
        match work {
            BlockWorkingSet::ExactNewton { .. } => {
                match family.exact_newton_hessian_directional_derivative(
                    &inner.block_states,
                    b,
                    direction,
                )? {
                    Some(h_exact) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                        h_exact,
                        p,
                        &format!("block {b} exact-newton dH shape mismatch"),
                    )?))),
                    None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                        "missing exact-newton dH callback for block {b} while REML gradient requires H_beta term"
                    ) }.into()),
                }
            }
            BlockWorkingSet::Diagonal {
                working_response: _,
                working_weights,
            } => {
                let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
                    format!("missing dynamic design for block {b} diagonal correction")
                })?;
                let wwork = floor_positiveworking_weights(working_weights, options.minweight);
                let x_dense = x_dyn.to_dense();
                let n = x_dense.nrows();

                let mut d_eta = x_dyn.matrixvectormultiply(direction);
                let geom = family.block_geometry_directional_derivative(
                    &inner.block_states,
                    b,
                    spec,
                    direction,
                )?;
                let mut correction_mat = Array2::<f64>::zeros((p, p));

                if let Some(geom_dir) = geom {
                    d_eta += &geom_dir.d_offset;
                    if let Some(dx) = geom_dir.d_design {
                        d_eta += &dx.dot(&beta_flat);
                        let mut wx = x_dense.clone();
                        let mut wdx = dx.clone();
                        ndarray::Zip::from(wx.rows_mut())
                            .and(wdx.rows_mut())
                            .and(wwork.view())
                            .par_for_each(|mut wxr, mut wdxr, &wi| {
                                if wi != 1.0 {
                                    wxr.mapv_inplace(|v| v * wi);
                                    wdxr.mapv_inplace(|v| v * wi);
                                }
                            });
                        // Same X'(W·Y) pattern as the parallel sibling at
                        // line ~9258; route through faer for SIMD GEMM
                        // (n × p² flops at large-scale moderate scale).
                        correction_mat += &fast_atb(&dx, &wx);
                        correction_mat += &fast_atb(&x_dense, &wdx);
                    }
                }

                let dw = family
                    .diagonalworking_weights_directional_derivative(
                        &inner.block_states,
                        b,
                        &d_eta,
                    )?
                    .ok_or_else(|| {
                        format!(
                            "missing diagonal dW callback for block {b} while REML gradient requires H_beta term"
                        )
                    })?;
                if dw.len() != n {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "block {b} diagonal dW length mismatch: got {}, expected {}",
                            dw.len(),
                            n
                        ),
                    }
                    .into());
                }
                let mut scaled_x = x_dense.clone();
                ndarray::Zip::from(scaled_x.rows_mut())
                    .and(&dw)
                    .par_for_each(|mut sr, &dwi| sr.mapv_inplace(|v| v * dwi));
                // X'(diag(dW)·X) outer correction term — faer route, same
                // rationale as above.
                correction_mat += &fast_atb(&x_dense, &scaled_x);

                Ok(Some(DriftDerivResult::Dense(correction_mat)))
            }
        }
    };

    // Build a derivative provider that computes D²_β H_L[u, v] on demand.
    let compute_d2h = |u: &Array1<f64>,
                       v: &Array1<f64>|
     -> Result<Option<DriftDerivResult>, String> {
        if !include_logdet_h {
            return Ok(None);
        }
        match work {
            BlockWorkingSet::ExactNewton { .. } => {
                match family.exact_newton_hessian_second_directional_derivative(
                    &inner.block_states,
                    b,
                    u,
                    v,
                )? {
                    Some(h_exact) => Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                        h_exact,
                        p,
                        &format!("block {b} exact-newton d2H shape mismatch"),
                    )?))),
                    None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                        "missing exact-newton d2H callback for block {b} while REML Hessian requires H_beta_beta term"
                    ) }.into()),
                }
            }
            BlockWorkingSet::Diagonal {
                working_response: _,
                working_weights: _,
            } => {
                let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
                    format!("missing dynamic design for block {b} diagonal second correction")
                })?;
                let x_dense = x_dyn.to_dense();
                let n = x_dense.nrows();

                let reject_second_order_geometry = |label: &str,
                                                    geom: Option<
                    BlockGeometryDirectionalDerivative,
                >|
                 -> Result<(), String> {
                    if let Some(geom_dir) = geom {
                        let has_offset = geom_dir.d_offset.iter().any(|value| *value != 0.0);
                        if geom_dir.d_design.is_some() || has_offset {
                            return Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                "block {b} diagonal d2H requires second-order block-geometry derivatives for {label}; use an exact-newton or joint outer path"
                            ) }.into());
                        }
                    }
                    Ok(())
                };
                reject_second_order_geometry(
                    "first direction",
                    family.block_geometry_directional_derivative(
                        &inner.block_states,
                        b,
                        spec,
                        u,
                    )?,
                )?;
                reject_second_order_geometry(
                    "second direction",
                    family.block_geometry_directional_derivative(
                        &inner.block_states,
                        b,
                        spec,
                        v,
                    )?,
                )?;

                let d_eta_u = x_dyn.matrixvectormultiply(u);
                let d_eta_v = x_dyn.matrixvectormultiply(v);
                let d2w = family
                    .diagonalworking_weights_second_directional_derivative(
                        &inner.block_states,
                        b,
                        &d_eta_u,
                        &d_eta_v,
                    )?
                    .ok_or_else(|| {
                        format!(
                            "missing diagonal d2W callback for block {b} while REML Hessian requires H_beta_beta term"
                        )
                    })?;
                if d2w.len() != n {
                    return Err(CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "block {b} diagonal d2W length mismatch: got {}, expected {}",
                            d2w.len(),
                            n
                        ),
                    }
                    .into());
                }
                let mut scaled_x = x_dense.clone();
                ndarray::Zip::from(scaled_x.rows_mut())
                    .and(&d2w)
                    .par_for_each(|mut sr, &d2wi| sr.mapv_inplace(|value| value * d2wi));
                Ok(Some(DriftDerivResult::Dense(fast_atb(&x_dense, &scaled_x))))
            }
        }
    };

    let eval_result = joint_outer_evaluate(
        &inner,
        specs,
        &per_block,
        rho_current,
        &beta_flat,
        JointHessianSource::Dense(h_joint_unpen),
        &ranges,
        total,
        ridge,
        moderidge,
        extra_logdet_ridge,
        1.0,
        0.0,
        include_logdet_h,
        include_logdet_s,
        strict_spd,
        family.use_projected_penalty_logdet(),
        eval_mode,
        options,
        rho_prior,
        family.pseudo_logdet_mode(),
        &compute_dh,
        None,
        &compute_d2h,
        None,
        None,
        None,
        None,
        None,
        None, // no ext_coords for generic single-block fallback
        None,
        custom_family_batched_outer_hessian_operator(
            family,
            &inner.block_states,
            specs,
            derivative_blocks.as_ref(),
            rho_current,
            inner.joint_workspace.clone(),
            eval_mode,
        )?,
        robust_jeffreys_hphi,
        custom_family_outer_jeffreys_hphi_drift(family, &inner.block_states, specs, &ranges)?,
    )?;

    Ok(eval_result)
}

/// Evaluate the joint custom-family hyper surface at `rho_current` for
/// borrowed ψ-derivative blocks.
///
/// The block specs are validated first. The fit options and warm start are
/// then passed through `derivative_quality_options_and_warm_start`, and the
/// evaluation itself is delegated to `evaluate_custom_family_hyper_internal`
/// with a flat rho prior.
///
/// # Errors
///
/// Propagates any failure from block-spec validation or from the internal
/// evaluator.
pub fn evaluate_custom_family_joint_hyper<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    warm_start: Option<&CustomFamilyWarmStart>,
    eval_mode: EvalMode,
) -> Result<CustomFamilyJointHyperResult, CustomFamilyError> {
    let penalty_counts = validate_blockspecs(specs)?;

    // At least one block carries a ψ derivative iff not every block is empty.
    let any_psi_coordinate = !derivative_blocks.iter().all(Vec::is_empty);
    let (tuned_options, psi_safe_seed) =
        derivative_quality_options_and_warm_start(options, warm_start, any_psi_coordinate);

    // Prefer the seed produced by the options helper; otherwise fall back to
    // the caller's own warm start (if any).
    let inner_seed = match psi_safe_seed.as_ref() {
        Some(seed) => Some(&seed.inner),
        None => warm_start.map(|seed| &seed.inner),
    };

    let outer_eval = evaluate_custom_family_hyper_internal(
        family,
        specs,
        &tuned_options,
        &penalty_counts,
        rho_current,
        derivative_blocks,
        inner_seed,
        crate::types::RhoPrior::Flat,
        eval_mode,
    )?;
    Ok(outer_eval_result_to_joint_hyper_result(outer_eval))
}

/// Shared-ownership counterpart of `evaluate_custom_family_joint_hyper`.
///
/// Takes the ψ-derivative blocks as a `SharedDerivativeBlocks` handle and
/// forwards it to `evaluate_custom_family_hyper_internal_shared`. Options and
/// warm start are adjusted exactly as in the borrowed entry point, and the rho
/// prior is flat.
///
/// # Errors
///
/// Propagates any failure from block-spec validation or from the internal
/// evaluator.
pub(crate) fn evaluate_custom_family_joint_hyper_shared<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: SharedDerivativeBlocks,
    warm_start: Option<&CustomFamilyWarmStart>,
    eval_mode: EvalMode,
) -> Result<CustomFamilyJointHyperResult, CustomFamilyError> {
    let penalty_counts = validate_blockspecs(specs)?;

    // At least one block carries a ψ derivative iff not every block is empty.
    let any_psi_coordinate = !derivative_blocks.iter().all(Vec::is_empty);
    let (tuned_options, psi_safe_seed) =
        derivative_quality_options_and_warm_start(options, warm_start, any_psi_coordinate);

    // Prefer the seed produced by the options helper; otherwise fall back to
    // the caller's own warm start (if any).
    let inner_seed = match psi_safe_seed.as_ref() {
        Some(seed) => Some(&seed.inner),
        None => warm_start.map(|seed| &seed.inner),
    };

    let outer_eval = evaluate_custom_family_hyper_internal_shared(
        family,
        specs,
        &tuned_options,
        &penalty_counts,
        rho_current,
        derivative_blocks,
        inner_seed,
        crate::types::RhoPrior::Flat,
        eval_mode,
    )?;
    Ok(outer_eval_result_to_joint_hyper_result(outer_eval))
}

/// Derive the fit options and warm start used by the direct joint-hyper
/// evaluators.
///
/// Returns a clone of `options` (possibly with an adjusted inner tolerance and
/// cycle budget) together with an optional replacement warm start.
///
/// * Without ψ derivatives the API is just the rho-only outer surface, so the
///   options are returned untouched and no replacement warm start is produced;
///   changing the inner tolerance there would make this path evaluate a
///   different function than the rho-only path.
/// * With ψ derivatives the inner tolerance is aligned to
///   `max(outer_tol, 1e-10)` whenever more than one inner cycle is allowed and
///   the tolerance actually differs. If that alignment tightens the tolerance,
///   the inner cycle budget is raised to at least 200.
fn derivative_quality_options_and_warm_start(
    options: &BlockwiseFitOptions,
    warm_start: Option<&CustomFamilyWarmStart>,
    has_psi_derivatives: bool,
) -> (BlockwiseFitOptions, Option<CustomFamilyWarmStart>) {
    const DIRECT_JOINT_HYPER_INNER_TOL_FLOOR: f64 = 1e-10;
    const DIRECT_JOINT_HYPER_MIN_CYCLES: usize = 200;

    let mut tuned = options.clone();
    if !has_psi_derivatives {
        return (tuned, None);
    }

    // The inner target follows the outer optimizer's own tolerance instead of
    // hard-forcing f64-precision KKT solves: large-scale survival
    // marginal-slope fits have row-summed objectives around 1e5-1e6, so a
    // `1e-10 * objective` target would ask the inner loop to resolve gradient
    // components far below `outer_tol` and reject startup seeds after hundreds
    // of accepted but numerically flat Newton steps. Matching the two keeps
    // the IFT gradient noise below the requested optimization accuracy.
    let target_inner_tol = tuned.outer_tol.max(DIRECT_JOINT_HYPER_INNER_TOL_FLOOR);
    let must_tighten = tuned.inner_tol > target_inner_tol;
    let should_align = tuned.inner_max_cycles > 1 && tuned.inner_tol != target_inner_tol;

    // The ψ-bearing warm start is always routed through
    // `warm_start_without_cached_inner_for_psi_derivatives`, whether or not
    // the tolerance ends up being aligned.
    let psi_safe_seed = warm_start_without_cached_inner_for_psi_derivatives(
        warm_start.map(|seed| &seed.inner),
        true,
    )
    .map(|inner| CustomFamilyWarmStart { inner });

    if should_align {
        tuned.inner_tol = target_inner_tol;
        if must_tighten {
            // A tighter tolerance needs room to converge.
            tuned.inner_max_cycles = tuned.inner_max_cycles.max(DIRECT_JOINT_HYPER_MIN_CYCLES);
        }
    }
    (tuned, psi_safe_seed)
}

/// Clone `options`, replacing its `outer_tol` with the larger of the existing
/// value and the supplied `outer_tol`. All other fields are copied unchanged.
pub(crate) fn joint_hyper_options_for_outer_tolerance(
    options: &BlockwiseFitOptions,
    outer_tol: f64,
) -> BlockwiseFitOptions {
    let widened_outer_tol = options.outer_tol.max(outer_tol);
    let mut adjusted = options.clone();
    adjusted.outer_tol = widened_outer_tol;
    adjusted
}

/// Core of the ψ-bearing joint `[rho, psi]` EFS evaluation.
///
/// Steps, in order:
/// 1. validate that `derivative_blocks` and `penalty_counts` have one entry per
///    block in `specs`, that at least one ψ coordinate exists, and that
///    `rho_current` has one entry per penalty;
/// 2. run `inner_blockwise_fit` at `rho_current`;
/// 3. if the inner solve did not converge, return the
///    `nonconverged_outer_efs_result` without assembling derivatives;
/// 4. otherwise pick a joint Hessian source, build the per-block penalty
///    pseudo-logdets and ψ hyper-coordinates, and call
///    `joint_outer_evaluate_efs`.
///
/// On the converged path the result is the EFS evaluation, a
/// `ConstrainedWarmStart` built from the inner solution, and the inner
/// convergence flag.
///
/// # Errors
///
/// Returns a dimension error for any of the count mismatches above,
/// `CustomFamilyError::InvalidInput` when there are no ψ coordinates, an error
/// when no joint exact-Newton Hessian can be obtained, and otherwise
/// propagates failures from the helpers it calls.
fn evaluate_custom_family_joint_hyper_efs_internal_shared<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho_current: &Array1<f64>,
    derivative_blocks: SharedDerivativeBlocks,
    warm_start: Option<&ConstrainedWarmStart>,
) -> Result<
    (
        crate::solver::outer_strategy::EfsEval,
        ConstrainedWarmStart,
        bool,
    ),
    CustomFamilyError,
> {
    // Structural validation: one derivative-block list and one penalty count
    // per parameter block.
    if derivative_blocks.len() != specs.len() {
        crate::bail_dim_custom!(
            "joint hyper derivative block count mismatch: got {}, expected {}",
            derivative_blocks.len(),
            specs.len()
        );
    }
    if penalty_counts.len() != specs.len() {
        crate::bail_dim_custom!(
            "joint hyper penalty-count block mismatch: got {}, expected {}",
            penalty_counts.len(),
            specs.len()
        );
    }

    // `rho_dim` is the total penalty count; `psi_dim` is the total number of
    // ψ-derivative entries summed over blocks.
    let rho_dim = penalty_counts.iter().sum::<usize>();
    let psi_dim = derivative_blocks.iter().map(Vec::len).sum::<usize>();
    // This evaluator only handles ψ-bearing surfaces; the ψ-free case is
    // rejected here.
    if psi_dim == 0 {
        return Err(CustomFamilyError::InvalidInput {
            context: "evaluate_custom_family_joint_hyper_efs",
            reason: "joint hyper EFS requires at least one ψ coordinate".to_string(),
        });
    }
    if rho_current.len() != rho_dim {
        crate::bail_dim_custom!(
            "joint hyper rho dimension mismatch: got {}, expected {} (psi={})",
            rho_current.len(),
            rho_dim,
            psi_dim
        );
    }

    let include_logdet_h = include_exact_newton_logdet_h(family, options);
    let include_logdet_s = include_exact_newton_logdet_s(family, options);
    let strict_spd = use_exact_newton_strict_spd(family);
    let per_block = split_log_lambdas(rho_current, penalty_counts)?;
    // Prefer the warm start returned by
    // `warm_start_without_cached_inner_for_psi_derivatives`; when it yields
    // `None`, fall back to the caller's warm start as-is.
    let psi_safe_warm_start = warm_start_without_cached_inner_for_psi_derivatives(warm_start, true);
    let mut inner = inner_blockwise_fit(
        family,
        specs,
        &per_block,
        options,
        psi_safe_warm_start.as_ref().or(warm_start),
    )?;
    // Non-converged inner solve: log it and return the dedicated
    // non-converged result instead of assembling derivatives.
    if !inner.converged {
        let theta_dim = rho_dim + psi_dim;
        log::warn!(
            "[OUTER] custom-family joint-hyper EFS inner solve did not converge after {} cycle(s); \
             skipping joint-hyper EFS derivative assembly for theta_dim={} (rho_dim={}, psi_dim={})",
            inner.cycles,
            theta_dim,
            rho_dim,
            psi_dim,
        );
        return nonconverged_outer_efs_result(
            &inner,
            rho_current,
            theta_dim,
            include_logdet_h,
            include_logdet_s,
            "custom-family joint-hyper EFS non-converged inner solve",
        )
        .map_err(CustomFamilyError::from);
    }
    // The solver ridge is passed as `moderidge` only when the ridge policy
    // includes it in the quadratic penalty. If the policy includes it in the
    // penalty logdet but NOT in the quadratic penalty, it is passed separately
    // as `extra_logdet_ridge`. In every other case the respective value is 0.
    let ridge = effective_solverridge(options.ridge_floor);
    let moderidge = if options.ridge_policy.include_quadratic_penalty {
        ridge
    } else {
        0.0
    };
    let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
        && !options.ridge_policy.include_quadratic_penalty
    {
        ridge
    } else {
        0.0
    };

    refresh_all_block_etas(family, specs, &mut inner.block_states)?;
    let ranges = block_param_ranges(specs);
    // `total` is the end offset of the last block, or 0 when there are no
    // blocks.
    let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);

    let beta_flat = flatten_state_betas(&inner.block_states, specs);
    // The synchronized states are wrapped in an `Arc` because they are handed
    // to several derivative closures below.
    let synced_joint_states = Arc::new(synchronized_states_from_flat_beta(
        family,
        specs,
        &inner.block_states,
        &beta_flat,
    )?);
    let hessian_workspace = family.exact_newton_joint_hessian_workspace_with_options(
        synced_joint_states.as_ref(),
        specs,
        options,
    )?;
    // Outer-eval entry: prime per-row jet caches before the ext-coord
    // par_iter — see `warm_up_outer_caches` doc.
    if let Some(workspace) = hessian_workspace.as_ref() {
        workspace.warm_up_outer_caches()?;
    }
    // Joint Hessian source selection, in priority order:
    //   1. a family-supplied outer curvature, which brings its own rho
    //      curvature scale and Hessian logdet correction;
    //   2. the Hessian workspace, if one exists and yields a source;
    //   3. the symmetrized joint exact-Newton Hessian, wrapped as dense.
    // Cases 2 and 3 use unit scale and zero correction. If none of the three
    // yields a Hessian, the evaluation fails.
    let (
        h_joint_unpen,
        rho_curvature_scale,
        hessian_logdet_correction,
        use_outer_curvature_derivatives,
    ) = if let Some(curvature) = family.exact_newton_outer_curvature(&inner.block_states)? {
        (
            JointHessianSource::Dense(symmetrized_square_matrix(
                curvature.hessian,
                total,
                "joint exact-newton Hessian shape mismatch in joint hyper EFS evaluator (rescaled)",
            )?),
            curvature.rho_curvature_scale,
            curvature.hessian_logdet_correction,
            true,
        )
    } else {
        let h_joint_unpen = if let Some(workspace) = hessian_workspace.as_ref() {
            exact_newton_joint_hessian_source_from_workspace(
                workspace,
                total,
                MaterializationIntent::OuterEvaluation,
                "joint exact-newton operator mismatch in joint hyper EFS evaluator",
            )?
        } else {
            None
        };
        (
            match h_joint_unpen {
                Some(source) => Some(source),
                None => exact_newton_joint_hessian_symmetrized(
                    family,
                    &inner.block_states,
                    specs,
                    total,
                    "joint exact-newton Hessian shape mismatch in joint hyper EFS evaluator",
                )
                .map(|source| source.map(JointHessianSource::Dense))?,
            }
            .ok_or_else(|| -> CustomFamilyError {
                "joint exact-newton Hessian unavailable for full [rho, psi] fixed-point outer calculus"
                    .to_string()
                    .into()
            })?,
            1.0,
            0.0,
            false,
        )
    };

    // Per-block penalty pseudo-logdets, computed in parallel over blocks. Each
    // block assembles S_lambda = sum_k exp(rho_k) * S_k and, when the ridge
    // policy includes the penalty logdet, adds `ridge` to the diagonal and
    // passes it along as a hint. The first failing block aborts the evaluation.
    let s_logdet_blocks = if include_logdet_s {
        use rayon::iter::{IntoParallelIterator, ParallelIterator};
        let block_results: Vec<Result<PenaltyPseudologdet, String>> = (0..specs.len())
            .into_par_iter()
            .map(|b| {
                let spec = &specs[b];
                let p = spec.design.ncols();
                let lambdas = per_block[b].mapv(f64::exp);
                let mut s_lambda = Array2::<f64>::zeros((p, p));
                for (k, s) in spec.penalties.iter().enumerate() {
                    s.add_scaled_to(lambdas[k], &mut s_lambda);
                }
                let ridge_hint = if options.ridge_policy.include_penalty_logdet {
                    for d in 0..p {
                        s_lambda[[d, d]] += ridge;
                    }
                    Some(ridge)
                } else {
                    None
                };
                // No metadata-based structural-nullity hint: the
                // PenaltyPseudologdet classifier derives the positive
                // eigenspace from the assembled spectrum alone (issues
                // #192/#318).
                PenaltyPseudologdet::from_assembled(s_lambda, ridge_hint)
            })
            .collect();
        let blocks: Result<Vec<_>, _> = block_results.into_iter().collect();
        Some(blocks?)
    } else {
        None
    };

    let hessian_beta_independent = !family.exact_newton_joint_hessian_beta_dependent();
    // The ψ workspace is only requested when the family opts in for
    // first-order terms.
    let psi_workspace = if family.exact_newton_joint_psi_workspace_for_first_order_terms() {
        family.exact_newton_joint_psi_workspace_with_options(
            synced_joint_states.as_ref(),
            specs,
            derivative_blocks.as_ref(),
            options,
        )?
    } else {
        None
    };
    let rho_slice = rho_current
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
    let psi_coords = build_psi_hyper_coords(
        family,
        synced_joint_states.as_ref(),
        specs,
        derivative_blocks.as_ref(),
        &beta_flat,
        rho_slice,
        penalty_counts,
        s_logdet_blocks.as_deref(),
        hessian_beta_independent,
        psi_workspace.clone(),
    )?;
    // Only the ψ coordinates themselves are supplied; every optional callback
    // of the bundle is left unset.
    let ext_bundle = ExtCoordBundle {
        coords: psi_coords,
        ext_ext_fn: None,
        rho_ext_fn: None,
        drift_fn: None,
        contracted_psi_fn: None,
    };

    // Hessian-derivative closures, built in borrowed and owned variants. When
    // the family supplied the outer curvature, the single-direction closures
    // receive unit scale (instead of `rho_curvature_scale`) and the batched
    // `*_many` closures are disabled.
    let compute_dh = exact_newton_dh_closure(
        family,
        Arc::clone(&synced_joint_states),
        specs,
        total,
        use_outer_curvature_derivatives,
        if use_outer_curvature_derivatives {
            1.0
        } else {
            rho_curvature_scale
        },
        hessian_workspace.clone(),
    );
    let compute_dh_many = if use_outer_curvature_derivatives {
        None
    } else {
        exact_newton_dh_many_closure(rho_curvature_scale, hessian_workspace.clone())
    };
    let compute_d2h = exact_newton_d2h_closure(
        family,
        Arc::clone(&synced_joint_states),
        specs,
        total,
        use_outer_curvature_derivatives,
        if use_outer_curvature_derivatives {
            1.0
        } else {
            rho_curvature_scale
        },
        hessian_workspace.clone(),
    );
    let owned_compute_dh = exact_newton_dh_closure_owned(
        family.clone(),
        Arc::clone(&synced_joint_states),
        specs.to_vec(),
        total,
        use_outer_curvature_derivatives,
        if use_outer_curvature_derivatives {
            1.0
        } else {
            rho_curvature_scale
        },
        hessian_workspace.clone(),
    );
    let owned_compute_dh_many = if use_outer_curvature_derivatives {
        None
    } else {
        exact_newton_dh_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
    };
    let owned_compute_d2h = exact_newton_d2h_closure_owned(
        family.clone(),
        Arc::clone(&synced_joint_states),
        specs.to_vec(),
        total,
        use_outer_curvature_derivatives,
        if use_outer_curvature_derivatives {
            1.0
        } else {
            rho_curvature_scale
        },
        hessian_workspace.clone(),
    );
    let compute_d2h_many = if use_outer_curvature_derivatives {
        None
    } else {
        exact_newton_d2h_many_closure(rho_curvature_scale, hessian_workspace.clone())
    };
    let owned_compute_d2h_many = if use_outer_curvature_derivatives {
        None
    } else {
        exact_newton_d2h_many_closure_owned(rho_curvature_scale, hessian_workspace.clone())
    };

    let efs_eval = joint_outer_evaluate_efs(
        &inner,
        specs,
        &per_block,
        rho_current,
        &beta_flat,
        h_joint_unpen,
        &ranges,
        total,
        ridge,
        moderidge,
        extra_logdet_ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        include_logdet_h,
        include_logdet_s,
        strict_spd,
        // ψ-bearing EFS path: projected #752 generalized determinant for value
        // and gradient (matched in this single _efs call). Same root-cause fix as
        // the VGH ψ path (gam#808/#787); no batched override here.
        family.use_projected_penalty_logdet(),
        options,
        crate::types::RhoPrior::Flat,
        family.pseudo_logdet_mode(),
        &compute_dh,
        compute_dh_many.as_deref(),
        &compute_d2h,
        compute_d2h_many.as_deref(),
        Some(owned_compute_dh),
        owned_compute_dh_many,
        Some(owned_compute_d2h),
        owned_compute_d2h_many,
        Some(ext_bundle),
    )
    .map_err(CustomFamilyError::from)?;

    // Package the inner solution (rho, per-block beta, active sets and the
    // cached inner mode) as the warm start handed back to the caller.
    let warm = ConstrainedWarmStart {
        rho: rho_current.clone(),
        block_beta: inner
            .block_states
            .iter()
            .map(|state| state.beta.clone())
            .collect(),
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(&inner)),
    };

    // The non-converged case returned early above, so `inner.converged` is
    // true whenever this line is reached.
    Ok((efs_eval, warm, inner.converged))
}

/// Fixed-point (EFS) evaluation of the joint custom-family hyper-surface, for
/// the outer EFS / hybrid-EFS planners.
///
/// This is the borrowed entry point. It copies the `&[Vec<…>]` derivative
/// blocks into a fresh `Arc` and hands everything to
/// `evaluate_custom_family_joint_hyper_efs_shared`, which owns validation, the
/// empty-block fast path, and the internal evaluator dispatch.
///
/// # Errors
///
/// Returns whatever the shared implementation returns.
pub fn evaluate_custom_family_joint_hyper_efs<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    warm_start: Option<&CustomFamilyWarmStart>,
) -> Result<CustomFamilyJointHyperEfsResult, CustomFamilyError> {
    // Lift the borrowed blocks into shared ownership so there is a single
    // source of truth for the evaluation logic.
    let shared_blocks = Arc::new(derivative_blocks.to_vec());
    evaluate_custom_family_joint_hyper_efs_shared(
        family,
        specs,
        options,
        rho_current,
        shared_blocks,
        warm_start,
    )
}

/// Shared-ownership implementation behind
/// `evaluate_custom_family_joint_hyper_efs`.
///
/// Validates the block specs and the derivative-block count, then dispatches:
/// when every derivative block is empty it evaluates `outerobjectiveefs`;
/// otherwise it runs the ψ-bearing internal evaluator. Both branches use the
/// caller's inner warm start and a flat rho prior, and the outcome is converted
/// with `outer_efs_result_to_joint_hyper_efs_result`.
///
/// # Errors
///
/// Fails on invalid block specs, on a derivative-block count that differs from
/// `specs.len()`, or on any error from the selected evaluator.
pub(crate) fn evaluate_custom_family_joint_hyper_efs_shared<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_current: &Array1<f64>,
    derivative_blocks: SharedDerivativeBlocks,
    warm_start: Option<&CustomFamilyWarmStart>,
) -> Result<CustomFamilyJointHyperEfsResult, CustomFamilyError> {
    let penalty_counts = validate_blockspecs(specs)?;
    if derivative_blocks.len() != specs.len() {
        crate::bail_dim_custom!(
            "joint hyper derivative block count mismatch: got {}, expected {}",
            derivative_blocks.len(),
            specs.len()
        );
    }

    let inner_seed = warm_start.map(|seed| &seed.inner);
    let has_psi_coordinate = !derivative_blocks.iter().all(Vec::is_empty);

    let (efs_eval, next_warm_start, inner_converged) = if has_psi_coordinate {
        evaluate_custom_family_joint_hyper_efs_internal_shared(
            family,
            specs,
            options,
            &penalty_counts,
            rho_current,
            derivative_blocks,
            inner_seed,
        )?
    } else {
        // No ψ coordinates anywhere: this is the plain rho-only EFS objective.
        outerobjectiveefs(
            family,
            specs,
            options,
            &penalty_counts,
            rho_current,
            inner_seed,
            crate::types::RhoPrior::Flat,
        )
        .map_err(CustomFamilyError::from)?
    };

    Ok(outer_efs_result_to_joint_hyper_efs_result(
        efs_eval,
        next_warm_start,
        inner_converged,
    ))
}

/// Convert the offsets from `block_offsets_from_specs` into `(start, end)`
/// pairs, one per offset and in the same order.
fn block_param_ranges(specs: &[ParameterBlockSpec]) -> Vec<(usize, usize)> {
    let offsets = block_offsets_from_specs(specs);
    let mut bounds = Vec::new();
    for span in offsets.iter() {
        bounds.push((span.start, span.end));
    }
    bounds
}

/// Assemble the joint Jeffreys/Firth basis `Z_J` for the universal robustness
/// term as a block-diagonal stack of per-block bases.
///
/// Every block contributes its FULL reduced coefficient span (`I_p` per
/// block). The Jeffreys score is `O(1)` against the data's `O(n)` Fisher
/// information, so on the full span it acts as the `O(1/n)` Firth bias
/// correction along data-identified directions (no bias on genuine smooth
/// fits) while supplying the missing `O(1)`-bounding curvature along ANY
/// near-separating direction, penalized (`range(S)`) or not (`ker(S)`). The
/// inner objective thereby becomes coercive with a finite unique minimizer.
/// An earlier `ker(S)`-only scoping could not reach near-separation on a
/// penalized spline direction, which was the residual BMS-probit pathology.
///
/// Each per-block basis is placed at its block's row offset and at the running
/// column offset of the joint `total_p x m_total` matrix. The result is `None`
/// when the system has no coefficients or no basis columns.
///
/// Whether this basis contributes at the current iterate is decided by the
/// Jeffreys conditioning gate, not by the smoothing-penalty null space.
fn build_joint_jeffreys_subspace(
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
) -> Result<Option<Array2<f64>>, String> {
    let total_p = match ranges.last() {
        Some(&(_, end)) if end > 0 => end,
        _ => return Ok(None),
    };

    // Full identifiable-span Jeffreys: the all-zero aggregate penalty only
    // fixes each block's dimension, so the span does not depend on `ker(S)`.
    let mut m_total = 0usize;
    let per_block = (0..specs.len())
        .map(|b| -> Result<Array2<f64>, String> {
            let (start, end) = ranges[b];
            let p_block = end - start;
            let zero_penalty = Array2::<f64>::zeros((p_block, p_block));
            let subspace =
                crate::estimate::reml::jeffreys_subspace::jeffreys_subspace_from_penalty(
                    zero_penalty.view(),
                )?;
            m_total += subspace.span_dim();
            Ok(subspace.columns)
        })
        .collect::<Result<Vec<_>, String>>()?;
    if m_total == 0 {
        return Ok(None);
    }

    // Embed each block's basis block-diagonally.
    let mut z_joint = Array2::<f64>::zeros((total_p, m_total));
    let mut col_offset = 0usize;
    for (columns, &(row_offset, _)) in per_block.iter().zip(ranges) {
        for ((i, j), &entry) in columns.indexed_iter() {
            z_joint[[row_offset + i, col_offset + j]] = entry;
        }
        col_offset += columns.ncols();
    }
    Ok(Some(z_joint))
}

/// Cheap, matrix-free conditioning pre-check: decide whether the always-on
/// Jeffreys term can be PROVABLY skipped at this working point without forming
/// the dense joint Hessian `H` or running the `O(p³)` reduced
/// eigendecomposition.
///
/// This is the performance gate in front of the expensive
/// `custom_family_joint_jeffreys_*` formation. On the full span (`Z_J = I`)
/// the reduced information is `H_id = H`, so the gate only needs the extreme
/// eigenvalues of `H`. These are bounded conservatively from a few
/// Hessian-vector products against the same `joint_hessian_source` operator
/// the inner Newton already built (matrix-free on the large-`p` path, dense
/// otherwise). When those bounds clear both gates with a safe margin (see
/// `jeffreys_term_skippable_via_matvec`), the exact gate is certain to return
/// the zero term. The caller can then skip the dense `H` materialization, the
/// `Z_JᵀHZ_J` build, the eigendecomposition, the `∇Φ`/`H_Φ` assembly and the Q1
/// outer drift, returning the exact-zero term, byte-identical to the gated-off
/// dense path.
///
/// Returns `Ok(false)` (never skip) whenever the cheap bounds are unresolved
/// or merely near the gate, so any fit where the term might bite still flows
/// to the exact formation. It also returns `Ok(false)` immediately when
/// `total_p` is below `CHEAP_CONDITIONING_PRECHECK_MIN_DIM`.
///
/// Matrix-free preservation: only `O(p·k)` (`k≤12`) matvecs are issued through
/// `source`, and nothing dense is formed at `p`-scale. On a well-conditioned
/// large-`p` matrix-free fit (the common case) the answer is `true` and
/// nothing dense is ever built. Only a genuinely near-separating large-`p` fit
/// (rare) returns `false` and falls through to the inherent `O(p²)` dense
/// `H_id`/`H_Φ` formation, where that cost is justified.
fn jeffreys_term_skippable_for_source(
    source: &JointHessianSource,
    total_p: usize,
) -> Result<bool, String> {
    use crate::estimate::reml::jeffreys_subspace as jeffreys;

    if total_p >= jeffreys::CHEAP_CONDITIONING_PRECHECK_MIN_DIM {
        // Hessian-vector product against the SAME observed information the
        // exact gate sees. The reduced information of `joint_jeffreys_term` is
        // `Z_JᵀHZ_J` with `Z_J = I`, i.e. the UNRIDGED likelihood joint Hessian
        // `H` that `exact_newton_joint_hessian_with_specs` materializes, and
        // both source variants apply that same `H`. The pre-check therefore
        // estimates the spectrum of precisely the matrix the dense path
        // eigendecomposes, so the skip decision and the exact gate agree by
        // construction, with no ridge discrepancy (the solver's separate
        // ridged solve operator is not involved).
        let apply_hessian = |direction: &Array1<f64>| -> Result<Array1<f64>, String> {
            match source {
                JointHessianSource::Operator { apply, .. } => apply(direction),
                JointHessianSource::Dense(matrix) => Ok(matrix.dot(direction)),
            }
        };
        jeffreys::jeffreys_term_skippable_via_matvec(apply_hessian, total_p)
    } else {
        // Below the dense-eigh-is-cheap threshold the matvec pre-check would
        // short-circuit to `false` anyway, so small fits (e.g. BMS p≈51) pay
        // nothing and run the exact dense path unchanged.
        Ok(false)
    }
}

/// Value-only evaluation of the Jeffreys objective
/// `Phi = 1/2 log|Z_J^T H Z_J|` at the current working point.
///
/// No directional derivatives are supplied (the derivative callback always
/// answers `Ok(None)`), which makes this cheaper than the full term. It is used
/// to keep the trust-region accept/reject objective consistent with the
/// Jeffreys-modified Newton step.
///
/// Returns `0.0` — i.e. the contribution is simply omitted for this trial
/// point — when:
/// * there is no coefficient system (`ranges` is empty / ends at 0, or
///   `z_joint` has no columns);
/// * the family reports an error, exposes no exact joint Hessian, or returns a
///   Hessian that is not `total_p × total_p`;
/// * `joint_jeffreys_term` itself fails (per the original design note, e.g. the
///   reduced Fisher information is not yet SPD). The step machinery still
///   bounds the coefficient, and the next accepted cycle re-folds a finite
///   value.
fn custom_family_joint_jeffreys_value<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
    z_joint: &Array2<f64>,
) -> f64 {
    // The joint dimension is the end offset of the last block range.
    let Some(&(_, total_p)) = ranges.last() else {
        return 0.0;
    };
    if total_p == 0 || z_joint.ncols() == 0 {
        return 0.0;
    }

    // Errors and "no exact Hessian" are both treated as "term not applicable".
    let Ok(Some(h_joint)) = family.exact_newton_joint_hessian_with_specs(states, specs) else {
        return 0.0;
    };
    let hessian_is_joint_square = h_joint.nrows() == total_p && h_joint.ncols() == total_p;
    if !hessian_is_joint_square {
        return 0.0;
    }

    crate::estimate::reml::jeffreys_subspace::joint_jeffreys_term(
        h_joint.view(),
        z_joint.view(),
        |_direction: &Array1<f64>| Ok(None),
    )
    .map_or(0.0, |(phi, _grad, _hphi)| phi)
}

/// Full family-general Jeffreys term `(Phi, grad, H_Phi)` at the current
/// working point, built from the coupled joint Hessian (Tier-B path).
///
/// The directional derivatives of the joint Hessian are obtained on demand from
/// `family.exact_newton_joint_hessian_directional_derivative_with_specs`.
///
/// Returns `Ok(None)` — the term is inapplicable and the caller proceeds
/// unchanged — when there is no coefficient system (empty ranges, zero joint
/// dimension, or a column-less `z_joint`), when the family exposes no exact
/// joint Hessian, or when that Hessian is not `total_p × total_p`. Errors from
/// the family or from `joint_jeffreys_term` are propagated.
fn custom_family_joint_jeffreys_term<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
    z_joint: &Array2<f64>,
) -> Result<Option<(f64, Array1<f64>, Array2<f64>)>, String> {
    // Joint coefficient dimension = end offset of the final block (0 if none).
    let total_p = match ranges.last() {
        Some(&(_, end)) => end,
        None => 0,
    };
    if total_p == 0 || z_joint.ncols() == 0 {
        return Ok(None);
    }

    let Some(h_joint) = family.exact_newton_joint_hessian_with_specs(states, specs)? else {
        return Ok(None);
    };
    let hessian_is_joint_square = h_joint.nrows() == total_p && h_joint.ncols() == total_p;
    if !hessian_is_joint_square {
        return Ok(None);
    }

    let jeffreys_term = crate::estimate::reml::jeffreys_subspace::joint_jeffreys_term(
        h_joint.view(),
        z_joint.view(),
        |direction: &Array1<f64>| {
            family.exact_newton_joint_hessian_directional_derivative_with_specs(
                states, specs, direction,
            )
        },
    )?;
    Ok(Some(jeffreys_term))
}

/// Outer-REML full-span Jeffreys value and curvature `(Φ, H_Φ)` for the coupled
/// joint Hessian.
///
/// Returns `Ok(None)` when the family does not require the joint Jeffreys term,
/// when no joint subspace can be built, or when
/// `custom_family_joint_jeffreys_term` reports the term as inapplicable (no
/// coefficient system / no exact joint Hessian).
///
/// This is the OUTER-path companion to the inner-Newton wiring: the LAML score
/// uses `log|H + S_λ + H_Φ|` and its analytic ρ-derivatives
/// `tr((H+S_λ+H_Φ)⁻¹ ∂_ρ(H+S_λ+H_Φ))`.
///
/// CORRECTNESS NOTE (was a bug — see `custom_family_outer_jeffreys_hphi_drift`).
/// `H_Φ` has no EXPLICIT ρ-dependence, but it DOES depend on ρ implicitly
/// through the mode β̂(ρ): `H_Φ = H_Φ(β̂(ρ))`, because it is built from
/// `H_id = Z_Jᵀ H Z_J` and `D_a = Z_Jᵀ ∂_a H Z_J`, both functions of β̂. The
/// exact outer gradient of `½ log|H+S_λ+H_Φ|` therefore carries a
/// `½ tr[(·)⁻¹ D_β H_Φ[v_k]]` drift term ALONGSIDE the likelihood drift
/// `D_β H[v_k]`. Folding `H_Φ` into the `HessianOperator` (the `(·)⁻¹` kernel
/// and `logdet()`) is necessary but NOT sufficient: the trace contraction must
/// ALSO include `D_β H_Φ[v_k]`, supplied by the companion drift wrapper.
/// Without it the analytic gradient describes a DIFFERENT objective than the
/// value, breaking the line search / KKT certification exactly in the
/// near-separating regime where the Jeffreys term is active.
fn custom_family_outer_jeffreys_hphi<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
) -> Result<Option<(f64, Array2<f64>)>, String> {
    if !family.joint_jeffreys_term_required() {
        return Ok(None);
    }
    let Some(z_joint) = build_joint_jeffreys_subspace(specs, ranges)? else {
        return Ok(None);
    };

    // The gated VALUE travels with the curvature: the outer LAML must fold
    // `−Φ(β̂)` into its cost (the inner mode is Φ-augmented-stationary, so the
    // envelope identity only holds for the Φ-folded criterion — gam#979), and
    // value and curvature must come from the SAME term evaluation. The gradient
    // component of the term is not needed here and is dropped.
    let term = custom_family_joint_jeffreys_term(family, states, specs, ranges, &z_joint)?;
    Ok(term.map(|(phi, _grad, hphi)| (phi, hphi)))
}

/// Whether the batched outer-gradient override may be used given the robust
/// Jeffreys curvature `H_Φ`.
///
/// The override is allowed when no curvature is supplied, or when every entry
/// of the supplied matrix compares equal to `0.0`. A `NaN` entry never
/// compares equal to zero and therefore disallows the override.
fn batched_outer_gradient_contract_allows_override(
    robust_jeffreys_hphi: Option<&Array2<f64>>,
) -> bool {
    let Some(hphi) = robust_jeffreys_hphi else {
        return true;
    };
    hphi.iter().all(|&entry| entry == 0.0)
}

/// Build the Tier-B Jeffreys-curvature drift closure `D_β H_Φ[δβ]` for the outer
/// gradient, evaluated at the current outer point (states = β̂(ρ)).
///
/// THE FIX. The outer LAML objective folds `H_Φ` into `½ log|H + S_λ + H_Φ|`;
/// because `H_Φ` depends on ρ through β̂, the exact gradient's trace contraction
/// must include `½ tr[(H+S_λ+H_Φ)⁻¹ D_β H_Φ[v_k]]`. The released Tier-B path
/// supplied ONLY the likelihood-Hessian drift `D_β H[v_k]`, so the analytic
/// gradient omitted `H_Φ`'s mode-response drift — wrong precisely when Jeffreys
/// is active. This returns the missing drift as a `Send + Sync + 'static` closure
/// the `JeffreysHphiAwareJointDerivatives` wrapper folds into the first-order
/// trace, mirroring Tier-A's `FirthAwareGlmDerivatives` `−D(Hφ)[B_k]` term.
///
/// The closure takes the mode-response direction `δβ = dβ̂/dρ_k` (the wrapper
/// performs `v_k → δβ = −v_k`) and returns `D_β H_Φ[δβ]`.
///
/// Returns `Ok(None)` when the family does not require the joint Jeffreys term,
/// when no joint subspace can be built, when there is no coefficient system
/// (zero joint dimension or a column-less subspace), or when the family exposes
/// no exact joint Hessian of shape `total_p × total_p` — mirroring the cases in
/// which `custom_family_outer_jeffreys_hphi` yields `None`. Errors from the
/// subspace builder or the family are propagated.
///
/// The per-direction conditioning gate and floored pseudo-inverse inside
/// `joint_jeffreys_hphi_directional_derivative` reproduce the value path's, so
/// when the value's `H_Φ` is zero (gated/clean fit) the drift is identically
/// zero too.
fn custom_family_outer_jeffreys_hphi_drift<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    ranges: &[(usize, usize)],
) -> Result<Option<JeffreysHphiDriftFn>, String> {
    if !family.joint_jeffreys_term_required() {
        return Ok(None);
    }
    let Some(z_joint) = build_joint_jeffreys_subspace(specs, ranges)? else {
        return Ok(None);
    };
    let total_p = ranges.last().map(|(_, e)| *e).unwrap_or(0);
    if total_p == 0 || z_joint.ncols() == 0 {
        return Ok(None);
    }
    // Snapshot the joint Hessian H(β̂) at the current outer point. If the family
    // exposes no exact joint Hessian the Jeffreys term is inapplicable (matching
    // `custom_family_joint_jeffreys_term`), so no drift is installed.
    let Some(h_joint) = family.exact_newton_joint_hessian_with_specs(states, specs)? else {
        return Ok(None);
    };
    if h_joint.nrows() != total_p || h_joint.ncols() != total_p {
        return Ok(None);
    }
    // Own everything the closure needs so it is `'static + Send + Sync`. β̂ is
    // fixed across the single outer evaluation, so capturing the snapshot states
    // is correct; the closure recomputes the exact directional derivatives of the
    // joint Hessian at that point for each mode-response direction.
    //
    // `h_joint` and `z_joint` are already owned and are not used again in this
    // function, so they are MOVED into the closure instead of being cloned
    // (avoids a redundant full copy of the subspace matrix).
    let family_owned = family.clone();
    let states_owned: Vec<ParameterBlockState> = states.to_vec();
    let specs_owned: Vec<ParameterBlockSpec> = specs.to_vec();
    let drift: JeffreysHphiDriftFn = Arc::new(move |delta: &Array1<f64>| {
        crate::estimate::reml::jeffreys_subspace::joint_jeffreys_hphi_directional_derivative(
            h_joint.view(),
            z_joint.view(),
            delta,
            // First directional derivative of the joint Hessian at β̂.
            |direction: &Array1<f64>| {
                family_owned.exact_newton_joint_hessian_directional_derivative_with_specs(
                    &states_owned,
                    &specs_owned,
                    direction,
                )
            },
            // Second (mixed) directional derivative of the joint Hessian at β̂.
            |u: &Array1<f64>, v: &Array1<f64>| {
                family_owned.exact_newton_joint_hessian_second_directional_derivative_with_specs(
                    &states_owned,
                    &specs_owned,
                    u,
                    v,
                )
            },
        )
        .map(Some)
    });
    Ok(Some(drift))
}

// Thresholds read by `use_joint_matrix_free_path` when deciding between the
// matrix-free and the dense joint Hessian path.

/// Joint coefficient count at or above which the matrix-free path is selected
/// regardless of the row count.
const JOINT_MATRIX_FREE_MIN_DIM: usize = 512;
/// Row count at or above which the lower coefficient threshold
/// `JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N` is sufficient to go matrix-free.
const JOINT_MATRIX_FREE_MIN_ROWS: usize = 50_000;
/// Minimum joint coefficient count required by both the large-row-count rule
/// and the linear-work rule.
const JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N: usize = 128;
/// Threshold on `total_n * total_p` (saturating product) at or above which the
/// matrix-free path is selected, provided the coefficient count also reaches
/// `JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N`.
const JOINT_MATRIX_FREE_MIN_LINEAR_WORK: usize = 4_000_000;
/// NOTE(review): not referenced in this part of the file; by its name presumably
/// a small diagonal ridge used to stabilize joint trace computations — confirm
/// at the use sites.
const JOINT_TRACE_STABILITY_RIDGE: f64 = 1e-10;
/// NOTE(review): not referenced in this part of the file; by its name presumably
/// a multiplier applied when deriving the joint PCG iteration cap — confirm at
/// the use sites.
const JOINT_PCG_MAX_ITER_MULTIPLIER: usize = 4;

/// Capability flag for the joint path's exact analytic outer Hessian.
///
/// Currently hard-wired to `true`: the function takes no input and performs no
/// check. NOTE(review): the precise meaning callers attach to this flag is not
/// visible here — confirm at the call sites.
pub(crate) fn joint_exact_analytic_outer_hessian_available() -> bool {
    true
}

/// Observation count of the joint system: the largest `eta` length found across
/// the block states, or `0` when `states` is empty.
fn joint_observation_count(states: &[ParameterBlockState]) -> usize {
    states
        .iter()
        .map(|block| block.eta.len())
        .fold(0, usize::max)
}

/// Predicate deciding whether the unified evaluator takes the matrix-free joint
/// Hessian path for a problem of size `(total_p, total_n)`.
///
/// Exposed at crate scope so families with matrix-free operators can branch
/// their `coefficient_hessian_cost` estimate on the same predicate the
/// evaluator will use at fit time.
///
/// The matrix-free route is chosen when any of the following holds:
/// * `total_p >= JOINT_MATRIX_FREE_MIN_DIM`;
/// * `total_p >= JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N` and
///   `total_n >= JOINT_MATRIX_FREE_MIN_ROWS`;
/// * `total_p >= JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N` and the saturating
///   product `total_n * total_p` reaches `JOINT_MATRIX_FREE_MIN_LINEAR_WORK`.
///
/// Rationale: for large-scale row counts with only tens of coefficients, exact
/// materialization is bounded by `total_p` Hessian-vector products followed by
/// a tiny dense factorization. That is cheaper and more predictable than PCG
/// when each matrix-free product streams all rows through expensive FLEX
/// marginal-slope kernels and the initial joint Hessian is ill-conditioned.
/// The matrix-free route is kept for genuinely wide joint systems, where
/// `total_p` dense products and factorization dominate.
pub(crate) fn use_joint_matrix_free_path(total_p: usize, total_n: usize) -> bool {
    // Wide systems always go matrix-free.
    if total_p >= JOINT_MATRIX_FREE_MIN_DIM {
        return true;
    }
    // Both remaining rules require at least the reduced coefficient threshold.
    if total_p < JOINT_MATRIX_FREE_MIN_DIM_AT_LARGE_N {
        return false;
    }
    let many_rows = total_n >= JOINT_MATRIX_FREE_MIN_ROWS;
    let heavy_linear_work =
        total_n.saturating_mul(total_p) >= JOINT_MATRIX_FREE_MIN_LINEAR_WORK;
    many_rows || heavy_linear_work
}

/// Allocating convenience wrapper around `apply_joint_block_penalty_into`:
/// creates a zeroed output vector of the same length as `vector`, lets the
/// in-place routine fill it with the block penalties, the optional diagonal
/// ridge and the optional full-width bundle applied to `vector`, and returns it.
fn apply_joint_block_penalty(
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    vector: &Array1<f64>,
    diagonal_ridge: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Array1<f64> {
    let mut penalized = Array1::<f64>::zeros(vector.len());
    apply_joint_block_penalty_into(
        ranges,
        s_lambdas,
        vector,
        diagonal_ridge,
        &mut penalized,
        joint_full_width,
    );
    penalized
}

/// In-place variant of [`apply_joint_block_penalty`]. Caller supplies the
/// output buffer to eliminate per-call allocation.
///
/// `out` is overwritten with
/// `blockdiag(s_lambdas) · vector + diagonal_ridge · vector (+ bundle · vector)`,
/// where block `b` of `s_lambdas` acts on the coefficient range `ranges[b]`, the
/// ridge is only added when `diagonal_ridge > 0.0`, and the full-width bundle is
/// only applied when it is present and non-empty.
///
/// Uses `fast_av_view_into` to write directly into the per-block slice of
/// `out`, avoiding the per-block intermediate `Array1` from `fast_av`. At
/// large scale this is invoked inside the PCG matvec closure (called
/// once per CG iter, hundreds-to-thousands of times per outer iter per
/// the perf-scout report).
///
/// Execution strategy:
/// * at most one penalty block, or a non-contiguous `out`: blocks are applied
///   sequentially;
/// * otherwise: `out` is split into disjoint per-block slices that are filled
///   in parallel with rayon, and the ridge is added in parallel as well.
///
/// # Panics
///
/// Panics if `out.len() != vector.len()`, if there are more penalty blocks than
/// ranges, or (parallel path) if the ranges are not ordered and non-overlapping.
fn apply_joint_block_penalty_into(
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    vector: &Array1<f64>,
    diagonal_ridge: f64,
    out: &mut Array1<f64>,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) {
    assert_eq!(out.len(), vector.len());
    assert!(s_lambdas.len() <= ranges.len());
    out.fill(0.0);

    // Parallel dispatch only pays off with several blocks, and it needs a
    // contiguous output buffer to split into disjoint mutable slices.
    let run_sequentially = s_lambdas.len() <= 1 || out.as_slice_mut().is_none();

    if run_sequentially {
        // The length assertion above guarantees `zip` visits every penalty block.
        for (s_lambda, &(start, end)) in s_lambdas.iter().zip(ranges) {
            let block = vector.slice(s![start..end]);
            let mut out_slice = out.slice_mut(s![start..end]);
            crate::linalg::faer_ndarray::fast_av_view_into(s_lambda, &block, out_slice.view_mut());
        }
        if diagonal_ridge > 0.0 {
            out.scaled_add(diagonal_ridge, vector);
        }
    } else {
        use rayon::prelude::*;

        {
            let out_values = out
                .as_slice_mut()
                .expect("joint penalty output should be contiguous");
            // Carve `out` into one disjoint mutable slice per penalty block,
            // skipping any gap between consecutive ranges.
            let mut out_blocks = Vec::with_capacity(s_lambdas.len());
            let mut remaining = out_values;
            let mut cursor = 0usize;
            for &(start, end) in ranges.iter().take(s_lambdas.len()) {
                assert!(start >= cursor);
                assert!(end >= start);
                let (_, after_gap) = remaining.split_at_mut(start - cursor);
                let (out_block, after_block) = after_gap.split_at_mut(end - start);
                out_blocks.push(out_block);
                remaining = after_block;
                cursor = end;
            }

            out_blocks
                .into_par_iter()
                .enumerate()
                .for_each(|(b, out_block)| {
                    let (start, end) = ranges[b];
                    let block = vector.slice(s![start..end]);
                    let out_view = ArrayViewMut1::from(out_block);
                    crate::linalg::faer_ndarray::fast_av_view_into(&s_lambdas[b], &block, out_view);
                });
        }

        if diagonal_ridge > 0.0 {
            if let (Some(out_values), Some(vector_values)) =
                (out.as_slice_mut(), vector.as_slice())
            {
                out_values
                    .par_iter_mut()
                    .zip(vector_values.par_iter())
                    .for_each(|(out_value, vector_value)| {
                        *out_value += diagonal_ridge * *vector_value;
                    });
            } else {
                // `vector` is not contiguous: fall back to the ndarray kernel.
                out.scaled_add(diagonal_ridge, vector);
            }
        }
    }

    // Shared tail: the optional full-width penalty bundle acts on the whole
    // coefficient vector, independent of which path produced the block part.
    if let Some(bundle) = joint_full_width
        && !bundle.is_empty()
    {
        bundle.add_apply_into(vector.view(), out);
    }
}

/// Floor for a single entry of the joint Jacobi preconditioner diagonal.
///
/// A finite `value` strictly greater than `1e-10` is returned unchanged; every
/// other input (at or below `1e-10`, `NaN`, or `±∞`) is replaced by `1e-10`, so
/// the result is always finite and strictly positive.
///
/// # Context: the penalty-aware Jacobi preconditioner
///
/// This floor is the last step of `joint_penalty_preconditioner_diag`, the
/// penalty-aware Jacobi preconditioner used by every matrix-free PCG path in
/// the inner coefficient solve.
///
/// That preconditioner builds `diag(H) + Σ_k gershgorin(S_k(λ)) + ridge`,
/// clamped at 1e-10, where `gershgorin(S)[i] = Σ_j |S[i,j]|` is the absolute
/// row-sum (Gershgorin radius) of each penalty block. This strictly dominates
/// `diag(S)` for any penalty with off-diagonal mass — the high-order
/// difference / thin-plate smooths (the cubic-Duchon
/// `[mass, tension, stiffness]` triple, orders [1,2,3] in
/// `WigglePenaltyConfig::cubic_triple_operator_default`) are strongly
/// off-diagonal-dominant, so `S[i,i]` alone understates the operator's true row
/// scale by orders of magnitude there.
///
/// Why the row-sum and not just the diagonal: a plain Jacobi (diagonal-only)
/// preconditioner collapses to `diag(S_λ)` exactly in the saturated-softmax
/// regime, where the data Fisher weight `W = diag(p) − ppᵀ → 0` near the
/// simplex boundary and the data part of `diag(H)` vanishes. When the penalty
/// is off-diagonal-dominant, `diag(S_λ)` is a poor spectral match for
/// `H + S_λ`, leaving PCG with a large effective condition number and only
/// geometric (linear) convergence — the multinomial-penguins grind in #715.
/// The Gershgorin row-sum diagonal tracks the operator's per-coordinate scale
/// (`|S| 𝟙` bounds `S`'s action), tightening the preconditioned spectrum and
/// cutting CG iterations sharply in that regime. It is `≥ diag(S)` entrywise
/// for SPD `S`, so it stays strictly positive and SPD: it changes only the
/// PCG trajectory, never the converged Newton step or the KKT certificate
/// (PCG converges to the same `(H + S_λ)⁻¹ rhs` under any SPD preconditioner).
/// Design docs sometimes call this the "triple-operator penalty
/// preconditioner"; in code it is the single, unified preconditioner shared by
/// all PCG callsites.
///
/// Callers in the PIRLS inner Newton PCG path feed the result as the diagonal
/// rescale every CG iteration: PCG applies `M^{-1}` to residuals directly.
/// Do not square-root or trace-normalize these entries, and do not apply a
/// second preconditioner-side rescale to the returned Newton step.
fn positive_joint_diagonal_entry(value: f64) -> f64 {
    const FLOOR: f64 = 1.0e-10;
    let usable = value.is_finite() && value > FLOOR;
    if usable { value } else { FLOOR }
}

/// Diagonal of the penalty-aware Jacobi preconditioner for the joint system.
///
/// Starting from `base_diagonal`, the result adds, in order:
/// 1. for each penalty block `b`, the absolute row-sums `Σ_j |S_b[i,j]|` of
///    `s_lambdas[b]` onto the coordinates `ranges[b]`;
/// 2. `diagonal_ridge` on every coordinate, when it is positive;
/// 3. the diagonal contribution of the full-width bundle, when one is present
///    and non-empty.
///
/// Every entry is finally passed through `positive_joint_diagonal_entry`, so
/// the returned diagonal is finite and at least `1e-10` everywhere.
///
/// # Panics
///
/// Panics if there are more penalty blocks than ranges, or if a penalty block
/// is not square with side `end - start` of its range.
fn joint_penalty_preconditioner_diag(
    base_diagonal: &Array1<f64>,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) -> Array1<f64> {
    assert!(s_lambdas.len() <= ranges.len());
    let mut diag = base_diagonal.clone();

    // The assertion above guarantees `zip` pairs every penalty with its range.
    for (s_lambda, &(start, end)) in s_lambdas.iter().zip(ranges) {
        let block_dim = end - start;
        assert_eq!(s_lambda.nrows(), block_dim);
        assert_eq!(s_lambda.ncols(), block_dim);
        // Gershgorin radius: the absolute row-sum `Σ_j |S[i,j]|` of the penalty
        // block rather than only its diagonal `S[i,i]`. For an
        // off-diagonal-dominant smooth penalty (high-order difference /
        // thin-plate) this tracks the operator's true per-coordinate scale,
        // which `S[i,i]` understates. For SPD `S` the row-sum is
        // `≥ |S[i,i]| = S[i,i]`, so the result still dominates the
        // plain-diagonal preconditioner and stays SPD.
        for (offset, penalty_row) in s_lambda.outer_iter().enumerate() {
            let row_abs_sum: f64 = penalty_row.iter().map(|entry| entry.abs()).sum();
            diag[start + offset] += row_abs_sum;
        }
    }

    if diagonal_ridge > 0.0 {
        diag.iter_mut().for_each(|entry| *entry += diagonal_ridge);
    }

    if let Some(bundle) = joint_full_width
        && !bundle.is_empty()
    {
        bundle.add_diag(&mut diag);
    }

    diag.mapv(positive_joint_diagonal_entry)
}

/// Emit one `info`-level log line summarizing a joint-Newton PCG solve.
///
/// Reports the cycle index, problem size, the iteration count and residual
/// statistics taken from `info`, the Ritz condition estimate (or `NA` when
/// absent), and the max/min ratio of the finite entries of
/// `preconditioner_diag` (or `NA` when no finite positive minimum exists).
fn log_joint_pcg_diagnostics(
    cycle: usize,
    total_p: usize,
    total_n: usize,
    preconditioner_diag: &Array1<f64>,
    info: &crate::linalg::utils::PcgSolveInfo,
) {
    // Extremes over the finite diagonal entries only; the maximum starts at 0.0
    // and the minimum at +∞ so an all-non-finite diagonal yields no ratio.
    let mut diag_min = f64::INFINITY;
    let mut diag_max = 0.0_f64;
    for &entry in preconditioner_diag.iter() {
        if entry.is_finite() {
            diag_min = diag_min.min(entry);
            diag_max = diag_max.max(entry);
        }
    }
    let ratio_is_defined = diag_min.is_finite() && diag_min > 0.0 && diag_max.is_finite();
    let diag_ratio = ratio_is_defined.then(|| diag_max / diag_min);

    log::info!(
        "[PIRLS/blockwise joint-Newton/PCG] cycle={} p={} n={} iters={} rel_res={:.3e} res0={:.3e} res_final={:.3e} res_ratio={:.3e} ritz_cond~{} jacobi_diag_ratio~{}",
        cycle,
        total_p,
        total_n,
        info.iterations,
        info.relative_residual_norm,
        info.initial_residual_norm,
        info.final_residual_norm,
        info.residual_reduction,
        info.condition_estimate
            .map_or_else(|| "NA".to_string(), |value| format!("{value:.3e}")),
        diag_ratio.map_or_else(|| "NA".to_string(), |value| format!("{value:.3e}")),
    );
}

/// Add the joint penalty to a dense matrix in place.
///
/// Each `s_lambdas[b]` is added onto the diagonal block of `matrix` addressed
/// by `ranges[b]`; a positive `diagonal_ridge` is then added to every diagonal
/// entry; finally a present, non-empty full-width bundle adds its own
/// contribution through `add_to_matrix`.
fn add_joint_penalty_to_matrix(
    matrix: &mut Array2<f64>,
    ranges: &[(usize, usize)],
    s_lambdas: &[Array2<f64>],
    diagonal_ridge: f64,
    joint_full_width: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
) {
    for (block_idx, penalty) in s_lambdas.iter().enumerate() {
        let (lo, hi) = ranges[block_idx];
        let mut diagonal_block = matrix.slice_mut(s![lo..hi, lo..hi]);
        diagonal_block += penalty;
    }

    if diagonal_ridge > 0.0 {
        let dim = matrix.nrows();
        (0..dim).for_each(|idx| matrix[[idx, idx]] += diagonal_ridge);
    }

    if let Some(bundle) = joint_full_width
        && !bundle.is_empty()
    {
        bundle.add_to_matrix(matrix);
    }
}

fn flatten_state_betas(
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
) -> Array1<f64> {
    let total = specs.iter().map(|s| s.design.ncols()).sum::<usize>();
    let mut beta = Array1::<f64>::zeros(total);
    let ranges = block_param_ranges(specs);
    for (b, (start, end)) in ranges.into_iter().enumerate() {
        beta.slice_mut(ndarray::s![start..end])
            .assign(&states[b].beta);
    }
    beta
}

fn set_states_from_flat_beta(
    states: &mut [ParameterBlockState],
    specs: &[ParameterBlockSpec],
    beta_flat: &Array1<f64>,
) -> Result<(), String> {
    let ranges = block_param_ranges(specs);
    let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
    if beta_flat.len() != total {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "flat beta length mismatch: got {}, expected {}",
                beta_flat.len(),
                total
            ),
        }
        .into());
    }
    for (b, (start, end)) in ranges.into_iter().enumerate() {
        states[b]
            .beta
            .assign(&beta_flat.slice(ndarray::s![start..end]).to_owned());
    }
    Ok(())
}

/// Produce a fresh copy of `states` whose coefficients come from `beta_flat`
/// and whose linear predictors have been refreshed to match.
///
/// The input `states` are not modified: they are cloned, the clone's `beta`
/// vectors are overwritten via `set_states_from_flat_beta`, and
/// `refresh_all_block_etas` is then run on the clone. Any error from either
/// step is propagated.
fn synchronized_states_from_flat_beta<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    beta_flat: &Array1<f64>,
) -> Result<Vec<ParameterBlockState>, String> {
    let mut refreshed: Vec<ParameterBlockState> = states.iter().cloned().collect();
    set_states_from_flat_beta(&mut refreshed, specs, beta_flat)?;
    refresh_all_block_etas(family, specs, &mut refreshed)?;
    Ok(refreshed)
}

/// Inf-norm of the penalized stationarity residual after valid KKT multipliers
/// at active linear constraints have been projected out.
///
/// For a linearly constrained convex quadratic with constraints `Aβ ≥ b`, the
/// KKT conditions at β̂ read
///
///   S·β̂ − ∇ℓ(β̂) = A_activeᵀ λ
///   Aβ̂ − b ≥ 0
///   λ ≥ 0
///   λᵢ(Aᵢβ̂ − bᵢ) = 0
///
/// so the part of the residual that is representable by nonnegative active
/// multipliers is not a convergence defect. This helper removes that
/// normal-cone component before taking the inf-norm. Axis-aligned lower bounds
/// are just a special case; coupled derivative-guard rows must use the same
/// KKT geometry.
///
/// Without constraints, or when the projection helper cannot produce a value
/// (it returns `None`), the plain inf-norm of `residual` is returned.
///
/// `known_active_rows`, when provided, seeds the working set with the QP
/// solver's authoritative active rows. Trust-region damping and finite
/// precision can leave the committed β with row slacks slightly above the
/// slack tolerance even though the QP identified the row as binding;
/// slack-based detection alone then misses the row and leaves its
/// Lagrange-multiplier mass in the projected residual. Seeding from the QP's
/// active set is exact; the non-negative-multiplier iteration downstream then
/// removes any seeded row whose least-squares multiplier turns out to be
/// strictly negative, so the union of (QP active) ∪ (slack-detected) never
/// declares false convergence.
///
/// # Panics
///
/// Panics if `residual` and `beta` differ in length.
fn projected_stationarity_inf_norm(
    residual: &Array1<f64>,
    beta: &Array1<f64>,
    constraints: Option<&LinearInequalityConstraints>,
    known_active_rows: Option<&[usize]>,
) -> f64 {
    assert_eq!(residual.len(), beta.len());
    let raw_inf = residual
        .iter()
        .map(|component| component.abs())
        .fold(0.0_f64, f64::max);
    match constraints {
        None => raw_inf,
        Some(constraints) => projected_linear_constraint_stationarity_inf_norm(
            residual,
            beta,
            constraints,
            known_active_rows,
        )
        .unwrap_or(raw_inf),
    }
}

/// Constrained stationarity measure: the larger of the inf-norm of the
/// cone-projected stationarity residual and the worst primal constraint
/// violation at `beta`.
///
/// Returns `None` when either ingredient is unavailable, i.e. when
/// `projected_linear_constraint_stationarity_vector` or
/// `linear_constraint_primal_violation` returns `None`.
fn projected_linear_constraint_stationarity_inf_norm(
    residual: &Array1<f64>,
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
    known_active_rows: Option<&[usize]>,
) -> Option<f64> {
    let projected = projected_linear_constraint_stationarity_vector(
        residual,
        beta,
        constraints,
        known_active_rows,
    )?;
    let primal_violation = linear_constraint_primal_violation(beta, constraints)?;
    let projected_inf = projected
        .iter()
        .map(|component| component.abs())
        .fold(0.0_f64, f64::max);
    Some(projected_inf.max(primal_violation))
}

/// Largest violation of the constraints `Aβ ≥ b` at `beta`, i.e.
/// `max(0, max_i (b_i − A_i β))`.
///
/// Rows whose bound is `-∞` are unconstrained and skipped. Returns `None` when
/// the constraint shapes do not match `beta`, when a bound is `NaN` or `+∞`, or
/// when a row slack is not finite.
fn linear_constraint_primal_violation(
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
) -> Option<f64> {
    let shapes_agree =
        constraints.a.ncols() == beta.len() && constraints.a.nrows() == constraints.b.len();
    if !shapes_agree {
        return None;
    }

    let mut worst = 0.0_f64;
    for (row, &bound) in constraints.b.iter().enumerate() {
        if bound == f64::NEG_INFINITY {
            continue;
        }
        if !bound.is_finite() {
            return None;
        }
        let slack = constraints.a.row(row).dot(beta) - bound;
        if !slack.is_finite() {
            return None;
        }
        // A negative slack is a violation of magnitude `-slack`.
        worst = worst.max((-slack).max(0.0));
    }
    Some(worst)
}

/// Stationarity residual with the active-constraint normal-cone component
/// projected out.
///
/// Candidate active rows are the union of the caller-supplied
/// `known_active_rows` (kept only when in range and finitely bounded) and the
/// rows whose slack `A_i β − b_i` lies within a relative tolerance band. The
/// candidate rows are stacked in constraint-matrix order and handed to
/// `project_stationarity_residual_on_constraint_cone`.
///
/// Returns `None` when shapes are inconsistent, when a bound is `NaN` or `+∞`,
/// when a row slack is not finite, or when the cone projection itself returns
/// `None`. With no candidate rows the residual is returned unchanged.
fn projected_linear_constraint_stationarity_vector(
    residual: &Array1<f64>,
    beta: &Array1<f64>,
    constraints: &LinearInequalityConstraints,
    known_active_rows: Option<&[usize]>,
) -> Option<Array1<f64>> {
    let p = beta.len();
    let n_rows = constraints.a.nrows();
    let shapes_agree =
        residual.len() == p && constraints.a.ncols() == p && n_rows == constraints.b.len();
    if !shapes_agree {
        return None;
    }

    // Membership table for the union of hinted and slack-detected rows. A
    // boolean table keeps the canonical constraint-matrix row order, so the
    // rank reduction downstream is deterministic across calls.
    let mut is_candidate = vec![false; n_rows];
    for &row in known_active_rows.unwrap_or(&[]) {
        if row < n_rows && constraints.b[row].is_finite() {
            is_candidate[row] = true;
        }
    }

    for (row, &bound) in constraints.b.iter().enumerate() {
        if bound == f64::NEG_INFINITY {
            continue;
        }
        if !bound.is_finite() {
            return None;
        }
        let value = constraints.a.row(row).dot(beta);
        let slack = value - bound;
        if !slack.is_finite() {
            return None;
        }
        if is_candidate[row] {
            continue;
        }
        // Active-row inclusion band for the stationarity-residual cone
        // projection. A constraint binding at the constrained optimum carries a
        // Lagrange multiplier whose mass IS the stationarity residual
        // (`r = A_activeᵀ λ`, λ >= 0); to project it out, every genuinely tight
        // row must be a candidate. The constrained QP only reports rows it drove
        // tight during a non-degenerate step, so monotone derivative-guard rows
        // that are tight at the optimum but were never explicitly stepped sat
        // just above the old `1e-6·scale` band, got excluded, and left the
        // multiplier unresolved — tripping the `active_set_incomplete` refusal
        // on an exactly constrained-stationary iterate (gam#797 survival time
        // block). The band is therefore wide enough that every near-tight row
        // is a CANDIDATE; over-inclusion is safe because the downstream NNLS
        // (`project_stationarity_residual_on_constraint_cone`) assigns λ = 0 to
        // any candidate carrying no multiplier mass, so a non-binding row cannot
        // spuriously shrink the residual.
        let scale = value.abs().max(bound.abs()).max(1.0);
        let active_tol = 1e-3 * scale + 1e-8;
        is_candidate[row] = slack <= active_tol;
    }

    let active_rows: Vec<usize> = is_candidate
        .iter()
        .enumerate()
        .filter(|&(_, &flagged)| flagged)
        .map(|(row, _)| row)
        .collect();
    if active_rows.is_empty() {
        return Some(residual.clone());
    }

    // Stack the candidate rows of `A` in their original order.
    let mut a_active = Array2::<f64>::zeros((active_rows.len(), p));
    for (mut target_row, &row) in a_active.outer_iter_mut().zip(&active_rows) {
        target_row.assign(&constraints.a.row(row));
    }

    let (projected, _) = project_stationarity_residual_on_constraint_cone(residual, &a_active)?;
    Some(projected)
}

/// Inf-norm of the projected penalized stationarity residual over all blocks.
///
/// For exact-Newton families the block score is ∇ log L with respect to that
/// block, while the penalized negative objective is
///
///   Q(beta, rho) = -log L(beta) + 0.5 beta^T P_mode(rho) beta,
///
/// where `P_mode` includes the rho-independent stabilization ridge exactly
/// when that ridge participates in the quadratic objective. The coupled
/// first-order condition `∇Q = -∇ log L + P beta = 0` gives the per-block
/// residual
///
///   r_b = P_mode,b * beta_b - gradient_b.
///
/// On constrained blocks (e.g. I-spline monotone time coefficients, monotone
/// wiggle coefficients) the residual on an active constraint is the KKT
/// multiplier λ_j ≥ 0 rather than a convergence defect, so each block residual
/// is passed through `projected_stationarity_inf_norm` before the maximum is
/// taken. Using only coordinate step size or an unprojected norm can declare
/// convergence too early OR fail to ever declare convergence at a constrained
/// optimum.
///
/// # Returns
/// * `Ok(Some(norm))` — the largest per-block projected inf-norm.
/// * `Ok(None)` — as soon as a block whose working set is not
///   `BlockWorkingSet::ExactNewton` is encountered.
///
/// # Errors
/// Returns a dimension-mismatch error when the working-set, state, penalty,
/// spec, or active-set counts disagree, and propagates any error from
/// `collect_block_linear_constraints`.
fn exact_newton_joint_stationarity_inf_norm<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    eval: &FamilyEvaluation,
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Option<f64>, String> {
    let n_blocks = states.len();
    if eval.blockworking_sets.len() != n_blocks || n_blocks != s_lambdas.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: "exact-newton joint stationarity check: block dimension mismatch".to_string(),
        }
        .into());
    }
    if specs.len() != n_blocks {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: "exact-newton joint stationarity check: spec/state count mismatch".to_string(),
        }
        .into());
    }
    match block_active_sets.map(|sets| sets.len()) {
        Some(got) if got != n_blocks => {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "exact-newton joint stationarity check: active-set count mismatch, got {}, expected {}",
                got,
                n_blocks
            ) }.into());
        }
        _ => {}
    }

    let block_constraints = collect_block_linear_constraints(family, states, specs)?;
    // The ridge joins the residual only when it is part of the quadratic
    // objective and is strictly positive.
    let apply_ridge = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let mut worst = 0.0_f64;
    for (b, (work, state)) in eval
        .blockworking_sets
        .iter()
        .zip(states.iter())
        .enumerate()
    {
        // Any non-exact-Newton block makes this check inapplicable.
        let BlockWorkingSet::ExactNewton { gradient: score, .. } = work else {
            return Ok(None);
        };
        // r_b = S_λ,b β_b − ∇_b log L (+ ridge · β_b).
        let mut residual = s_lambdas[b].dot(&state.beta) - score;
        if apply_ridge {
            residual.scaled_add(ridge, &state.beta);
        }
        let active_hint = block_active_sets
            .and_then(|sets| sets.get(b))
            .and_then(|rows| rows.as_deref());
        let block_norm = projected_stationarity_inf_norm(
            &residual,
            &state.beta,
            block_constraints[b].as_ref(),
            active_hint,
        );
        worst = worst.max(block_norm);
    }
    Ok(Some(worst))
}

/// Assemble the joint log-likelihood score by concatenating per-block scores.
///
/// Blocks are laid out in `specs` order, each occupying
/// `spec.design.ncols()` consecutive entries of the returned vector.
///
/// * `ExactNewton` working sets contribute their stored gradient verbatim.
/// * `Diagonal` working sets have their score recovered from the IRLS
///   pseudo-data (see the comment inside that arm).
///
/// # Errors
/// Returns an error when the working-set or state count differs from the
/// spec count, or when any per-block length disagrees with the block width.
fn exact_newton_joint_gradient_from_eval(
    eval: &FamilyEvaluation,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
) -> Result<Option<Array1<f64>>, String> {
    let n_blocks = specs.len();
    let n_work = eval.blockworking_sets.len();
    if n_work != n_blocks {
        return Err(format!(
            "exact-newton joint gradient extraction: family returned {} block working sets, expected {}",
            n_work,
            n_blocks
        ));
    }
    if states.len() != n_blocks {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton joint gradient extraction: state count {} does not match spec count {}",
            states.len(),
            n_blocks
        ) }.into());
    }

    let total_p: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    let mut joint = Array1::<f64>::zeros(total_p);
    let mut start = 0usize;
    for (b, work) in eval.blockworking_sets.iter().enumerate() {
        let spec = &specs[b];
        let state = &states[b];
        let width = spec.design.ncols();
        let end = start + width;
        match work {
            BlockWorkingSet::ExactNewton { gradient: score, .. } => {
                if score.len() != width {
                    return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                        "exact-newton joint gradient extraction: block gradient length mismatch, got {}, expected {}",
                        score.len(),
                        width
                    ) }.into());
                }
                joint.slice_mut(ndarray::s![start..end]).assign(score);
            }
            BlockWorkingSet::Diagonal {
                working_response,
                working_weights,
            } => {
                // Recover the per-block log-likelihood score from the IRLS
                // working set.  By construction of the IRLS pseudo-response
                //
                //     z_i = η_i + (∂ℓ/∂η_i) / w_i,
                //
                // so the row score is `w_i (z_i − η_i)` and the
                // coefficient-space score is
                //
                //     ∇_β_b log L = X_b^T (w ⊙ (z − η)).
                //
                // Without this branch the joint-Newton path is unable to
                // assemble its RHS for families that emit Diagonal working
                // sets alongside an exact joint Hessian (e.g. Gaussian
                // location-scale): the inner fit returns non-converged, and
                // the outer evaluator falls into the nonconverged-result
                // branch and reports a zero outer gradient.
                let n = working_response.len();
                let lengths_agree = working_weights.len() == n
                    && state.eta.len() == n
                    && spec.design.nrows() == n;
                if !lengths_agree {
                    return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                        "exact-newton joint gradient extraction: diagonal working-set length mismatch (z={}, w={}, η={}, X_rows={})",
                        working_response.len(),
                        working_weights.len(),
                        state.eta.len(),
                        spec.design.nrows()
                    ) }.into());
                }
                // Row-space score w ⊙ (z − η).
                let row_score = Array1::<f64>::from_shape_fn(n, |i| {
                    working_weights[i] * (working_response[i] - state.eta[i])
                });
                let score =
                    <DesignMatrix as LinearOperator>::apply_transpose(&spec.design, &row_score);
                if score.len() != width {
                    return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                        "exact-newton joint gradient extraction: diagonal block transpose length mismatch, got {}, expected {}",
                        score.len(),
                        width
                    ) }.into());
                }
                joint.slice_mut(ndarray::s![start..end]).assign(&score);
            }
        }
        start = end;
    }
    Ok(Some(joint))
}

fn exact_newton_joint_stationarity_inf_norm_from_gradient(
    gradient: &Array1<f64>,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_constraints: &[Option<LinearInequalityConstraints>],
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<f64, String> {
    if states.len() != specs.len() || states.len() != s_lambdas.len() {
        return Err(
            "exact-newton joint stationarity check from gradient: block dimension mismatch"
                .to_string(),
        );
    }
    if block_constraints.len() != states.len() {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton joint stationarity check from gradient: constraint count mismatch, got {}, expected {}",
            block_constraints.len(),
            states.len()
        ) }.into());
    }
    if let Some(sets) = block_active_sets
        && sets.len() != states.len()
    {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton joint stationarity check from gradient: active-set count mismatch, got {}, expected {}",
            sets.len(),
            states.len()
        ) }.into());
    }
    let total_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
    if gradient.len() != total_p {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton joint stationarity check from gradient: joint gradient length mismatch, got {}, expected {}",
            gradient.len(),
            total_p
        ) }.into());
    }

    // Same KKT projection as `exact_newton_joint_stationarity_inf_norm`:
    // multipliers at active lower bounds are not convergence defects, so we
    // measure only the free-set residual. See `projected_stationarity_inf_norm`
    // for the tolerance choice and its parallel with `projected_gradient_norm`
    // in `pirls.rs`.
    //
    // The optional `block_active_sets` arrives from the joint-Newton inner
    // loop's `cached_active_sets` and carries the QP solver's authoritative
    // active rows per block. Threading it through is what makes the
    // stationarity test correctly fire at the constrained optimum: a damped
    // constrained step may commit β with row slacks slightly above the slack
    // tolerance even though the QP identified the rows as binding, and
    // slack-based detection alone then misses the rows and leaves the
    // Lagrange-multiplier mass in the residual.
    let mut inf_norm = 0.0_f64;
    let mut offset = 0usize;
    for b in 0..states.len() {
        let width = specs[b].design.ncols();
        let mut residual =
            s_lambdas[b].dot(&states[b].beta) - gradient.slice(ndarray::s![offset..offset + width]);
        if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
            residual += &states[b].beta.mapv(|v| ridge * v);
        }
        let block_active_hint = block_active_sets
            .and_then(|sets| sets.get(b))
            .and_then(|opt| opt.as_deref());
        let block_inf = projected_stationarity_inf_norm(
            &residual,
            &states[b].beta,
            block_constraints[b].as_ref(),
            block_active_hint,
        );
        inf_norm = inf_norm.max(block_inf);
        offset += width;
    }
    Ok(inf_norm)
}

/// Unprojected joint stationarity residual `S_λ β − gradient` (plus
/// `ridge · β` when the ridge participates in the quadratic penalty).
///
/// `gradient` is the concatenation of per-block scores in `specs` order; the
/// returned vector uses the same layout, with block `b` occupying
/// `specs[b].design.ncols()` consecutive entries.
///
/// # Errors
/// Returns an error when the state/spec/penalty counts disagree or when
/// `gradient` does not have the total coefficient length.
fn exact_newton_joint_stationarity_vector_from_gradient(
    gradient: &Array1<f64>,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
) -> Result<Array1<f64>, String> {
    let n_blocks = states.len();
    if n_blocks != specs.len() || n_blocks != s_lambdas.len() {
        return Err(
            "exact-newton joint stationarity vector from gradient: block dimension mismatch"
                .to_string(),
        );
    }
    let total_p: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    if gradient.len() != total_p {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton joint stationarity vector from gradient: joint gradient length mismatch, got {}, expected {}",
            gradient.len(),
            total_p
        ) }.into());
    }

    let apply_ridge = ridge_policy.include_quadratic_penalty && ridge > 0.0;
    let mut joint = Array1::<f64>::zeros(total_p);
    let mut start = 0usize;
    // The three slices were verified above to have equal length, so zipping
    // them visits every block exactly once.
    for ((state, spec), penalty) in states.iter().zip(specs.iter()).zip(s_lambdas.iter()) {
        let end = start + spec.design.ncols();
        let mut block = penalty.dot(&state.beta) - gradient.slice(ndarray::s![start..end]);
        if apply_ridge {
            block.scaled_add(ridge, &state.beta);
        }
        joint.slice_mut(ndarray::s![start..end]).assign(&block);
        start = end;
    }
    Ok(joint)
}

fn exact_newton_joint_projected_stationarity_vector_from_gradient(
    gradient: &Array1<f64>,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_constraints: &[Option<LinearInequalityConstraints>],
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Array1<f64>, String> {
    if states.len() != specs.len()
        || states.len() != s_lambdas.len()
        || states.len() != block_constraints.len()
    {
        return Err(
            "exact-newton projected stationarity vector from gradient: block dimension mismatch"
                .to_string(),
        );
    }
    if let Some(sets) = block_active_sets
        && sets.len() != states.len()
    {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton projected stationarity vector from gradient: active-set count mismatch, got {}, expected {}",
            sets.len(),
            states.len()
        ) }.into());
    }
    let total_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
    if gradient.len() != total_p {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "exact-newton projected stationarity vector from gradient: joint gradient length mismatch, got {}, expected {}",
            gradient.len(),
            total_p
        ) }.into());
    }

    let mut residual = Array1::<f64>::zeros(total_p);
    let mut offset = 0usize;
    for b in 0..states.len() {
        let width = specs[b].design.ncols();
        let start = offset;
        let end = offset + width;
        let mut block = s_lambdas[b].dot(&states[b].beta) - gradient.slice(ndarray::s![start..end]);
        if ridge_policy.include_quadratic_penalty && ridge > 0.0 {
            block += &states[b].beta.mapv(|v| ridge * v);
        }
        if let Some(constraints) = block_constraints[b].as_ref() {
            let block_active_hint = block_active_sets
                .and_then(|sets| sets.get(b))
                .and_then(|opt| opt.as_deref());
            block = projected_linear_constraint_stationarity_vector(
                &block,
                &states[b].beta,
                constraints,
                block_active_hint,
            )
            .ok_or_else(|| {
                format!("exact-newton projected stationarity vector: failed to project block {b}")
            })?;
        }
        residual.slice_mut(ndarray::s![start..end]).assign(&block);
        offset = end;
    }
    Ok(residual)
}

/// Evaluate the family at `states` and build the free-space-projected KKT
/// residual used by the IFT correction.
///
/// The active set supplied through `block_active_sets` is forwarded to the
/// inner projection so that the returned vector lies in
/// `range(I − P_normal_cone)`. Returning a
/// [`crate::solver::estimate::reml::unified::ProjectedKktResidual`] rather than
/// a bare array makes that invariant visible at every call site: callers
/// cannot forget to project, and `reml/unified.rs` cannot accidentally accept
/// an unprojected vector.
///
/// Returns `Ok(None)` when no joint gradient can be extracted from the
/// evaluation or when the downstream projection yields no residual.
///
/// # Errors
/// Propagates errors from `family.evaluate`, gradient extraction, constraint
/// collection, and the projection itself.
fn exact_newton_joint_kkt_residual_for_ift<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Option<ProjectedKktResidual>, String> {
    let evaluation = family.evaluate(states)?;
    match exact_newton_joint_gradient_from_eval(&evaluation, specs, states)? {
        None => Ok(None),
        Some(joint_gradient) => {
            // Constraints are only collected once a gradient is known to exist.
            let block_constraints = collect_block_linear_constraints(family, states, specs)?;
            exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
                &joint_gradient,
                specs,
                states,
                s_lambdas,
                ridge,
                ridge_policy,
                &block_constraints,
                block_active_sets,
            )
        }
    }
}

/// Projected KKT residual for the IFT correction, reusing a cached joint
/// gradient when one is available.
///
/// With `cached_gradient = Some(_)` the family is not re-evaluated: only the
/// block constraints are collected and the cached gradient is projected.
/// With `None` this falls back to `exact_newton_joint_kkt_residual_for_ift`,
/// which evaluates the family to obtain the gradient.
///
/// # Errors
/// Propagates errors from constraint collection, the projection, or the
/// fallback path.
fn exact_newton_joint_kkt_residual_for_ift_from_cached_gradient<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_active_sets: Option<&[Option<Vec<usize>>]>,
    cached_gradient: Option<&Array1<f64>>,
) -> Result<Option<ProjectedKktResidual>, String> {
    let Some(joint_gradient) = cached_gradient else {
        // Nothing cached: take the evaluating path.
        return exact_newton_joint_kkt_residual_for_ift(
            family,
            specs,
            states,
            s_lambdas,
            ridge,
            ridge_policy,
            block_active_sets,
        );
    };
    let block_constraints = collect_block_linear_constraints(family, states, specs)?;
    exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
        joint_gradient,
        specs,
        states,
        s_lambdas,
        ridge,
        ridge_policy,
        &block_constraints,
        block_active_sets,
    )
}

/// Project the joint stationarity residual and wrap it as a
/// `ProjectedKktResidual`, refusing non-finite results.
///
/// Returns `Ok(Some(_))` when every entry of the projected residual is finite.
/// Otherwise a warning with the NaN/∞ counts and the largest finite magnitude
/// is logged and `Ok(None)` is returned.
///
/// # Errors
/// Propagates any error from
/// `exact_newton_joint_projected_stationarity_vector_from_gradient`.
fn exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
    gradient: &Array1<f64>,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    ridge_policy: RidgePolicy,
    block_constraints: &[Option<LinearInequalityConstraints>],
    block_active_sets: Option<&[Option<Vec<usize>>]>,
) -> Result<Option<ProjectedKktResidual>, String> {
    let projected = exact_newton_joint_projected_stationarity_vector_from_gradient(
        gradient,
        states,
        specs,
        s_lambdas,
        ridge,
        ridge_policy,
        block_constraints,
        block_active_sets,
    )?;
    if projected.iter().all(|v| v.is_finite()) {
        return Ok(Some(ProjectedKktResidual::from_active_projected(projected)));
    }

    // Surface this clearly: a non-finite projected residual reaches the
    // unified evaluator as `kkt_residual = None`, which then makes the
    // envelope-consistency tripwire fire with "no projected residual"
    // as the suspected cause. Emit the count and magnitude so the
    // failure is diagnosable from a single log line.
    let mut nan_count = 0usize;
    let mut inf_count = 0usize;
    let mut finite_max = 0.0_f64;
    for &entry in projected.iter() {
        if entry.is_nan() {
            nan_count += 1;
        } else if entry.is_infinite() {
            inf_count += 1;
        } else {
            finite_max = finite_max.max(entry.abs());
        }
    }
    log::warn!(
        "[exact-newton kkt-residual projection] dropping projected KKT residual to None: \
         len={} nan_count={} inf_count={} finite_max={:.3e}. The unified evaluator will \
         treat this convergent path as if no residual were available, which silently \
         disables the IFT correction and can trip the envelope-gradient consistency check \
         on near-singular H. Investigate which block produced the non-finite entry.",
        projected.len(),
        nan_count,
        inf_count,
        finite_max,
    );
    Ok(None)
}

/// Joint coefficient covariance from the penalized joint Hessian `H + S_λ`.
///
/// The exact joint likelihood Hessian is obtained from
/// `exact_newton_joint_hessian_symmetrized`, each block's penalty
/// `Σ_k exp(ρ_k) S_k` is added onto its diagonal block, and the result is
/// symmetrized and inverted:
///
/// * strict-SPD families (`use_exact_newton_strict_spd`): the precision is
///   eigendecomposed; a genuinely negative eigenvalue is reported as an error,
///   otherwise the positive-eigenspace pseudo-inverse is returned;
/// * all other families: `inverse_spdwith_retry`, falling back to
///   `pinv_positive_part` if that fails.
///
/// # Errors
/// * `per_block_log_lambdas` has fewer entries than `specs`, or a block has
///   fewer log-lambdas than penalties (previously an out-of-bounds panic);
/// * no exact analytic joint Hessian is available;
/// * the eigendecomposition fails or the strict precision is indefinite;
/// * the final inversion fails.
fn compute_joint_covariance<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    per_block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
) -> Result<Array2<f64>, String> {
    // Validate the smoothing-parameter layout before any expensive work so a
    // malformed input surfaces as an error instead of an index panic below.
    if per_block_log_lambdas.len() < specs.len() {
        return Err(format!(
            "joint covariance: per-block log-lambda count mismatch, got {}, expected {}",
            per_block_log_lambdas.len(),
            specs.len()
        ));
    }
    for (b, (spec, block_log_lambdas)) in
        specs.iter().zip(per_block_log_lambdas.iter()).enumerate()
    {
        if block_log_lambdas.len() < spec.penalties.len() {
            return Err(format!(
                "joint covariance: block {b} has {} log-lambda(s) for {} penalties",
                block_log_lambdas.len(),
                spec.penalties.len()
            ));
        }
    }

    let ranges = block_param_ranges(specs);
    let total = ranges.last().map(|(_, e)| *e).unwrap_or(0);
    let Some(mut h) = exact_newton_joint_hessian_symmetrized(
        family,
        states,
        specs,
        total,
        "joint exact-newton Hessian shape mismatch in covariance",
    )?
    else {
        return Err(
            "joint covariance requires an exact analytic Hessian; objective perturbation is forbidden"
                .to_string(),
        );
    };
    for (b, spec) in specs.iter().enumerate() {
        let (start, end) = ranges[b];
        let lambdas = per_block_log_lambdas[b].mapv(f64::exp);
        let mut s_lambda = Array2::<f64>::zeros((end - start, end - start));
        // The length check above guarantees one lambda per penalty, so the
        // zip never drops a penalty.
        for (penalty, &lambda) in spec.penalties.iter().zip(lambdas.iter()) {
            penalty.add_scaled_to(lambda, &mut s_lambda);
        }
        h.slice_mut(ndarray::s![start..end, start..end])
            .scaled_add(1.0, &s_lambda);
    }
    symmetrize_dense_in_place(&mut h);
    if use_exact_newton_strict_spd(family) {
        // #748: the strict posterior precision is `H + S_λ` AT THE CONVERGED
        // OPTIMUM. A δ-ridge inverse `(H + S_λ + δI)⁻¹` would mask a genuinely
        // non-PD curvature and report it as if it were the posterior
        // covariance, biasing every standard error. Instead: eigendecompose and
        // **reject** when the precision is genuinely indefinite (a real
        // fit-quality failure — the mode is not a strict maximum), and on the
        // PSD case return the honest positive-eigenspace pseudo-inverse (the
        // structural null space of a penalised model is a flat posterior
        // direction, not something to ridge away).
        let p = h.nrows();
        let (evals, _) = FaerEigh::eigh(&h, Side::Lower).map_err(|e| {
            format!("strict pseudo-laplace covariance eigendecomposition failed: {e}")
        })?;
        let max_abs_eval = evals.iter().fold(0.0_f64, |acc, &ev| acc.max(ev.abs()));
        let eps_np = f64::EPSILON * (p as f64) * (p as f64);
        let tol = (10.0 * eps_np * max_abs_eval).max(100.0 * f64::EPSILON);
        if let Some(&min_eval) = evals
            .iter()
            .filter(|&&ev| ev < -tol)
            .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
        {
            let below = evals.iter().filter(|&&ev| ev < -tol).count();
            return Err(format!(
                "strict pseudo-laplace covariance: joint coefficient Hessian is non-PD at the \
                 converged optimum ({below} eigenvalue(s) below -tol, min(λ)={min_eval:.6e}, \
                 max|λ|={max_abs_eval:.6e}, tol={tol:.6e}); the mode is not a strict posterior \
                 maximum, so the reported covariance would be meaningless — fit-quality failure \
                 surfaced instead of δ-ridge masking (gam#748)"
            ));
        }
        pinv_positive_part(&h, effective_solverridge(options.ridge_floor))
    } else {
        // Prefer the ridged SPD inverse; fall back to the positive-part
        // pseudo-inverse only when every retry fails.
        inverse_spdwith_retry(&h, effective_solverridge(options.ridge_floor), 8)
            .or_else(|_| pinv_positive_part(&h, effective_solverridge(options.ridge_floor)))
    }
}

/// Compute the joint covariance only when the fit options request it.
///
/// Returns `Ok(None)` when `options.compute_covariance` is false. Otherwise
/// `compute_joint_covariance` is called and its result wrapped in `Some`.
///
/// # Errors
/// A failure in `compute_joint_covariance` is reported as
/// `CustomFamilyError::InvalidInput` carrying the underlying message.
fn compute_joint_covariance_required<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    per_block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
) -> Result<Option<Array2<f64>>, CustomFamilyError> {
    if options.compute_covariance {
        match compute_joint_covariance(family, specs, states, per_block_log_lambdas, options) {
            Ok(covariance) => Ok(Some(covariance)),
            Err(e) => Err(CustomFamilyError::InvalidInput {
                context: "compute_joint_covariance_required",
                reason: format!("joint covariance computation failed: {e}"),
            }),
        }
    } else {
        Ok(None)
    }
}

/// Compute joint working-set geometry at convergence for ALO diagnostics.
///
/// Builds the penalized Hessian `H_lik + Σ_k exp(ρ_k) S_k` and packages it
/// with the working weights/response into a [`FitGeometry`].
///
/// * Single block: the likelihood curvature comes from the block's working
///   set (`Diagonal` → `XᵀWX`, `ExactNewton` → the dense block Hessian).
/// * Multiple blocks: the curvature comes from
///   `exact_newton_joint_hessian_symmetrized`; the working weights/response
///   are zero vectors sized like the first block's `eta`.
///
/// # Returns
/// `Ok(None)` when geometry cannot be formed: the number of log-lambda blocks
/// differs from the number of specs, the family evaluation fails, the working
/// set has an unsupported shape, a Hessian/penalty shape does not fit, or the
/// smoothing-parameter count does not cover the block's penalties.
///
/// # Errors
/// Propagates errors from `exact_newton_joint_hessian_symmetrized`, and fails
/// when a link-wiggle block is present but no explicit joint Hessian is
/// available.
fn compute_joint_geometry<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    states: &[ParameterBlockState],
    per_block_log_lambdas: &[Array1<f64>],
) -> Result<Option<FitGeometry>, String> {
    if specs.len() != per_block_log_lambdas.len() {
        return Ok(None);
    }
    if specs.len() == 1 {
        let spec = &specs[0];
        let lambdas = per_block_log_lambdas[0].mapv(f64::exp);
        // Fewer smoothing parameters than penalties used to index out of
        // bounds in the penalty loop below. Treat it like the multi-block
        // branch treats a layout mismatch: no geometry rather than a panic.
        if lambdas.len() < spec.penalties.len() {
            return Ok(None);
        }
        // An evaluation failure is deliberately non-fatal here: geometry is
        // simply not reported.
        let Ok(eval) = family.evaluate(states) else {
            return Ok(None);
        };
        // The penalized joint Hessian `H_pen = H_lik + Σ_k λ_k S_k` is the exact
        // mgcv quantity the trace edf `p − Σ_k λ_k·tr(H_pen⁻¹ S_k)` consumes. Two
        // single-block working-set shapes reach here:
        //
        // * `Diagonal` — IRLS/GLM families expose only the diagonal working
        //   weights, so the likelihood curvature is reconstructed as the
        //   Gauss–Newton gram `XᵀWX`.
        // * `ExactNewton` — coefficient-space exact-curvature families (CTN
        //   transformation-normal, …) already carry the dense negative
        //   log-likelihood Hessian `−∇²log L = H_lik` directly. Materialize it
        //   and add the penalties, so these families report inference / total
        //   edf instead of dropping geometry (and therefore inference) for the
        //   whole fit (#720).
        let (mut h, working_weights, working_response) = match eval.blockworking_sets.as_slice() {
            [
                BlockWorkingSet::Diagonal {
                    working_response,
                    working_weights,
                },
            ] => {
                let Some(h) = spec
                    .design
                    .xt_diag_x_signed_op(SignedWeightsView::from_array(working_weights))
                    .ok()
                else {
                    return Ok(None);
                };
                (h, working_weights.clone(), working_response.clone())
            }
            [BlockWorkingSet::ExactNewton { hessian, .. }] => {
                let h = hessian.to_dense();
                if h.nrows() != spec.design.ncols() || h.ncols() != spec.design.ncols() {
                    return Ok(None);
                }
                // The exact-Newton block carries no IRLS pseudo-data; the
                // trace edf reads only the penalized Hessian, and the
                // downstream IRLS covariance path is unused for these
                // families (they report dispersion = 1). Match the joint
                // multi-block branch's zero-length convention.
                let working_len = states.first().map(|state| state.eta.len()).unwrap_or(0);
                (h, Array1::zeros(working_len), Array1::zeros(working_len))
            }
            _ => return Ok(None),
        };
        // The guard above ensures every penalty has a matching lambda, so the
        // zip visits all penalties.
        for (penalty, &lambda) in spec.penalties.iter().zip(lambdas.iter()) {
            let s_dense = penalty.as_dense_cow();
            h.scaled_add(lambda, &*s_dense);
        }
        // Exact-Newton families may return a Hessian assembled from directional
        // callbacks whose off-diagonal entries differ by floating-point order
        // or, for pseudo-Laplace tests, by a deliberately non-symmetric input
        // that is accepted only after symmetrization. Export the same symmetric
        // penalized Hessian used by the determinant/covariance path instead of
        // letting result assembly reject an otherwise valid fit geometry.
        symmetrize_dense_in_place(&mut h);
        return Ok(Some(FitGeometry {
            penalized_hessian: h.into(),
            working_weights,
            working_response,
        }));
    }

    let requires_explicit_joint_hessian = specs.iter().enumerate().any(|(idx, spec)| {
        custom_family_block_role(&spec.name, idx, specs.len())
            == crate::solver::estimate::BlockRole::LinkWiggle
    });
    let total_p: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    let Some(mut h) = exact_newton_joint_hessian_symmetrized(
        family,
        states,
        specs,
        total_p,
        "compute_joint_geometry",
    )?
    else {
        if requires_explicit_joint_hessian {
            return Err(
                "link-wiggle fits require an exact explicit joint Hessian for posterior sampling"
                    .to_string(),
            );
        }
        return Ok(None);
    };
    let ranges = block_param_ranges(specs);
    // `specs` and `per_block_log_lambdas` were checked to have equal length at
    // the top, so zipping them pairs every block with its log-lambdas.
    for (block_idx, (spec, block_log_lambdas)) in
        specs.iter().zip(per_block_log_lambdas.iter()).enumerate()
    {
        let lambdas = block_log_lambdas.mapv(f64::exp);
        if lambdas.len() != spec.penalties.len() {
            return Ok(None);
        }
        let (start, end) = ranges[block_idx];
        let block_dim = end - start;
        for (penalty, &scale) in spec.penalties.iter().zip(lambdas.iter()) {
            if scale == 0.0 {
                continue;
            }
            let dense = penalty.as_dense_cow();
            if dense.nrows() == block_dim && dense.ncols() == block_dim {
                // Block-local penalty: add onto this block's diagonal block.
                h.slice_mut(ndarray::s![start..end, start..end])
                    .scaled_add(scale, &*dense);
            } else if dense.nrows() == total_p && dense.ncols() == total_p {
                // Penalty already expressed in joint coordinates.
                h.scaled_add(scale, &*dense);
            } else {
                return Ok(None);
            }
        }
    }
    let working_len = states.first().map(|state| state.eta.len()).unwrap_or(0);
    Ok(Some(FitGeometry {
        penalized_hessian: h.into(),
        working_weights: Array1::zeros(working_len),
        working_response: Array1::zeros(working_len),
    }))
}

/// Fit a custom family using a flat prior on the smoothing parameters.
///
/// Thin convenience wrapper: delegates to `fit_custom_family_with_rho_prior`
/// with `crate::types::RhoPrior::Flat`.
///
/// # Errors
/// Returns whatever error `fit_custom_family_with_rho_prior` reports.
pub fn fit_custom_family<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    let flat_prior = crate::types::RhoPrior::Flat;
    fit_custom_family_with_rho_prior(family, specs, options, flat_prior)
}

/// Map reduced-space block states back to the raw block dimensions described
/// by `canonical.gauge`.
///
/// Every block's `beta` is replaced by the lifted `T_i · θ_i` (a selection-T
/// puts zeros at the dropped raw entries). Because `eta = design · beta` is
/// invariant under the transform, each state's reduced-space `eta` is carried
/// through unchanged.
fn lift_block_states_to_raw(
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    reduced: Vec<ParameterBlockState>,
) -> Vec<ParameterBlockState> {
    let theta_blocks: Vec<Array1<f64>> = reduced.iter().map(|state| state.beta.clone()).collect();
    let raw_betas = canonical.gauge.lift_block_betas(&theta_blocks);
    let mut lifted = Vec::with_capacity(reduced.len());
    for (state, beta) in reduced.into_iter().zip(raw_betas) {
        lifted.push(ParameterBlockState {
            beta,
            eta: state.eta,
        });
    }
    lifted
}

/// Lift a reduced-space conditional covariance and joint geometry back to raw
/// coordinates.
///
/// Both matrices are sandwiched with the joint block-diagonal transform
/// `T_full = blockdiag(T_i)` via `canonical.gauge.lift_covariance`. A
/// selection-T zero-pads the dropped raw rows/cols, so the lifted Hessian is
/// exactly the post-canonicalisation Hessian as seen in raw coordinates and is
/// rank-deficient by construction along the dropped directions (matching the
/// inner-solve geometry the canonical step produced). The working weights and
/// working response pass through untouched.
fn lift_fit_geometry_to_raw(
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    covariance_conditional: Option<Array2<f64>>,
    geometry: Option<FitGeometry>,
) -> (Option<Array2<f64>>, Option<FitGeometry>) {
    let gauge = &canonical.gauge;
    let raw_covariance = covariance_conditional.map(|reduced_cov| gauge.lift_covariance(&reduced_cov));
    let raw_geometry = geometry.map(
        |FitGeometry {
             penalized_hessian,
             working_weights,
             working_response,
         }| {
            let reduced_hessian = penalized_hessian.into_array();
            FitGeometry {
                penalized_hessian: gauge.lift_covariance(&reduced_hessian).into(),
                working_weights,
                working_response,
            }
        },
    );
    (raw_covariance, raw_geometry)
}

/// Everything `assemble_custom_family_fit_result` needs, besides the inner
/// solve result itself, to build the final fit result.
struct BlockwiseFitAssembly<'a> {
    /// Log smoothing parameters; exponentiated element-wise into the reported
    /// `lambdas` during assembly.
    rho_physical: Array1<f64>,
    /// Conditional covariance, if computed. Lifted to raw coordinates when
    /// `canonical` is `Some`, otherwise passed through as-is.
    covariance_conditional: Option<Array2<f64>>,
    /// Joint fit geometry, if computed. Used for the reduced-space EDF and then
    /// lifted to raw coordinates when `canonical` is `Some`, otherwise passed
    /// through as-is.
    geometry: Option<FitGeometry>,
    /// Identifiability canonicalisation to undo at assembly time. `None` means
    /// block states, covariance and geometry are forwarded unchanged and no
    /// EDF is precomputed.
    canonical: Option<&'a crate::solver::identifiability_canonical::CanonicalSpecs>,
    /// Block specs handed to `blockwise_fit_from_parts` alongside the
    /// assembled parts.
    result_specs: &'a [ParameterBlockSpec],
    /// Forwarded unchanged into the result parts.
    penalized_objective: f64,
    /// Forwarded unchanged into the result parts.
    outer_iterations: usize,
    /// Forwarded unchanged into the result parts.
    outer_gradient_norm: Option<f64>,
    /// Forwarded unchanged into the result parts.
    criterion_certificate: Option<crate::solver::outer_strategy::CriterionCertificate>,
    /// Forwarded unchanged into the result parts.
    outer_converged: bool,
    /// Static label attached to the `CustomFamilyError::Optimization` raised if
    /// `blockwise_fit_from_parts` fails.
    context: &'static str,
}

/// Build the final `UnifiedFitResult` from an inner blockwise solve and the
/// accompanying [`BlockwiseFitAssembly`].
///
/// `lambdas` are `exp(rho_physical)`; the reported `log_lambdas` are the logs
/// of those values clamped below at `1e-300`. When a canonicalisation is
/// supplied, the EDF is computed from the reduced geometry first, and then
/// block states, covariance and geometry are lifted back to raw coordinates.
/// Without one, everything is forwarded unchanged and no EDF is precomputed.
///
/// # Errors
/// A failure from `blockwise_fit_from_parts` is wrapped in
/// `CustomFamilyError::Optimization` tagged with `assembly.context`.
fn assemble_custom_family_fit_result(
    inner: BlockwiseInnerResult,
    assembly: BlockwiseFitAssembly<'_>,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    let context = assembly.context;
    let result_specs = assembly.result_specs;
    let lambdas = assembly.rho_physical.mapv(f64::exp);
    let log_lambdas = lambdas.mapv(|lambda| lambda.max(1e-300).ln());

    let (block_states, covariance_conditional, geometry, precomputed_edf) =
        match assembly.canonical {
            None => (
                inner.block_states,
                assembly.covariance_conditional,
                assembly.geometry,
                None,
            ),
            Some(canonical) => {
                // The EDF must be read off the reduced geometry before it is lifted.
                let edf = reduced_blockwise_edf(assembly.geometry.as_ref(), canonical, &lambdas);
                let raw_states = lift_block_states_to_raw(canonical, inner.block_states);
                let (raw_covariance, raw_geometry) = lift_fit_geometry_to_raw(
                    canonical,
                    assembly.covariance_conditional,
                    assembly.geometry,
                );
                (raw_states, raw_covariance, raw_geometry, edf)
            }
        };

    let parts = BlockwiseFitResultParts {
        block_states,
        log_likelihood: inner.log_likelihood,
        log_lambdas,
        lambdas,
        covariance_conditional,
        stable_penalty_term: 2.0 * inner.penalty_value,
        penalized_objective: assembly.penalized_objective,
        outer_iterations: assembly.outer_iterations,
        outer_gradient_norm: assembly.outer_gradient_norm,
        criterion_certificate: assembly.criterion_certificate,
        inner_cycles: inner.cycles,
        outer_converged: assembly.outer_converged,
        geometry,
        precomputed_edf,
    };

    blockwise_fit_from_parts(parts, result_specs)
        .map_err(|reason| CustomFamilyError::Optimization { context, reason })
}

/// Install the channel-aware `AdditiveBlockJacobian` callbacks implied by a
/// family's [`CustomFamily::output_channel_assignment`].
///
/// Multi-output families that assemble their specs by hand (or via the
/// low-level `fit_custom_family` API) declare a per-block output channel so
/// the pre-fit identifiability audit can route channel-aware rather than
/// mistaking a shared covariate basis for cross-block aliases (#558). A block
/// that already carries an explicit `jacobian_callback` is never overwritten
/// (the family wired its own, possibly β-dependent, multi-output Jacobian).
///
/// # Returns
/// * `Ok(None)` when nothing needs wiring, so the caller can keep borrowing
///   the original specs without an allocation: the family declares no
///   assignment, the assignment spans at most one output channel, or every
///   block already has a callback.
/// * `Ok(Some(wired))` with a cloned spec list in which every callback-less
///   block received an `AdditiveBlockJacobian` built from its effective design.
///
/// # Errors
/// Propagates `validate_blockspecs` failures, and returns
/// `CustomFamilyError::DimensionMismatch` when the assignment length differs
/// from the block count or a block's effective design cannot be obtained.
fn wire_output_channels<F: CustomFamily + ?Sized>(
    family: &F,
    specs: &[ParameterBlockSpec],
) -> Result<Option<Vec<ParameterBlockSpec>>, CustomFamilyError> {
    validate_blockspecs(specs)?;

    let channels = match family.output_channel_assignment(specs) {
        Some(channels) => channels,
        None => return Ok(None),
    };

    let (n_channels, n_blocks) = (channels.len(), specs.len());
    if n_channels != n_blocks {
        let reason = format!(
            "output_channel_assignment returned {} channels for {} blocks",
            n_channels, n_blocks,
        );
        return Err(CustomFamilyError::DimensionMismatch { reason });
    }

    // Output count is one past the highest declared channel index; a single
    // output channel is exactly the flat route, so there is nothing to wire.
    let n_family_outputs = channels
        .iter()
        .copied()
        .max()
        .map(|highest| highest + 1)
        .unwrap_or(1);
    if n_family_outputs <= 1 {
        return Ok(None);
    }

    // If every block is already family-wired, the channel-aware route is
    // already in effect — skip the clone of the spec list.
    let all_family_wired = specs.iter().all(|spec| spec.jacobian_callback.is_some());
    if all_family_wired {
        return Ok(None);
    }

    let mut wired = specs.to_vec();
    for (idx, (spec, own_output)) in wired
        .iter_mut()
        .zip(channels.iter().copied())
        .enumerate()
    {
        // Family-supplied callbacks (e.g. multinomial / location-scale) win.
        if spec.jacobian_callback.is_some() {
            continue;
        }
        // With no callback, the block's effective design at β=0 is its linear
        // design — the additive-block Jacobian of an `η_r = X_r β_r` channel.
        let design = match spec.effective_design("wire_output_channels") {
            Ok(design) => design,
            Err(e) => {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!("block {idx} effective design for channel wiring: {e}"),
                });
            }
        };
        spec.jacobian_callback = Some(Arc::new(AdditiveBlockJacobian {
            design,
            own_output,
            n_family_outputs,
        }));
    }
    Ok(Some(wired))
}

/// Decide whether an outer-smoothing `Err` is a POST-AUDIT NUMERICAL
/// pathology that the never-fail posterior-sampling rung can recover from
/// (gam#860), as opposed to an ill-posed input that must keep raising.
///
/// Only `EstimationError::RemlOptimizationFailed` can qualify, and only when
/// its message matches one of these signatures:
///
/// * "no candidate seeds passed outer startup validation" or "objective
///   returned a non-finite cost". All structural guards (the #531-class
///   identifiability audit, the #789B zero-events guard, the #859 cross-fit
///   alignment check) raise BEFORE the outer solver runs, so when every seed
///   is rejected during exact-eval validation (e.g. the #787 kappa-driven
///   penalty-topology dim-mismatch surfacing as a non-finite cost) the design
///   is structurally well-posed and a posterior mode exists to sample about.
/// * Either coupled exact-joint non-convergence exit ("exited the joint Newton
///   path" / "exhausted the joint Newton budget") — the #787-class
///   weak-identification floor reached two ways — PROVIDED the message does
///   not carry a structural diagnosis label.
///
/// Every other error (a genuine solver contract violation, dimension error,
/// etc.) keeps the hard raise.
fn outer_startup_failure_is_escalatable(err: &EstimationError) -> bool {
    let EstimationError::RemlOptimizationFailed(message) = err else {
        return false;
    };
    let mentions = |needle: &str| message.contains(needle);

    // Every seed was rejected at startup on a structurally-audited design.
    if mentions("no candidate seeds passed outer startup validation")
        || mentions("objective returned a non-finite cost")
    {
        return true;
    }

    // Data-driven inner non-convergence: the coupled exact-joint Newton path
    // could not drive a weakly-identified block's penalized stationarity
    // residual below tolerance at any screened seed. It arrives as a hard
    // `Err` from the inner solve (not the `Ok(!inner_converged)` retreat
    // sentinel), so without this route it would dead-end short of the
    // post-run escalation rung even though the best inner mode reached during
    // screening is a usable posterior mode.
    const COUPLED_JOINT_EXITS: [&str; 2] = [
        "coupled exact-joint inner solve exited the joint Newton path",
        "coupled exact-joint inner solve exhausted the joint Newton budget",
    ];
    // The same prefixes are also emitted for GENUINELY STRUCTURAL cert
    // refusals, identified by the trailing `; diagnosis: <label>` slot. A
    // rank-deficient joint design, an unresolved active set, or a cross-block
    // alias surfaced at fit time leaves the mode itself degenerate, so
    // sampling about it cannot help — those must keep hard-raising.
    const STRUCTURAL_DIAGNOSES: [&str; 3] = [
        "diagnosis: rank_deficient_H_pen",
        "diagnosis: active_set_incomplete",
        "diagnosis: aliasing_detected_at_fit",
    ];

    let coupled_joint_exit = COUPLED_JOINT_EXITS.iter().any(|&prefix| mentions(prefix));
    let structural = STRUCTURAL_DIAGNOSES.iter().any(|&label| mentions(label));
    coupled_joint_exit && !structural
}

/// Minimum effective degrees of freedom a penalized term must retain in the
/// outer λ-selection.
///
/// One effective dimension is the smallest non-arbitrary floor: it asserts
/// that the penalized component must explain at least ONE effective direction
/// of its own range space, i.e. it has not collapsed entirely onto its
/// unpenalized polynomial null space. It is NOT a tuning constant — `1.0` is
/// the boundary between "the smooth contributes" and "the smooth is
/// statistically indistinguishable from its null-space limit".
///
/// `effective_df_floor_rho_upper_bounds` compares the structural edf returned
/// by `unit_weight_term_edf` against this value when deriving per-coordinate
/// ρ upper bounds.
const EFFECTIVE_DF_FLOOR: f64 = 1.0;

/// Structural (unit-weight) effective degrees of freedom of one penalized
/// term at log smoothing parameter `rho = log λ`, given the design/penalty
/// generalized eigenvalues `γ_j` on the penalty range space:
///
/// ```text
/// edf(ρ) = Σ_j γ_j / (γ_j + e^ρ),   γ_j = (design range curvature)_j / (penalty)_j.
/// ```
///
/// Entries of `gammas` that are not strictly positive (including NaN)
/// contribute exactly zero. With `rho = -∞` each positive `γ_j` contributes
/// one, so the result is the count of positive entries.
///
/// This is the data-FREE edf: it is built from the design column Gram `XᵀX`
/// (unit weights), NOT the family's Fisher weight, so it does not depend on
/// where the inner solve sits on a near-flat Fisher surface. It is the
/// quantity whose collapse the #715/#684 over-shrinkage describes — when the
/// Fisher curvature vanishes the REML objective flattens in ρ and the
/// optimizer lets λ drift past the point where this edf drops below the floor.
fn unit_weight_term_edf(gammas: &[f64], rho: f64) -> f64 {
    let lambda = rho.exp();
    // Fraction of one degree of freedom retained along a single direction.
    let retained = |gamma: f64| -> f64 {
        if gamma > 0.0 {
            gamma / (gamma + lambda)
        } else {
            0.0
        }
    };
    gammas.iter().copied().map(retained).sum()
}

/// Generalized eigenvalues `γ_j` of the design column Gram `G = XᵀX` against the
/// penalty `S` on `range(S)`, computed structurally (unit weights).
///
/// These are the eigenvalues of the pencil `(UᵀG U, D)` where `S = U D Uᵀ` and
/// the index runs over `range(S)` (the positive eigenvalues `d_j` of `S`).
/// Equivalently they are the eigenvalues of the symmetric matrix
///
/// ```text
/// B = D^{-1/2} (Uᵀ G U) D^{-1/2}   restricted to range(S),
/// ```
///
/// with `D = diag(d_j)` over the range and `U` the corresponding penalty
/// eigenvectors. With these `γ_j` the structural effective df is the EXACT
/// trace identity
///
/// ```text
/// Σ_j γ_j/(γ_j + λ) = tr{ G (G + λ S)⁻¹ }   for all λ > 0.
/// ```
///
/// This is NOT a per-direction Rayleigh quotient `(u_jᵀ G u_j)/d_j`: that would
/// keep only the diagonal of `B` and is correct only when `G` and `S` commute
/// (are simultaneously diagonalizable). Smooth Gram/penalty pairs generally do
/// not commute, so the off-diagonal coupling of `B` must be retained — it is
/// what makes the eigenvalue sum match the trace identity above.
///
/// # Returns
/// `Some(gammas)` with one entry per retained range-space direction;
/// non-finite or non-positive eigenvalues of `B` are reported as `0.0`.
///
/// `None` (caller falls back to the uniform ρ bound) whenever the geometry
/// cannot be materialized safely as a `p×p` block-local pair: the design has
/// no columns, the dense penalty or dense design does not have `p` columns
/// (`Blockwise`/total-dim layouts are skipped rather than risk a mis-projected
/// curvature that could bias the REML selection), the penalty is numerically
/// zero or has an empty range space, or an eigendecomposition fails.
///
/// The dense design and its Gram matrix are only formed once the penalty
/// range space is known to be non-empty, so the early-out paths stay cheap.
fn design_penalty_range_gammas(design: &DesignMatrix, penalty: &PenaltyMatrix) -> Option<Vec<f64>> {
    let p = design.ncols();
    if p == 0 {
        return None;
    }
    let s_dense = penalty.to_dense();
    if s_dense.nrows() != p || s_dense.ncols() != p {
        // Blockwise/total-dim layout or shape mismatch: not safely projectable
        // here. Fall back to the uniform bound.
        return None;
    }
    // Eigendecompose the penalty to find its range space S = U D Uᵀ.
    let (s_evals, s_evecs) = s_dense.eigh(Side::Lower).ok()?;
    let s_max = s_evals.iter().fold(0.0_f64, |a, &b| a.max(b.abs()));
    if !(s_max > 0.0) {
        return None;
    }
    let s_thresh = positive_eigenvalue_threshold(s_evals.as_slice()?);
    // Collect the range-space eigenvector indices (d_j above the numerical-zero
    // threshold) paired with their inverse square-root weights d_j^{-1/2}.
    // Directions in ker(S) are dropped: they are unpenalized and do not enter
    // the structural edf of this term.
    let mut range: Vec<(usize, f64)> = Vec::new();
    for (j, &dj) in s_evals.iter().enumerate() {
        if dj <= s_thresh {
            continue; // null space of S: not a penalized direction.
        }
        range.push((j, 1.0 / dj.sqrt()));
    }
    let r = range.len();
    if r == 0 {
        return None;
    }
    // Only now pay for the dense design and its column Gram G = XᵀX.
    let x = design.to_dense();
    if x.ncols() != p {
        return None;
    }
    let gram = x.t().dot(&x);
    // Form the symmetric pencil matrix
    //   B = D_r^{-1/2} (U_rᵀ G U_r) D_r^{-1/2}   (r×r),
    // whose eigenvalues are the generalized eigenvalues of (UᵀGU, D) on
    // range(S). Scaling U_r's columns by d_j^{-1/2} up front gives
    //   Y = U_r D_r^{-1/2}  (p×r),   B = Yᵀ G Y,
    // which is symmetric by construction (Gram of G in the Y-columns).
    let mut y = Array2::<f64>::zeros((p, r));
    for (col, &(src, w)) in range.iter().enumerate() {
        let u = s_evecs.column(src);
        for (dst, &u_val) in y.column_mut(col).iter_mut().zip(u.iter()) {
            *dst = u_val * w;
        }
    }
    // Symmetrize in place (defensive against round-off) before the symmetric
    // solver; no separate copy of B is needed.
    let mut b_sym = y.t().dot(&gram).dot(&y);
    for i in 0..r {
        for j in (i + 1)..r {
            let avg = 0.5 * (b_sym[(i, j)] + b_sym[(j, i)]);
            b_sym[(i, j)] = avg;
            b_sym[(j, i)] = avg;
        }
    }
    // The eigenvalues of B are the γ_j (data-free, unit-weight).
    let (b_evals, _) = b_sym.eigh(Side::Lower).ok()?;
    // A penalized direction with no design support has γ→0: edf→0 for any
    // λ>0, so it cannot be floored by bounding ρ. Clamp tiny negative
    // round-off (and anything non-finite) to 0; it never contributes to the
    // retained df sum.
    let gammas: Vec<f64> = b_evals
        .iter()
        .map(|&gj| if gj.is_finite() && gj > 0.0 { gj } else { 0.0 })
        .collect();
    if gammas.is_empty() {
        return None;
    }
    Some(gammas)
}

/// Per-outer-coordinate ρ UPPER bound enforcing the effective-df floor.
///
/// For each penalized term, the structural unit-weight edf `Σ_j γ_j/(γ_j+e^ρ)`
/// is monotone decreasing in ρ. The bound is the ρ at which it equals
/// `EFFECTIVE_DF_FLOOR` (when the term's max attainable edf exceeds the floor),
/// found by bisection on the closed-form edf. Tied coordinates (shared precision
/// label) take the TIGHTEST (smallest) per-term bound: the shared λ must retain
/// the floor for EVERY contributing term, so the binding constraint is the most
/// restrictive one — relaxing to a looser term's bound would let some other term
/// fall below its floor. Every coordinate is additionally capped at the caller's
/// uniform `ceiling` so this can only TIGHTEN, never loosen, the existing bound.
///
/// A term keeps the uniform `ceiling` when any of the following holds:
/// its penalty is fixed (no outer coordinate); its geometry cannot be projected
/// (`design_penalty_range_gammas` returns `None`); its maximum attainable edf
/// does not exceed the floor; its edf at `ceiling` already meets the floor; or
/// its edf at `-ceiling` is already below the floor, so the crossing lies
/// outside `[-ceiling, ceiling]` and no bound inside the box can restore it.
///
/// This enters ONLY the λ-selection domain. The inner β solve is exact
/// CONDITIONAL on the selected λ, so there is no per-λ approximation (same
/// discipline as the #747 solver-only ridge). It is NOT, however, a bias-free
/// no-op: whenever the unconstrained REML optimum lies beyond this upper bound,
/// the bound changes the SELECTED λ, and the selected λ changes the fitted
/// β̂ = argmin{−ℓ + ½λ βᵀSβ} (∂β̂/∂λ = −(H + λS)⁻¹ S β̂ ≠ 0). The floor is an
/// explicit smoothing-regularization constraint on the λ-selection — it
/// deliberately moves the estimate away from the (flat-Fisher) null-space
/// collapse, not a transparent reparameterization. It is the λ-upper-side dual
/// of the #752 full-subspace logdet work — there the value/gradient subspace
/// was fixed on the λ→∞ side of a near-collinear block; here the selection
/// domain is bounded so a flat Fisher surface cannot push a term past
/// null-space collapse (#715/#684).
///
/// # Parameters
/// * `specs` — block specs whose penalties are walked in order; the running
///   penalty index is looked up in `layout.physical_to_outer`.
/// * `layout` — maps each physical penalty index to its outer coordinate
///   (or none for a fixed penalty).
/// * `n_rho` — length of the returned bound vector.
/// * `ceiling` — uniform upper bound every entry starts from.
fn effective_df_floor_rho_upper_bounds(
    specs: &[ParameterBlockSpec],
    layout: &PenaltyLabelLayout,
    n_rho: usize,
    ceiling: f64,
) -> Array1<f64> {
    let mut upper = Array1::<f64>::from_elem(n_rho, ceiling);
    let mut physical = 0usize;
    for spec in specs {
        for penalty in &spec.penalties {
            let outer = layout.physical_to_outer.get(physical).copied().flatten();
            physical += 1;
            let Some(outer) = outer else {
                continue; // fixed penalty: not an outer coordinate.
            };
            let Some(gammas) = design_penalty_range_gammas(&spec.design, penalty) else {
                continue; // un-projectable geometry: keep the uniform ceiling.
            };
            // Maximum attainable structural edf (ρ → −∞) is the number of
            // design-supported penalized directions. If it cannot exceed the
            // floor even unpenalized, the floor is not enforceable for this term
            // (a single-dimension range space with the floor at its own cap), so
            // keep the uniform ceiling.
            let edf_max = unit_weight_term_edf(&gammas, f64::NEG_INFINITY);
            if !(edf_max > EFFECTIVE_DF_FLOOR) {
                continue;
            }
            // If edf at the ceiling still meets the floor, the uniform ceiling
            // already retains enough df — keep it.
            if unit_weight_term_edf(&gammas, ceiling) >= EFFECTIVE_DF_FLOOR {
                continue;
            }
            // Bracket check on the lower endpoint. If edf is already below the
            // floor at −ceiling, the crossing lies outside the box and the
            // bisection below would just converge onto −ceiling. Whether it
            // lands exactly there (and is rejected by the final guard) or one
            // ulp above it (collapsing the upper bound onto the lower bound)
            // would depend on floating-point rounding, so decide it explicitly.
            if unit_weight_term_edf(&gammas, -ceiling) < EFFECTIVE_DF_FLOOR {
                continue;
            }
            // Bisect for ρ* with edf(ρ*) = floor on [−ceiling, ceiling]; edf is
            // monotone decreasing in ρ, and the endpoints now bracket the root.
            let mut lo = -ceiling;
            let mut hi = ceiling;
            for _ in 0..64 {
                let mid = 0.5 * (lo + hi);
                if unit_weight_term_edf(&gammas, mid) >= EFFECTIVE_DF_FLOOR {
                    lo = mid;
                } else {
                    hi = mid;
                }
            }
            let rho_star = 0.5 * (lo + hi);
            // Tied coordinates: take the tightest (smallest) bound across terms,
            // so every term sharing this λ retains at least the floor.
            let slot = &mut upper[outer];
            if rho_star > -ceiling && rho_star < *slot {
                *slot = rho_star;
            }
        }
    }
    upper
}

pub fn fit_custom_family_with_rho_prior<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    // Multi-output families that omitted the per-block channel callback get it
    // installed here from their declared `output_channel_assignment`, so the
    // identifiability audit routes channel-aware (single source of truth for
    // the channel-wiring; no per-test/per-builder duplication — #558).
    let wired = wire_output_channels(family, specs)?;
    let raw_specs: &[ParameterBlockSpec] = wired.as_deref().unwrap_or(specs);
    validate_blockspecs(raw_specs)?;

    // Pre-fit cross-block identifiability canonicalisation. Every
    // blockwise fit path in the tree (standard, gaussian/binomial
    // location-scale, survival, BMS, transformation-normal, custom
    // families) reaches this entry point with a finalised
    // `ParameterBlockSpec` list, so wiring the canonicalisation here
    // covers all four `solver::workflow.rs` entry points plus every
    // direct caller of `fit_custom_family` without each family needing
    // its own canonicalisation hook.
    //
    // Contract: specs arrive *after* `nullspace-lead`'s
    // `joint_null_rotation` absorption. The canonical step inspects
    // post-rotation columns only, runs the joint RRQR identifiability
    // audit, and converts attributed cross-block drops into a per-block
    // selection transform `T_i`. The inner solve runs in the reduced
    // coordinate space; coefficients and joint geometry are lifted back
    // to the raw space at result assembly via `T_i` and the joint
    // block-diagonal `T_full = blockdiag(T_i)`.
    //
    // An audit that is fatal *without* attributed drops (the >2-way
    // structural alias case where RRQR couldn't pin redundancy onto a
    // single block/column) still aborts: silently absorbing it would
    // change model semantics beyond what canonicalisation can repair.
    // Per the panic-vs-Err contract: never panic mid-construction.
    let canonical_started = std::time::Instant::now();
    let canonical_n_rows = raw_specs.first().map(|s| s.design.nrows()).unwrap_or(0);
    let canonical_n_cols_raw: usize = raw_specs.iter().map(|s| s.design.ncols()).sum();
    log::info!(
        "[STAGE] identifiability canonicalise: start blocks={} n={} p_total_raw={}",
        raw_specs.len(),
        canonical_n_rows,
        canonical_n_cols_raw,
    );
    let canonical =
        crate::solver::identifiability_canonical::canonicalize_for_identifiability(raw_specs)?;
    let canonical_n_cols_red: usize = canonical
        .reduced_specs
        .iter()
        .map(|s| s.design.ncols())
        .sum();
    log::info!(
        "[STAGE] identifiability canonicalise: end elapsed={:.3}s alias_pairs={} dropped_cols={} \
         p_total_raw={} p_total_reduced={} fatal_attributed={}",
        canonical_started.elapsed().as_secs_f64(),
        canonical.audit.aliased_pairs.len(),
        canonical.audit.dropped_columns.len(),
        canonical_n_cols_raw,
        canonical_n_cols_red,
        canonical.audit.fatal,
    );
    if !canonical.audit.aliased_pairs.is_empty() {
        log::info!("[identifiability audit] {}", canonical.audit.summary);
        // Aggregate by (block_a, block_b) so the log stays bounded by the
        // block-pair count rather than the quadratic direction-pair count
        // — a few wide blocks alone produce 100+ pair-lines and bury the
        // useful structural signal. INFO carries the cluster shape (count,
        // overlap range, perfect-collinearity count); DEBUG prints the
        // worst three sample pairs per cluster for forensic users.
        let mut by_pair: BTreeMap<(&str, &str), Vec<&_>> = BTreeMap::new();
        for pair in &canonical.audit.aliased_pairs {
            by_pair
                .entry((pair.block_a.as_str(), pair.block_b.as_str()))
                .or_default()
                .push(pair);
        }
        for ((a, b), pairs) in &by_pair {
            let count = pairs.len();
            let max = pairs
                .iter()
                .map(|p| p.overlap)
                .fold(f64::NEG_INFINITY, f64::max);
            let min = pairs
                .iter()
                .map(|p| p.overlap)
                .fold(f64::INFINITY, f64::min);
            let near_one = pairs.iter().filter(|p| p.overlap >= 0.9999).count();
            log::info!(
                "[identifiability audit] alias-cluster {a} ~ {b}: {count} direction-pair{plural} \
                 (overlap {min:.4}..{max:.4}; {near_one} ≥0.9999)",
                plural = if count == 1 { "" } else { "s" },
            );
        }
        if log::log_enabled!(log::Level::Debug) {
            for ((a, b), pairs) in &by_pair {
                let mut sorted = pairs.clone();
                sorted.sort_by(|p, q| {
                    q.overlap
                        .partial_cmp(&p.overlap)
                        .unwrap_or(std::cmp::Ordering::Equal)
                });
                for pair in sorted.iter().take(3) {
                    log::debug!(
                        "[identifiability audit]   sample {a}[{ai}] ~ {b}[{bi}] overlap={ov:.4}",
                        ai = pair.direction_a,
                        bi = pair.direction_b,
                        ov = pair.overlap,
                    );
                }
            }
        }
    }
    for drop in &canonical.audit.dropped_columns {
        log::info!(
            "[identifiability audit] dropped: block='{}' local_col={} ({})",
            drop.block,
            drop.column,
            drop.reason,
        );
    }
    let specs: &[ParameterBlockSpec] = &canonical.reduced_specs;
    let penalty_counts = validate_blockspecs(specs)?;

    let label_layout = penalty_label_layout(specs, penalty_counts.clone())?;
    let rho0 = label_layout.initial_rho.clone();
    let (persistent_warm_start_key, persistent_warm_start) =
        load_persistent_custom_family_warm_start::<F>(family, specs, options, rho0.len());

    if rho0.is_empty() {
        let physical_rho0 = expand_labeled_log_lambdas(&rho0, &label_layout)?;
        let per_block = split_labeled_log_lambdas(&rho0, &label_layout)?;
        let mut inner = inner_blockwise_fit(
            family,
            specs,
            &per_block,
            options,
            persistent_warm_start.as_ref(),
        )?;
        refresh_all_block_etas(family, specs, &mut inner.block_states)?;
        let covariance_conditional = compute_joint_covariance_required(
            family,
            specs,
            &inner.block_states,
            &per_block,
            options,
        )?;
        let reml_term = if options.use_remlobjective {
            0.5 * (inner.block_logdet_h - inner.block_logdet_s)
        } else {
            0.0
        };
        let geometry = compute_joint_geometry(family, specs, &inner.block_states, &per_block)
            .map_err(|reason| CustomFamilyError::Optimization {
                context: "fit_custom_family no-smoothing joint geometry",
                reason,
            })?;
        let penalized_objective = checked_penalizedobjective(
            inner.log_likelihood,
            inner.penalty_value,
            reml_term,
            "custom-family fit without smoothing parameters",
        )
        .map_err(|reason| CustomFamilyError::Optimization {
            context: "fit_custom_family no-smoothing penalized objective",
            reason,
        })?;
        let warm_start = constrained_warm_start_from_inner(&rho0, &inner);
        store_persistent_custom_family_warm_start(
            persistent_warm_start_key.as_deref(),
            specs,
            &warm_start,
        );
        let inner_converged = inner.converged;
        return assemble_custom_family_fit_result(
            inner,
            BlockwiseFitAssembly {
                rho_physical: physical_rho0,
                covariance_conditional,
                geometry,
                canonical: Some(&canonical),
                result_specs: raw_specs,
                penalized_objective,
                outer_iterations: 0,
                outer_gradient_norm: None,
                criterion_certificate: None,
                outer_converged: inner_converged,
                context: "fit_custom_family no-smoothing result assembly",
            },
        );
    }

    // Exact Hessians are primary whenever the assembled family can supply them.
    // If a particular outer step is ill-conditioned, strategy fallback handles
    // the downgrade; we do not suppress second-order capability preemptively
    // based on the presence of a wiggle block.
    if options.inner_max_cycles <= 1 && options.outer_max_iter <= 1 {
        log::info!(
            "[OUTER] custom family: skipping smoothing outer solve for explicit one-cycle inner probe"
        );
        let per_block = split_labeled_log_lambdas(&rho0, &label_layout)?;
        let mut inner = inner_blockwise_fit(family, specs, &per_block, options, None)?;
        refresh_all_block_etas(family, specs, &mut inner.block_states).map_err(|reason| {
            CustomFamilyError::Optimization {
                context: "fit_custom_family one-cycle eta refresh",
                reason,
            }
        })?;
        let penalized_objective = inner_penalized_objective(
            &inner,
            include_exact_newton_logdet_h(family, options),
            include_exact_newton_logdet_s(family, options),
            "custom-family explicit one-cycle inner probe",
        )
        .map_err(|reason| CustomFamilyError::Optimization {
            context: "fit_custom_family one-cycle penalized objective",
            reason,
        })?;
        let physical_rho0 = expand_labeled_log_lambdas(&rho0, &label_layout)?;
        let inner_converged = inner.converged;
        return assemble_custom_family_fit_result(
            inner,
            BlockwiseFitAssembly {
                rho_physical: physical_rho0,
                covariance_conditional: None,
                geometry: None,
                canonical: Some(&canonical),
                result_specs: raw_specs,
                penalized_objective,
                outer_iterations: 0,
                outer_gradient_norm: Some(0.0),
                criterion_certificate: None,
                outer_converged: inner_converged,
                context: "fit_custom_family one-cycle result assembly",
            },
        );
    }

    use crate::estimate::EstimationError;
    use crate::solver::outer_strategy::{FallbackPolicy, OuterEval, OuterEvalOrder, OuterProblem};

    let screening_cap = Arc::new(AtomicUsize::new(0));
    let outer_inner_cap = options
        .outer_inner_max_iterations
        .clone()
        .unwrap_or_else(|| Arc::new(AtomicUsize::new(options.inner_max_cycles.max(1))));
    outer_inner_cap.store(options.inner_max_cycles.max(1), Ordering::Relaxed);
    let mut outer_options = options.clone();
    outer_options.screening_max_inner_iterations = Some(Arc::clone(&screening_cap));
    outer_options.outer_inner_max_iterations = Some(Arc::clone(&outer_inner_cap));

    let n_rho = rho0.len();
    let (cap_gradient, cap_hessian) =
        custom_family_outer_derivatives(family, specs, &outer_options);
    let derivative_policy = family.outer_derivative_policy(specs, 0, &outer_options);
    let hessian = cap_hessian;
    let need_outer_hessian = hessian.is_analytic();
    log::info!(
        "[OUTER] custom family derivative-policy: n_params={} gradient={:?} hessian={:?} capability={:?} requested_outer_hessian={} predicted_gradient_work={} predicted_hessian_work={} inner_hvp_available={} outer_hvp_available={} outer_dense_available={}",
        n_rho,
        cap_gradient,
        hessian,
        derivative_policy.capability,
        need_outer_hessian,
        derivative_policy.predicted_gradient_work,
        derivative_policy.predicted_hessian_work,
        family.inner_coefficient_hessian_hvp_available(specs),
        family.outer_hyper_hessian_hvp_available(specs),
        family.outer_hyper_hessian_dense_available(specs),
    );
    let outer_max_iter = cost_gated_first_order_max_iter(
        options.outer_max_iter,
        family.coefficient_gradient_cost(specs),
        need_outer_hessian,
    );
    let bfgs_step_cap = first_order_bfgs_loglambda_step_cap(need_outer_hessian);
    if outer_max_iter < options.outer_max_iter {
        log::info!(
            "[OUTER] custom family: first-order work gate reduced outer_max_iter {} -> {}",
            options.outer_max_iter,
            outer_max_iter,
        );
    }
    // EFS / HybridEfs structural property (`H^{-1/2} B_k H^{-1/2} ≽ 0` plus a
    // parameter-independent nullspace, Wood-Fasiolo) fails for multi-block
    // families whose joint likelihood Hessian depends on β, so fixed-point is
    // disabled for exactly those families. The flag itself is not gated on the
    // derivative capability; it is intended to matter only for genuinely
    // first-order capabilities, because exact-Hessian capabilities route to ARC
    // before EFS is considered.
    let multi_block_beta_dependent =
        specs.len() > 1 && family.exact_newton_joint_hessian_beta_dependent();
    // Exact-Hessian plans must fail on their own terms rather than silently
    // retrying on a quasi-Newton surface. First-order-only families keep the
    // automatic cascade because there is no second-order geometry to discard.
    let fallback_policy = if need_outer_hessian {
        FallbackPolicy::Disabled
    } else {
        FallbackPolicy::Automatic
    };
    let problem = OuterProblem::new(n_rho)
        .with_gradient(cap_gradient)
        .with_hessian(hessian)
        .with_disable_fixed_point(multi_block_beta_dependent)
        .with_fallback_policy(fallback_policy)
        .with_tolerance(options.outer_tol)
        .with_max_iter(outer_max_iter)
        .with_bfgs_step_cap(bfgs_step_cap)
        .with_seed_config(family.outer_seed_config(n_rho))
        .with_initial_rho(rho0.clone())
        .with_screen_initial_rho(options.screen_initial_rho)
        // Per-coordinate ρ box bounds. The uniform ceiling of 10 is the
        // belt-and-suspenders cap: λ = exp(10) ≈ 22k is already extremely strong
        // shrinkage, and the bound keeps the optimizer out of the dead-flat
        // λ ≈ 10⁹ region where ARC's quadratic model breaks down, the retry-stall
        // detector fires, and downstream empty-block_states crashes surface.
        //
        // ON TOP of that uniform ceiling, each penalized term's UPPER bound is
        // tightened to the ρ at which its structural (unit-weight) effective df
        // would fall to one — the EFFECTIVE_DF_FLOOR. Near a flat Fisher surface
        // (multinomial simplex boundary diag(p)−ppᵀ→0, #715; Gaussian log-σ on a
        // gently-varying scale, #684) the REML criterion loses ρ-curvature and
        // the optimizer would otherwise let some λ_{class,term} drift past the
        // point where the term collapses onto its unpenalized polynomial null
        // space, over-smoothing the cubic/sigmoid/log-σ signal below the mature
        // reference. The floor is derived from the penalty RANGE-SPACE
        // eigenstructure (design/penalty generalized eigenvalues), not from the
        // vanishing Fisher weight, and enters ONLY the λ-selection domain — the
        // inner β solve at the selected ρ is unchanged and exact, so the
        // converged β is unbiased (cf. the #747 solver-only ridge). This is the
        // λ-upper-side dual of the #752 full-subspace logdet work.
        .with_bounds(
            Array1::<f64>::from_elem(n_rho, -10.0),
            effective_df_floor_rho_upper_bounds(specs, &label_layout, n_rho, 10.0),
        );
    // Install the seed-screening cap only when initial-rho screening is
    // wanted. A caller that pins an already-identified `initial_rho` and
    // opts out (`screen_initial_rho == false`) leaves the OuterConfig
    // screening cap `None`, so `should_screen_seeds` short-circuits and the
    // screening cascade never runs. This is the lever the survival
    // constant-scale (parametric-AFT) regime uses: its time-warp ρ seed is
    // pinned AT the inner ρ box bound (the affine-baseline limit) on a
    // dead-flat, statistically-unidentified time ridge where every capped
    // proxy fit collapses to non-finite cost and the cascade escalates to a
    // full uncapped inner solve per seed on the near-singular Hessian — the
    // multi-minute no-iteration-log stall (#736, #735, #721). With the cap
    // unset, the pinned seed flows straight to the outer solver, which
    // certifies box-constraint stationarity at iteration 0. Every other
    // custom-family caller defaults `screen_initial_rho = true` and keeps
    // full screening; genuinely flexible scale/spatial survival fits carry
    // log-sigma penalties, never set the flag false, and screen normally.
    let problem = if options.screen_initial_rho {
        problem.with_screening_cap(Arc::clone(&screening_cap))
    } else {
        problem
    };
    // Attach the workflow-level warm-start session if one was threaded
    // through. This makes the custom-family outer optimizer (BFGS / ARC
    // depending on derivative capabilities) use the same persistent
    // cache infrastructure as standard REML — every accepted outer step
    // is checkpointed to disk, every fit starts by consulting the disk
    // for a prior best iterate. Without this, every survival-marginal-
    // slope / GAMLSS / latent fit starts cold even when a converged ρ
    // from a near-identical prior fit is sitting in `~/.cache/gam/warm`.
    let problem = if let Some(session) = options.cache_session.clone() {
        let key_hex = session.key().to_hex();
        log::info!(
            "[CACHE] attach key={}.. family-tag={} backend=outer-strategy mirrors={}",
            &key_hex[..8.min(key_hex.len())],
            std::any::type_name::<F>()
                .rsplit("::")
                .next()
                .unwrap_or("?"),
            options.cache_mirror_sessions.len(),
        );
        let mut p = problem.with_cache_session(session);
        if !options.cache_mirror_sessions.is_empty() {
            p = p.with_cache_mirror_sessions(options.cache_mirror_sessions.clone());
        }
        p
    } else {
        problem
    };

    // Robustness is unconditional, so escalation is always armed: the inner-non-
    // convergence branch inside `eval_outer` marks a trial rho *infeasible*
    // (recoverable) rather than hard-erroring, letting the outer optimizer retreat
    // and the run reach the terminal HMC sampling rung instead of dead-ending
    // before it (the gap `verify` located at this site).
    let eval_outer = |outer: &mut CustomOuterState,
                      rho: &Array1<f64>,
                      order: OuterEvalOrder|
     -> Result<OuterEval, EstimationError> {
        let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
        let request_hessian =
            matches!(order, OuterEvalOrder::ValueGradientHessian) && need_outer_hessian;
        let eval_result = match outerobjectivegradienthessian_labeled(
            family,
            specs,
            &outer_options,
            &label_layout,
            rho,
            warm_ref,
            &rho_prior,
            if request_hessian {
                EvalMode::ValueGradientHessian
            } else {
                EvalMode::ValueAndGradient
            },
        ) {
            Ok(eval) if !eval.inner_converged => {
                outer.warm_cache = Some(eval.warm_start.clone());
                outer.last_error = Some("custom-family inner solve did not converge".to_string());
                // Recoverable: this trial rho is infeasible (inner solve did not
                // converge), so the outer optimizer retreats rather than the whole
                // run hard-erroring. When the search ultimately reports
                // `converged == false`, the post-run rung samples the proper
                // posterior (never-fail).
                return Ok(OuterEval::infeasible(rho.len()));
            }
            Ok(eval)
                if eval.objective.is_finite()
                    && eval.gradient.iter().all(|v| v.is_finite())
                    && match &eval.outer_hessian {
                        crate::solver::outer_strategy::HessianResult::Analytic(hessian) => {
                            hessian.iter().all(|v| v.is_finite())
                        }
                        crate::solver::outer_strategy::HessianResult::Operator(op) => {
                            !request_hessian || op.dim() == rho.len()
                        }
                        crate::solver::outer_strategy::HessianResult::Unavailable => {
                            !request_hessian
                        }
                    } =>
            {
                let warm_start = eval.warm_start.clone();
                let gradient_norm = eval
                    .gradient
                    .iter()
                    .map(|value| value * value)
                    .sum::<f64>()
                    .sqrt();
                update_custom_outer_inner_cap_from_warm_start(
                    &outer_options,
                    &warm_start,
                    Some(gradient_norm),
                    &mut outer.initial_gradient_norm,
                );
                outer.warm_cache = Some(warm_start.clone());
                store_persistent_custom_family_warm_start(
                    persistent_warm_start_key.as_deref(),
                    specs,
                    &warm_start,
                );
                outer.last_error = None;
                eval
            }
            Ok(_) => {
                outer.last_error =
                    Some("custom-family outer objective/derivatives became non-finite".to_string());
                // Recoverable (data-driven): the objective/derivatives became
                // non-finite at this trial rho (e.g. separation / near-singular
                // information), so the outer optimizer retreats from this infeasible
                // point rather than the whole run hard-erroring. When the search
                // ultimately reports `converged == false`, the post-run rung samples
                // the proper posterior (never-fail).
                return Ok(OuterEval::infeasible(rho.len()));
            }
            Err(e) => {
                // Genuine eval-error (internal computation failure: linalg error,
                // etc.) — NOT data-driven. Leave as a hard Err even when escalation
                // is armed: a real bug must surface, not be silently sampled over.
                // Only the "did not converge" / "non-finite objective" data-driven
                // paths above convert to infeasible-when-armed.
                outer.last_error = Some(e.clone());
                return Err(EstimationError::RemlOptimizationFailed(e));
            }
        };
        let inner_beta_hint = Some(Array1::from_iter(
            eval_result
                .warm_start
                .block_beta
                .iter()
                .flat_map(|beta| beta.iter().copied()),
        ));
        Ok(OuterEval {
            cost: eval_result.objective,
            gradient: eval_result.gradient,
            hessian: eval_result.outer_hessian,
            inner_beta_hint,
        })
    };

    let mut obj = problem.build_objective_with_screening_proxy(
        CustomOuterState::new(persistent_warm_start.clone()),
        |outer: &mut CustomOuterState, rho: &Array1<f64>| {
            // Always use warm cache when available — the previous inner solution
            // gives a much better starting point. This was previously disabled for
            // exact-Hessian families, forcing every inner solve to start from
            // scratch (5-10 Newton steps instead of 1-2 with warm start).
            let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
            match outerobjectivegradienthessian_labeled(
                family,
                specs,
                &outer_options,
                &label_layout,
                rho,
                warm_ref,
                &rho_prior,
                EvalMode::ValueOnly,
            ) {
                Ok(eval) if eval.inner_converged && eval.objective.is_finite() => {
                    outer.warm_cache = Some(eval.warm_start);
                    outer.last_error = None;
                    Ok(eval.objective)
                }
                Ok(eval) => {
                    outer.warm_cache = Some(eval.warm_start);
                    outer.last_error = Some(
                        "custom-family value-only inner solve did not converge or objective was non-finite"
                            .to_string(),
                    );
                    // Recoverable (data-driven): this value-only probe is the
                    // line-search cost the outer optimizer calls most often. A
                    // non-converged inner solve / non-finite objective at this trial
                    // rho means the point is infeasible — return an infinite cost so
                    // the line search retreats, rather than hard-erroring out of
                    // `problem.run` and bypassing the post-run escalation (sampling)
                    // rung. When the search reports `converged == false` the never-fail
                    // rung samples the proper posterior.
                    Ok(f64::INFINITY)
                }
                Err(e) => {
                    // Genuine eval-error (internal computation failure) — NOT
                    // data-driven. Leave as a hard Err even when escalation is armed
                    // so a real bug surfaces instead of being silently sampled over.
                    outer.last_error = Some(e.clone());
                    Err(EstimationError::RemlOptimizationFailed(e))
                }
            }
        },
        |outer: &mut CustomOuterState, rho: &Array1<f64>| {
            eval_outer(
                outer,
                rho,
                if need_outer_hessian {
                    OuterEvalOrder::ValueGradientHessian
                } else {
                    OuterEvalOrder::ValueAndGradient
                },
            )
        },
        |outer: &mut CustomOuterState, rho: &Array1<f64>, order: OuterEvalOrder| {
            eval_outer(outer, rho, order)
        },
        Some(|outer: &mut CustomOuterState| {
            outer.reset();
        }),
        Some(|outer: &mut CustomOuterState, rho: &Array1<f64>| {
            if label_layout.has_tied_coordinates() {
                return Err(EstimationError::RemlOptimizationFailed(
                    "custom-family EFS is not available for tied coefficient-group precision labels"
                        .to_string(),
                ));
            }
            let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
            match outerobjectiveefs(
                family,
                specs,
                &outer_options,
                &label_layout.penalty_counts,
                rho,
                warm_ref,
                rho_prior.clone(),
            ) {
                Ok((eval, warm, true)) => {
                    outer.warm_cache = Some(warm);
                    outer.last_error = None;
                    Ok(eval)
                }
                Ok((_eval, warm, false)) => {
                    outer.warm_cache = Some(warm);
                    outer.last_error =
                        Some("custom-family EFS inner solve did not converge".to_string());
                    // Intentionally LEFT as a hard Err even when escalation is armed.
                    // Unlike the BFGS/value-only paths above, an EFS error does NOT
                    // dead-end the run: it surfaces as a recoverable objective-eval
                    // error at the fixed-point bridge (outer_strategy.rs:2409-2410
                    // `into_objective_error` -> `ObjectiveEvalError::recoverable`),
                    // so the EFS seed is rejected / the FixedPoint run returns Err,
                    // and `run_outer`'s fallback cascade (outer_strategy.rs:5297) routes
                    // to the fixed-point-disabled analytic-gradient BFGS attempt. That
                    // attempt is always present here because custom-family declares an
                    // analytic outer gradient (custom_family.rs:11826), so
                    // `automatic_fallback_attempts` (outer_strategy.rs:1502) adds it.
                    // BFGS then evaluates via `eval_outer` / the value-only cost
                    // closure, both of which now retreat-when-armed, so the run reaches
                    // `Ok(converged == false)` and the post-run sampling rung. No
                    // analogous infeasible sentinel is needed at this site.
                    Err(EstimationError::RemlOptimizationFailed(
                        "custom-family EFS inner solve did not converge".to_string(),
                    ))
                }
                Err(e) => {
                    // Genuine eval-error (internal computation failure) — NOT
                    // data-driven. Hard Err so a real bug surfaces.
                    outer.last_error = Some(e.clone());
                    Err(EstimationError::RemlOptimizationFailed(e))
                }
            }
        }),
        |outer: &mut CustomOuterState, rho: &Array1<f64>| {
            let warm_ref = screened_outer_warm_start(outer.warm_cache.as_ref(), rho);
            match custom_family_seed_screening_proxy_labeled(
                family,
                specs,
                &outer_options,
                &label_layout,
                rho,
                warm_ref,
                &rho_prior,
            ) {
                Ok((score, warm_start, _inner_converged)) if score.is_finite() => {
                    outer.warm_cache = Some(warm_start);
                    outer.last_error = None;
                    Ok(score)
                }
                Ok((score, warm_start, _inner_converged)) => {
                    outer.warm_cache = Some(warm_start);
                    outer.last_error = Some(format!(
                        "custom-family seed-screening proxy produced non-finite score {score}"
                    ));
                    Err(EstimationError::RemlOptimizationFailed(
                        "custom-family seed-screening proxy produced non-finite score".to_string(),
                    ))
                }
                Err(e) => {
                    outer.last_error = Some(e.clone());
                    Err(EstimationError::RemlOptimizationFailed(e))
                }
            }
        },
    )
    .with_seed_inner_state(|outer: &mut CustomOuterState, beta: &Array1<f64>| {
        outer.seed_cached_beta(n_rho, specs, beta)
    });

    let outer_result = problem.run(&mut obj, "custom family");

    let last_error_detail = obj
        .state
        .last_error
        .as_ref()
        .map(|e| {
            format!(
                " last objective error: {}",
                normalize_outer_eval_error_detail(e)
            )
        })
        .unwrap_or_default();

    // Startup-validation escalation net (gam#860). When the outer optimizer
    // returns `Err` because no candidate seed passed startup validation, the
    // raise is a POST-AUDIT NUMERICAL pathology, not an ill-posed input: by the
    // time we reach the outer solve the structural audits have already passed
    // (the #531-class identifiability audit, the #789B zero-events guard, and
    // the #859 cross-fit alignment all raise BEFORE the solver). So an
    // all-seeds-rejected / non-finite-cost failure HERE is a solver numerical
    // defect (e.g. the #787 kappa-driven penalty-topology dim-mismatch) on a
    // structurally-well-posed design — exactly the regime the never-fail
    // posterior-sampling rung exists for. Route it into the SAME AUTO-ESCALATE
    // the non-convergence path below uses, seeding the sampler at the initial ρ
    // (`rho0`, the bootstrap seed), instead of hard-raising. The carve-out is
    // strict: this only catches the post-audit startup-validation failure, never
    // the structural guards above (they keep raising with their own messages),
    // and the degraded refit below STILL raises if even `rho0` produces a
    // non-finite mode (sampling about NaN would manufacture meaningless
    // infinite-width intervals that masquerade as a fit — see the finite-mode
    // check after the refit). The result carries the existing escalation's
    // degraded / sampled-not-certified flagging so confidence is honest.
    let (rho_star, outer_grad_norm, outer_iters, nonconvergence_escalation, outer_certificate) =
        match outer_result {
            Ok(outer_result) => {
                // Geometry-driven terminal escalation. When the outer smoothing
                // optimizer cannot certify convergence, the objective is always
                // *proper* (Jeffreys/PC term unconditionally armed), so a
                // non-convergence here is a geometry signal (indefinite / non-smooth
                // LAML landscape that stalled Strong-Wolfe) — not a reason to fail.
                // Instead we AUTO-ESCALATE to sampling the proper posterior about the
                // best mode the inner solve reached (the never-fail bottom rung; see
                // `hmc::sample_gaussian_mode_posterior`). The fast Arc/EFS path is
                // untouched: this branch is only reached after the optimizer reports
                // non-convergence, so nice landscapes never pay any sampling cost.
                let nonconvergence_escalation = !outer_result.converged;
                if nonconvergence_escalation {
                    log::info!(
                        "[robust] outer smoothing did not certify convergence (plan={} iters={} |g|={}); \
                     AUTO-ESCALATE to never-fail posterior sampling about the best mode",
                        outer_result.plan_used,
                        outer_result.iterations,
                        outer_result.final_grad_norm_report(),
                    );
                }
                (
                    outer_result.rho,
                    outer_result.final_grad_norm,
                    outer_result.iterations,
                    nonconvergence_escalation,
                    outer_result.criterion_certificate,
                )
            }
            Err(e) if outer_startup_failure_is_escalatable(&e) => {
                log::warn!(
                    "[robust] outer smoothing raised at startup validation on a structurally-audited \
                 design (post-audit numerical pathology, gam#860): {e}.{last_error_detail} \
                 AUTO-ESCALATE to never-fail posterior sampling about the initial ρ seed; the \
                 degraded refit below still raises if even the seed produces a non-finite mode.",
                );
                (rho0.clone(), None, 0, true, None)
            }
            Err(e) => {
                return Err(format!(
                "outer smoothing optimization failed after exhausting strategy fallbacks: {e}.{last_error_detail}"
            )
            .into());
            }
        };
    screening_cap.store(0, Ordering::Relaxed);

    let per_block = split_labeled_log_lambdas(&rho_star, &label_layout)?;
    let final_seed = obj.state.warm_cache.clone();
    let mut final_options = options.clone();
    final_options.outer_inner_max_iterations = None;
    let mut inner = inner_blockwise_fit(
        family,
        specs,
        &per_block,
        &final_options,
        final_seed.as_ref(),
    )
    .map_err(|e| {
        format!(
            "outer smoothing optimization failed during final inner refit: \
                     {e}.{last_error_detail}"
        )
    })?;
    if !inner.converged && !nonconvergence_escalation {
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family final inner refit",
            reason: format!(
                "outer smoothing optimization final inner refit did not converge after {} cycles.{}",
                inner.cycles, last_error_detail
            ),
        });
    }
    if !inner.converged && nonconvergence_escalation {
        // The mode the inner solve reached is still the seed for the proper
        // posterior; a marginal inner non-convergence only widens the sampled
        // intervals (honest, not wrong). Proceed to assemble + sample.
        log::info!(
            "[robust] final inner refit did not fully converge ({} cycles) under escalation; \
             sampling the proper posterior about the reached mode",
            inner.cycles,
        );
    }
    // Finite-mode carve-out for the escalation net (gam#860). The never-fail
    // rung samples a Gaussian posterior ABOUT the reached mode; that is honest
    // only when the mode is finite (a non-converged-but-finite mode just widens
    // the sampled intervals). If the refit produced a NON-FINITE β — e.g. the
    // degraded startup-validation fallback (`rho0`) still lands on garbage —
    // sampling about NaN would manufacture meaningless infinite-width intervals
    // that masquerade as a fit, so KEEP the hard raise with a clear message
    // rather than escalate. (On the certified path β is finite by construction,
    // so this guard only ever fires on a genuinely broken escalation seed.)
    if nonconvergence_escalation
        && inner
            .block_states
            .iter()
            .any(|state| state.beta.iter().any(|value| !value.is_finite()))
    {
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family escalation finite-mode check",
            reason: format!(
                "outer smoothing escalation cannot sample a posterior: the refit mode is \
                 non-finite (β contains NaN/inf), so there is no valid mode to sample about; \
                 this is an ill-posed problem, not a recoverable numerical non-convergence.{}",
                last_error_detail
            ),
        });
    }
    let final_warm_start = constrained_warm_start_from_inner(&rho_star, &inner);
    store_persistent_custom_family_warm_start(
        persistent_warm_start_key.as_deref(),
        specs,
        &final_warm_start,
    );
    refresh_all_block_etas(family, specs, &mut inner.block_states).map_err(|e| {
        format!(
            "outer smoothing optimization failed during final eta refresh: \
             {e}.{last_error_detail}"
        )
    })?;
    let mut covariance_conditional =
        compute_joint_covariance_required(family, specs, &inner.block_states, &per_block, options)?;

    let geometry = compute_joint_geometry(family, specs, &inner.block_states, &per_block).map_err(
        |reason| CustomFamilyError::Optimization {
            context: "fit_custom_family joint geometry",
            reason,
        },
    )?;
    let penalized_objective = inner_penalized_objective(
        &inner,
        include_exact_newton_logdet_h(family, options),
        include_exact_newton_logdet_s(family, options),
        "custom-family fit final outer refit",
    )
    .map_err(|reason| CustomFamilyError::Optimization {
        context: "fit_custom_family penalized objective",
        reason,
    })?;
    // Never-fail terminal rung. Under escalation, sample the proper posterior
    // `N(β̂, H⁻¹)` whose precision `H` is the SAME penalized (Jeffreys-augmented)
    // joint Hessian the inner solve produced at the reached mode `β̂`, and report
    // its honest covariance in place of the optimizer-conditional one. Both `H`
    // and `β̂` are in the reduced (canonical) coordinate space here; the joint
    // lift below (`lift_fit_geometry_to_raw`) carries the sampled covariance back
    // to raw space exactly like the conditional covariance it replaces.
    //
    // Sampling a multivariate normal cannot dead-end: `sample_gaussian_mode_posterior`
    // jitters and Cholesky-factors `H`, so a marginally indefinite boundary
    // Hessian only widens the intervals. If that structural factorization is
    // genuinely impossible (e.g. a non-PSD precision after symmetrization) the
    // sampler returns `Err`; rather than re-introducing the dead-end we then keep
    // the optimizer-conditional covariance (a finite point with its existing SEs)
    // and still return a fit — never an `Err` for non-convergence.
    if nonconvergence_escalation {
        if let Some(geom) = geometry.as_ref() {
            let joint_mode: Array1<f64> = {
                let mut mode = Vec::new();
                for state in &inner.block_states {
                    mode.extend(state.beta.iter().copied());
                }
                Array1::from(mode)
            };
            let precision = geom.penalized_hessian.as_array();
            if joint_mode.len() == precision.nrows()
                && precision.nrows() == precision.ncols()
                && joint_mode.iter().all(|v| v.is_finite())
            {
                let sampling_config =
                    crate::inference::hmc::NutsConfig::for_dimension(joint_mode.len());
                match crate::inference::hmc::sample_gaussian_mode_posterior(
                    joint_mode.view(),
                    precision.view(),
                    &sampling_config,
                ) {
                    Ok(posterior) => {
                        let dim = joint_mode.len();
                        let n = posterior.samples.nrows();
                        if n > 1 {
                            // Sample posterior covariance about the posterior mean
                            // (honest intervals; not the Laplace inverse-Hessian).
                            let mean = &posterior.posterior_mean;
                            let mut cov = Array2::<f64>::zeros((dim, dim));
                            for row in posterior.samples.rows() {
                                let centered = &row.to_owned() - mean;
                                for a in 0..dim {
                                    for b in 0..dim {
                                        cov[[a, b]] += centered[a] * centered[b];
                                    }
                                }
                            }
                            cov.mapv_inplace(|v| v / (n as f64 - 1.0));
                            // DIAGNOSTIC GUARD (no false-confident intervals).
                            // A successful sampler return does NOT imply the chain
                            // mixed, so without checking its mixing diagnostics a
                            // divergent (R̂ ≫ 1) / near-zero-ESS draw would be
                            // reported as an "honest" covariance.
                            // That is especially dangerous here: the seed `H` is
                            // the Jeffreys-AUGMENTED precision evaluated at β̂, which
                            // may be NON-converged on a flat (unidentified) joint
                            // direction — so a poorly-mixed chain can report a
                            // FINITE, NARROW interval around an arbitrary point on
                            // that flat direction (the prior's interval), masquer-
                            // ading as data-driven. We therefore only accept the
                            // sampled covariance as honest when the chain actually
                            // mixed; otherwise we INFLATE it to reflect the non-
                            // convergence and flag it low-confidence rather than
                            // silently reporting a Jeffreys-narrowed interval.
                            //
                            // R̂ ≤ 1.05 is the standard "mixed" gate (stricter than
                            // the 1.1 used for a coarse converged/not flag, because
                            // this covariance is reported as honest uncertainty).
                            // The ESS floor scales with dimension (≥ 10 effective
                            // draws per parameter, absolute floor 50) so a chain
                            // that produced essentially no independent information
                            // about the posterior is caught independent of model
                            // size.
                            const RHAT_MIXED_MAX: f64 = 1.05;
                            let ess_floor = (10.0 * dim as f64).max(50.0);
                            let rhat = posterior.rhat;
                            let ess = posterior.ess;
                            let diagnostics_ok = rhat.is_finite()
                                && ess.is_finite()
                                && rhat <= RHAT_MIXED_MAX
                                && ess >= ess_floor;
                            if diagnostics_ok {
                                log::info!(
                                    "[robust] never-fail posterior sampling mixed: dim={dim} \
                                     draws={n} rhat={rhat:.3} ess={ess:.0}; reporting sampled \
                                     covariance as honest intervals",
                                );
                                covariance_conditional = Some(cov);
                            } else {
                                // Non-converged: do NOT report the narrow sampled
                                // covariance as data-driven. Inflate it so the
                                // reported uncertainty reflects the failure to
                                // resolve the posterior — widen by the R̂ excess (a
                                // divergent chain widens hard) and an ESS-deficit
                                // factor (too few independent draws ⇒ the sample
                                // covariance is itself unreliable / too narrow). The
                                // result is a clearly-flagged LOW-CONFIDENCE summary,
                                // never an artificially tight interval, and we still
                                // return a fit (the never-fail guarantee stands).
                                let rhat_factor = if rhat.is_finite() {
                                    rhat.max(1.0)
                                } else {
                                    // R̂ unestimable (too few chains/samples) ⇒
                                    // treat as maximally unresolved.
                                    RHAT_MIXED_MAX
                                };
                                let ess_factor = if ess.is_finite() && ess > 0.0 {
                                    (ess_floor / ess).sqrt().max(1.0)
                                } else {
                                    ess_floor.sqrt()
                                };
                                let inflation = (rhat_factor * rhat_factor) * ess_factor;
                                cov.mapv_inplace(|v| v * inflation);
                                log::warn!(
                                    "[robust] never-fail posterior sampling DID NOT MIX: dim={dim} \
                                     draws={n} rhat={rhat:.3} (>{RHAT_MIXED_MAX}) ess={ess:.0} \
                                     (<{ess_floor:.0}); reporting LOW-CONFIDENCE inflated covariance \
                                     (x{inflation:.2}) instead of a possibly false-confident \
                                     Jeffreys-narrowed interval (intervals are prior-dominated on \
                                     any unidentified joint direction, NOT data-driven)",
                                );
                                covariance_conditional = Some(cov);
                            }
                        }
                    }
                    Err(reason) => {
                        log::warn!(
                            "[robust] never-fail posterior sampling could not factor the precision \
                             ({reason}); retaining optimizer-conditional covariance (still no dead-end)",
                        );
                    }
                }
            }
        }
    }
    let rho_star_physical = expand_labeled_log_lambdas(&rho_star, &label_layout)?;
    let outer_converged = !nonconvergence_escalation;
    assemble_custom_family_fit_result(
        inner,
        BlockwiseFitAssembly {
            rho_physical: rho_star_physical,
            covariance_conditional,
            geometry,
            canonical: Some(&canonical),
            result_specs: raw_specs,
            penalized_objective,
            outer_iterations: outer_iters,
            outer_gradient_norm: outer_grad_norm,
            criterion_certificate: outer_certificate,
            outer_converged,
            context: "fit_custom_family result assembly",
        },
    )
}

/// Fits a custom family at the smoothing parameters already stored in `specs`,
/// without running any outer optimisation over the log-lambdas.
///
/// The log-lambdas are flattened out of `specs`, split back per block, and a
/// single inner blockwise solve is run (seeded from `warm_start.inner` when a
/// warm start is supplied). After the block etas are refreshed, the joint
/// covariance, the joint geometry and the penalized objective are computed and
/// handed to the shared fit-result assembly with `canonical: None` and no
/// criterion certificate.
///
/// `outer_iterations`, `outer_gradient_norm` and `outer_converged` are not
/// computed here; they are forwarded unchanged into the assembled result.
///
/// # Errors
///
/// Returns `CustomFamilyError::Optimization` when the inner solve reports
/// non-convergence, or when the joint geometry or the penalized objective
/// cannot be computed. Failures from spec validation, log-lambda splitting,
/// the inner solve, the eta refresh, the covariance computation and the final
/// assembly are propagated with `?`.
pub(crate) fn fit_custom_family_fixed_log_lambdas<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    warm_start: Option<&CustomFamilyWarmStart>,
    outer_iterations: usize,
    outer_gradient_norm: Option<f64>,
    outer_converged: bool,
) -> Result<crate::solver::estimate::UnifiedFitResult, CustomFamilyError> {
    // Recover the fixed smoothing parameters from the specs themselves.
    let counts_per_block = validate_blockspecs(specs)?;
    let fixed_rho = flatten_log_lambdas(specs);
    let rho_by_block = split_log_lambdas(&fixed_rho, &counts_per_block)?;

    let mut inner = inner_blockwise_fit(
        family,
        specs,
        &rho_by_block,
        options,
        warm_start.map(|warm| &warm.inner),
    )?;

    // A non-converged inner solve is a hard error on this path.
    if !inner.converged {
        let reason = format!(
            "fixed-log-lambda inner solve did not converge after {} cycles",
            inner.cycles
        );
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family_fixed_log_lambdas inner solve",
            reason,
        });
    }
    refresh_all_block_etas(family, specs, &mut inner.block_states)?;

    let covariance_conditional = compute_joint_covariance_required(
        family,
        specs,
        &inner.block_states,
        &rho_by_block,
        options,
    )?;

    let geometry = match compute_joint_geometry(family, specs, &inner.block_states, &rho_by_block)
    {
        Ok(geometry) => geometry,
        Err(reason) => {
            return Err(CustomFamilyError::Optimization {
                context: "fit_custom_family_fixed_log_lambdas joint geometry",
                reason,
            });
        }
    };

    let with_logdet_h = include_exact_newton_logdet_h(family, options);
    let with_logdet_s = include_exact_newton_logdet_s(family, options);
    let penalized_objective = match inner_penalized_objective(
        &inner,
        with_logdet_h,
        with_logdet_s,
        "custom-family fixed-log-lambda fit",
    ) {
        Ok(objective) => objective,
        Err(reason) => {
            return Err(CustomFamilyError::Optimization {
                context: "fit_custom_family_fixed_log_lambdas penalized objective",
                reason,
            });
        }
    };

    let assembly = BlockwiseFitAssembly {
        rho_physical: fixed_rho,
        covariance_conditional,
        geometry,
        canonical: None,
        result_specs: specs,
        penalized_objective,
        outer_iterations,
        outer_gradient_norm,
        criterion_certificate: None,
        outer_converged,
        context: "fit_custom_family_fixed_log_lambdas result assembly",
    };
    assemble_custom_family_fit_result(inner, assembly)
}

/// Runs one inner blockwise solve at the log-lambdas stored in `specs` and
/// returns the per-block coefficients for use as a warm start.
///
/// The returned tuple is `(block_beta, converged, cycles)`: a clone of every
/// block's `beta`, followed by the inner solve's `converged` flag and `cycles`
/// count. Non-convergence is reported through the flag, not as an error.
///
/// # Pre-fit identifiability gate
///
/// Before any solve, the specs are passed through the identifiability audit,
/// mirroring the gate of the outer fit. This lets warm-start callers (e.g. the
/// survival marginal-slope rigid pilot) fail in milliseconds on a
/// rank-deficient joint design instead of spending minutes inside a singular
/// penalised Newton inner system.
///
/// `canonicalize_for_identifiability` is deliberately NOT called here.
/// Blockwise families capture their per-block designs at construction time
/// (e.g. `SurvivalMarginalSlopeFamily` holds `self.marginal_design` and
/// `self.logslope_design` at raw width) and their `evaluate*` paths assert on
/// those raw widths while assembling per-row Hessian contributions. Handing
/// such a family a column-reduced spec would trigger a runtime shape mismatch
/// in its `syr_row_into` / `row_outer_into` calls, replacing the audit's
/// diagnostic with a panic later in the pipeline.
///
/// The principled construction-time orthogonalisation lives in
/// `crate::families::identifiability_compiler` (and the per-family
/// `*_identifiability.rs` modules). Once Phase 4b threads those compiled
/// operators through the family construction sites, the raw joint design will
/// already be rank-clean on entry and this gate becomes a defensive check.
///
/// # Errors
///
/// * `CustomFamilyError::DimensionMismatch` when the audit itself fails.
/// * `CustomFamilyError::Optimization` when the audit is fatal, or when the
///   solve produced a non-finite coefficient.
/// * Any error propagated from spec validation, log-lambda splitting or the
///   inner solve.
pub(crate) fn fit_custom_family_fixed_log_lambda_warm_start<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Result<(Vec<Array1<f64>>, bool, usize), CustomFamilyError> {
    let audit = match crate::solver::identifiability_audit::audit_identifiability(specs) {
        Ok(audit) => audit,
        Err(reason) => {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "fit_custom_family_fixed_log_lambda_warm_start identifiability audit failed: {reason}"
                ),
            });
        }
    };
    if audit.fatal {
        let reason = format!(
            "fatal pre-fit identifiability audit: {summary}",
            summary = audit.summary
        );
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family_fixed_log_lambda_warm_start identifiability audit",
            reason,
        });
    }

    // Solve once at the fixed smoothing parameters, with no inner warm start.
    let counts_per_block = validate_blockspecs(specs)?;
    let fixed_rho = flatten_log_lambdas(specs);
    let rho_by_block = split_log_lambdas(&fixed_rho, &counts_per_block)?;
    let inner = inner_blockwise_fit(family, specs, &rho_by_block, options, None)?;

    let mut block_beta: Vec<Array1<f64>> = Vec::with_capacity(inner.block_states.len());
    block_beta.extend(inner.block_states.iter().map(|state| state.beta.clone()));

    // Reject the warm start if any coefficient in any block is NaN or infinite.
    let has_non_finite = block_beta
        .iter()
        .any(|beta| beta.iter().any(|value| !value.is_finite()));
    if has_non_finite {
        return Err(CustomFamilyError::Optimization {
            context: "fit_custom_family_fixed_log_lambda_warm_start",
            reason: "fixed-log-lambda warm start produced non-finite coefficients".to_string(),
        });
    }

    Ok((block_beta, inner.converged, inner.cycles))
}

/// Test-only helpers that expose internal outer-objective machinery in a form
/// that is convenient to assert against.
#[cfg(test)]
mod test_support {
    use super::*;
    use ndarray::{Array1, Array2};

    /// Evaluates the outer objective through
    /// `outerobjectivegradienthessian_internal` with a flat rho prior and
    /// flattens the result into a tuple.
    ///
    /// Returns `(objective, gradient, dense_hessian, warm_start)`, where
    /// `dense_hessian` is whatever `materialize_dense` yields for the outer
    /// Hessian of the evaluation.
    ///
    /// # Errors
    ///
    /// Propagates the `String` error of the internal evaluation or of the
    /// dense materialisation.
    pub(crate) fn outerobjectivegradienthessian<F: CustomFamily + Clone + Send + Sync + 'static>(
        family: &F,
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
        penalty_counts: &[usize],
        rho: &Array1<f64>,
        warm_start: Option<&ConstrainedWarmStart>,
        eval_mode: EvalMode,
    ) -> Result<(f64, Array1<f64>, Option<Array2<f64>>, ConstrainedWarmStart), String> {
        let evaluation = super::outerobjectivegradienthessian_internal(
            family,
            specs,
            options,
            penalty_counts,
            rho,
            warm_start,
            crate::types::RhoPrior::Flat,
            eval_mode,
        )?;
        let dense_hessian = evaluation.outer_hessian.materialize_dense()?;
        Ok((
            evaluation.objective,
            evaluation.gradient,
            dense_hessian,
            evaluation.warm_start,
        ))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Test-only family used to exercise the batched outer-Hessian hooks.
    ///
    /// Its `CustomFamily` implementation in this module reports that an outer
    /// hyper-Hessian HVP is available and hands out an operator built from
    /// `matrix`.
    #[derive(Clone)]
    struct BatchedOuterHessianTestFamily {
        /// Dense matrix cloned into the `TestOuterHessianOperator` returned by
        /// `outer_hyper_hessian_operator`.
        matrix: Array2<f64>,
    }

    /// Test-only outer-Hessian operator backed by an explicit dense matrix.
    struct TestOuterHessianOperator {
        /// The matrix applied by `matvec`; its row count is reported as `dim`.
        matrix: Array2<f64>,
    }

    /// Dense reference implementation: every call is answered directly from
    /// the stored matrix.
    impl crate::solver::outer_strategy::OuterHessianOperator for TestOuterHessianOperator {
        /// Number of rows of the stored matrix.
        fn dim(&self) -> usize {
            let (rows, _) = self.matrix.dim();
            rows
        }

        /// Plain matrix-vector product with the stored matrix; never fails.
        fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
            let product = self.matrix.dot(v);
            Ok(product)
        }

        /// Always `true`: the operator is already an explicit dense matrix.
        fn is_cheap_to_materialize(&self) -> bool {
            true
        }
    }

    /// Minimal `CustomFamily`: a zero log-likelihood with no working sets, plus
    /// an outer hyper-Hessian operator backed by the family's stored matrix.
    ///
    /// NOTE(review): the `len() <= isize::MAX` assertions cannot fail for a
    /// slice; they look like a way to read an otherwise unused parameter.
    /// They are kept as-is.
    impl CustomFamily for BatchedOuterHessianTestFamily {
        /// Returns a constant evaluation (log-likelihood `0.0`, no block
        /// working sets) regardless of the block states.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let evaluation = FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: Vec::new(),
            };
            Ok(evaluation)
        }

        /// Always advertises an available outer hyper-Hessian HVP.
        fn outer_hyper_hessian_hvp_available(&self, block_specs: &[ParameterBlockSpec]) -> bool {
            assert!(block_specs.len() <= isize::MAX as usize);
            true
        }

        /// Wraps a clone of the stored matrix in a `TestOuterHessianOperator`.
        fn outer_hyper_hessian_operator(
            &self,
            block_specs: &[ParameterBlockSpec],
        ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
            assert!(block_specs.len() <= isize::MAX as usize);
            let operator: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator> =
                Arc::new(TestOuterHessianOperator {
                    matrix: self.matrix.clone(),
                });
            Some(operator)
        }
    }

    /// A block whose solver state uses the 6-row stacked design must still
    /// assemble against geometry that has the 2 rows of the canonical design.
    #[test]
    fn blockwise_fit_from_parts_accepts_stacked_solver_eta_with_canonical_geometry_rows() {
        // Canonical design: 2 rows. Stacked (solver) design: 6 rows.
        let block_spec = ParameterBlockSpec {
            name: "stacked".to_string(),
            design: DesignMatrix::from(Array2::ones((2, 1))),
            offset: Array1::zeros(2),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: Some(DesignMatrix::from(Array2::ones((6, 1)))),
            stacked_offset: Some(Array1::zeros(6)),
        };

        // Solver state: eta has the stacked length.
        let solver_state = ParameterBlockState {
            beta: array![0.25],
            eta: Array1::zeros(6),
        };

        // Geometry: working vectors have the canonical length.
        let canonical_geometry = FitGeometry {
            penalized_hessian: Array2::eye(1).into(),
            working_weights: Array1::ones(2),
            working_response: Array1::zeros(2),
        };

        let parts = BlockwiseFitResultParts {
            block_states: vec![solver_state],
            log_likelihood: -1.0,
            log_lambdas: Array1::zeros(0),
            lambdas: Array1::zeros(0),
            covariance_conditional: Some(Array2::eye(1)),
            stable_penalty_term: 0.0,
            penalized_objective: 1.0,
            outer_iterations: 0,
            outer_gradient_norm: Some(0.0),
            criterion_certificate: None,
            inner_cycles: 0,
            outer_converged: true,
            geometry: Some(canonical_geometry),
            precomputed_edf: Some((1.0, Vec::new(), vec![1.0])),
        };

        let fit = blockwise_fit_from_parts(parts, &[block_spec])
            .expect("stacked solver eta should assemble against canonical geometry rows");

        let eta_rows = fit.block_states[0].eta.len();
        let geometry_rows = fit.geometry.as_ref().unwrap().working_weights.len();
        assert_eq!(eta_rows, 6);
        assert_eq!(geometry_rows, 2);
    }

    /// The batched outer-Hessian hook must expose an operator whose action on
    /// the identity reproduces the family's matrix exactly.
    #[test]
    fn batched_outer_hessian_terms_materialize_to_exact_small_matrix() {
        let exact = array![[4.0, -1.0], [-1.0, 3.0]];
        let family = BatchedOuterHessianTestFamily {
            matrix: exact.clone(),
        };

        // rho.len() must equal sum(spec.penalties.len()); empty specs ⇒ empty rho.
        let empty_rho = Array1::<f64>::zeros(0);
        let terms = family
            .batched_outer_hessian_terms(&[], &[], &[], &empty_rho, None)
            .expect("batched Hessian hook succeeds")
            .expect("test family exposes batched HVP terms");

        let crate::solver::outer_strategy::HessianResult::Operator(operator) = terms.outer_hessian
        else {
            panic!("batched hook should expose an operator");
        };

        // Applying the operator to the 2×2 identity yields its dense form.
        let identity = Array2::<f64>::eye(2);
        let dense = operator
            .mul_mat(identity.view())
            .expect("operator materializes on small exact case");
        assert_eq!(dense, exact);
    }

    /// The batched operator path is chosen for Hessian evaluations and left
    /// alone for gradient-only evaluations.
    #[test]
    fn batched_outer_hessian_operator_selected_only_for_hessian_eval() {
        let family = BatchedOuterHessianTestFamily {
            matrix: array![[2.0, 0.5], [0.5, 5.0]],
        };
        let empty_rho = Array1::<f64>::zeros(0);

        // Same empty fixture for both probes; only the evaluation mode varies.
        let select_for = |mode: EvalMode| {
            custom_family_batched_outer_hessian_operator(
                &family,
                &[],
                &[],
                &[],
                &empty_rho,
                None,
                mode,
            )
        };

        let selected =
            select_for(EvalMode::ValueGradientHessian).expect("selection check succeeds");
        assert!(
            selected.is_some(),
            "supported Hessian/HVP families should select the batched operator path"
        );

        let not_selected =
            select_for(EvalMode::ValueAndGradient).expect("non-Hessian selection check succeeds");
        assert!(
            not_selected.is_none(),
            "batched Hessian terms must not run for gradient-only evaluations"
        );
    }

    /// The family-owned batched gradient may be used when `H_phi` is absent or
    /// identically zero, and must be rejected as soon as any entry is nonzero.
    #[test]
    fn batched_outer_gradient_override_rejected_when_jeffreys_curvature_is_active() {
        // No Jeffreys curvature at all.
        let allowed_without_hphi = batched_outer_gradient_contract_allows_override(None);
        assert!(
            allowed_without_hphi,
            "released objective without robust Jeffreys curvature may use a family-owned batched gradient"
        );

        // Curvature present but gated to zero.
        let gated_hphi = Array2::<f64>::zeros((2, 2));
        let allowed_with_zero_hphi =
            batched_outer_gradient_contract_allows_override(Some(&gated_hphi));
        assert!(
            allowed_with_zero_hphi,
            "a gated zero Jeffreys curvature leaves the batched gradient contract unchanged"
        );

        // A single tiny nonzero entry is enough to forbid the override.
        let live_hphi = array![[0.0, 0.0], [0.0, 1.0e-6]];
        let allowed_with_live_hphi =
            batched_outer_gradient_contract_allows_override(Some(&live_hphi));
        assert!(
            !allowed_with_live_hphi,
            "nonzero H_phi changes the logdet operator and needs the unified H_phi-aware gradient"
        );
    }

    use crate::families::gamlss::{BinomialLocationScaleFamily, BinomialLocationScaleWiggleFamily};
    use crate::matrix::DesignMatrix;
    use crate::test_support::binomial_location_scale_base_fixture;
    use approx::assert_relative_eq;
    use faer::sparse::{SparseColMat, Triplet};
    use ndarray::{Array1, Array2, array};

    /// Pins the structural probe behind the marker-free coupled-joint-Hessian
    /// gate (#727, #729). That gate trusts a family returning a genuinely
    /// coupled joint Hessian — nonzero off-diagonal blocks — without a hand-set
    /// `has_explicit_joint_hessian()`, and the probe drives every `_with_specs`
    /// dispatch. Three facts are asserted: a block-diagonal matrix (the trait
    /// default) is NOT coupling, a single nonzero off-block entry IS, and a
    /// shape disagreement is never reported as coupling.
    #[test]
    fn joint_hessian_coupling_probe_detects_off_diagonal_blocks() {
        // Two blocks of width 2 each → a 4×4 joint Hessian. Only `beta.len()`
        // is read, so the `eta` lengths are immaterial.
        let states: Vec<ParameterBlockState> = (0..2)
            .map(|_| ParameterBlockState {
                beta: Array1::zeros(2),
                eta: Array1::zeros(3),
            })
            .collect();

        // Case 1 — per-block curvature only, zero off-blocks: the shape the
        // trait default produces, which is not coupling.
        let block_diagonal = array![
            [1.0_f64, 0.5, 0.0, 0.0],
            [0.5, 1.0, 0.0, 0.0],
            [0.0, 0.0, 2.0, 0.3],
            [0.0, 0.0, 0.3, 2.0],
        ];
        let diagonal_is_coupled = joint_hessian_has_cross_block_coupling(&block_diagonal, &states);
        assert!(
            !diagonal_is_coupled,
            "block-diagonal joint Hessian must not be treated as coupled"
        );

        // Case 2 — one nonzero off-diagonal-block entry and its transpose.
        // The block-diagonal default can never produce this, so it must be
        // trusted as genuine cross-block curvature.
        let mut coupled = block_diagonal.clone();
        for index in [[0_usize, 2], [2, 0]] {
            coupled[index] = 1.0e-9;
        }
        let off_block_is_coupled = joint_hessian_has_cross_block_coupling(&coupled, &states);
        assert!(
            off_block_is_coupled,
            "a nonzero off-diagonal block must be detected as coupling"
        );

        // Case 3 — dimension disagrees with the total β width. A malformed
        // Hessian must yield `false` rather than a claim of coupling.
        let wrong_shape = Array2::<f64>::zeros((3, 3));
        let malformed_is_coupled = joint_hessian_has_cross_block_coupling(&wrong_shape, &states);
        assert!(
            !malformed_is_coupled,
            "shape disagreement must not be claimed as coupling"
        );
    }

    /// Solves one block's weighted penalised system for the coefficients.
    ///
    /// Forms `xtwy` from the design `x`, weights `w` and working response
    /// `y_star`, then delegates to `solve_systemwith_policy` with the penalty
    /// `s_lambda`, `ridge_floor` and `ridge_policy`.
    ///
    /// # Errors
    ///
    /// * A stringified `CustomFamilyError::DimensionMismatch` when `y_star` or
    ///   `w` does not have one entry per row of `x`.
    /// * The error of `compute_xtwy`, propagated unchanged.
    /// * A fixed message when the solve fails; the underlying solver error is
    ///   discarded.
    fn solve_blockweighted_system(
        x: &DesignMatrix,
        y_star: &Array1<f64>,
        w: &Array1<f64>,
        s_lambda: &Array2<f64>,
        ridge_floor: f64,
        ridge_policy: RidgePolicy,
    ) -> Result<Array1<f64>, String> {
        let rows = x.nrows();
        let lengths_match = y_star.len() == rows && w.len() == rows;
        if !lengths_match {
            let mismatch = CustomFamilyError::DimensionMismatch {
                reason: "weighted-system dimension mismatch".to_string(),
            };
            return Err(mismatch.into());
        }

        let weighted_rhs = x.compute_xtwy(w, y_star)?;
        match x.solve_systemwith_policy(w, &weighted_rhs, Some(s_lambda), ridge_floor, ridge_policy)
        {
            Ok(solution) => Ok(solution),
            Err(_) => Err("block solve failed after ridge retries".to_string()),
        }
    }

    /// The default inner cycle budget equals the crate constant and is
    /// strictly above the previous cap of 300.
    #[test]
    fn default_inner_cycle_budget_covers_large_scale_joint_newton_tail() {
        let default_budget = BlockwiseFitOptions::default().inner_max_cycles;

        assert_eq!(default_budget, DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES);
        assert!(
            default_budget > 300,
            "startup validation must not reject still-descending exact joint solves at the old cap"
        );
    }

    /// REML optimisation failures at startup are escalatable; structurally
    /// invalid input is not.
    #[test]
    fn startup_validation_failure_routes_to_never_fail_escalation() {
        use crate::estimate::EstimationError;

        // Every candidate seed was rejected by startup validation.
        let every_seed_rejected = EstimationError::RemlOptimizationFailed(
            "no candidate seeds passed outer startup validation (custom family):\n  generated=4"
                .to_string(),
        );
        let seeds_escalate = outer_startup_failure_is_escalatable(&every_seed_rejected);
        assert!(
            seeds_escalate,
            "post-audit all-seeds startup rejection must reach the never-fail escalation net"
        );

        // The objective evaluated to a non-finite cost.
        let non_finite_cost = EstimationError::RemlOptimizationFailed(
            "outer eval failed: objective returned a non-finite cost".to_string(),
        );
        let non_finite_escalates = outer_startup_failure_is_escalatable(&non_finite_cost);
        assert!(
            non_finite_escalates,
            "non-finite startup evals are the same post-audit numerical pathology"
        );

        // The input itself is invalid: this must stay a hard error.
        let invalid_input = EstimationError::InvalidInput(
            "zero-event survival marginal-slope input remains structurally invalid".to_string(),
        );
        let invalid_escalates = outer_startup_failure_is_escalatable(&invalid_input);
        assert!(
            !invalid_escalates,
            "structural input errors must not be converted into sampled fits"
        );
    }

    /// The kernel's analytic trace must equal a central finite difference of
    /// the log-determinant reported by `joint_penalty_subspace_trace_parts`.
    #[test]
    fn joint_penalty_subspace_trace_matches_projected_logdet_derivative() {
        let ranges = vec![(0, 3)];
        let penalties = vec![array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]]];
        let h = array![[4.0, 0.2, 7.0], [0.2, 9.0, -3.0], [7.0, -3.0, 30.0]];
        // `∂Sλ/∂ρ` is supported on range(Sλ) — here the leading 2×2 block, the
        // positive-eigenvalue subspace of `S`. Since #901 the kernel is the
        // full spectral `M⁺`, whose trace differentiates `log|H+Sλ|₊` exactly
        // for EVERY drift. A range(Sλ)-supported drift exercises the contract
        // the production `∂Sλ/∂ρ` relies on, and it is also where the old
        // range(Sλ)-block kernel and `M⁺` agree, so this pin stays stable
        // across the kernel generalization.
        let drift = array![[0.7, -0.4, 0.0], [-0.4, 1.3, 0.0], [0.0, 0.0, 0.0]];

        // One evaluation of (logdet, kernel) for a given dense Hessian; all
        // other arguments are shared by the three calls below.
        let trace_parts = |hessian: Array2<f64>| {
            joint_penalty_subspace_trace_parts(
                &JointHessianSource::Dense(hessian),
                &ranges,
                &penalties,
                3,
                0.0,
                None,
            )
        };

        let (logdet, kernel) = trace_parts(h.clone()).expect("projection parts build");
        let kernel = kernel.expect("rank-deficient penalty still has an identified subspace");
        // Kernel basis = kept eigenvectors of M = H + Sλ (full rank 3 here),
        // NOT the rank-2 range(Sλ) basis of the pre-#901 reduced kernel.
        assert_eq!(kernel.u_s.ncols(), 3);

        // logdet is the FULL identifiable-subspace `log|H + Sλ|₊`. Here H + Sλ
        // is full rank (3), so this is the ordinary log-det of
        //   M = [[5, 0.2, 7], [0.2, 11, -3], [7, -3, 30]],  det(M) = 1056.4.
        let m = array![[5.0, 0.2, 7.0], [0.2, 11.0, -3.0], [7.0, -3.0, 30.0]];
        let (m_evals, _) = m.eigh(faer::Side::Lower).expect("M eigendecomposition");
        let expected_logdet: f64 = m_evals.iter().map(|&v| v.ln()).sum();
        assert_relative_eq!(logdet, expected_logdet, epsilon = 1e-10);

        let analytic = kernel.trace_projected_logdet(&drift);

        // Central difference of the log-determinant along `drift`.
        let eps = 1.0e-6;
        let step = drift.mapv(|v| eps * v);
        let (logdet_plus, _) = trace_parts(&h + &step).expect("plus projection parts build");
        let (logdet_minus, _) = trace_parts(&h - &step).expect("minus projection parts build");
        let finite_difference = (logdet_plus - logdet_minus) / (2.0 * eps);

        assert_relative_eq!(
            analytic,
            finite_difference,
            epsilon = 1e-8,
            max_relative = 1e-8
        );
    }

    /// With a rank-deficient penalty and a nonsingular `H + Sλ`, the outer
    /// gradient must match the closed form built from the projected trace
    /// kernel, and must be the same whether or not the projection flag is set.
    #[test]
    fn joint_outer_gradient_uses_projected_trace_for_rank_deficient_penalty() {
        // Fixture: one 3-coefficient block, a rank-2 diagonal penalty and a
        // dense joint Hessian.
        let rho = array![0.0];
        let beta = array![1.0, -1.0, 3.0];
        let s_lambda = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]];
        let h = array![[4.0, 0.2, 7.0], [0.2, 9.0, -3.0], [7.0, -3.0, 30.0]];
        let ranges = vec![(0, 3)];

        let specs = vec![ParameterBlockSpec {
            name: "surface".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
                1, 3,
            )))),
            offset: Array1::zeros(1),
            penalties: vec![PenaltyMatrix::Dense(s_lambda.clone())],
            nullspace_dims: vec![1],
            initial_log_lambdas: rho.clone(),
            initial_beta: Some(beta.clone()),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }];

        // Hand-built inner result sitting at `beta`, flagged as converged.
        let inner = BlockwiseInnerResult {
            block_states: vec![ParameterBlockState {
                beta: beta.clone(),
                eta: Array1::zeros(1),
            }],
            active_sets: vec![None],
            log_likelihood: 0.0,
            penalty_value: 0.5 * beta.dot(&fast_av(&s_lambda, &beta)),
            cycles: 1,
            converged: true,
            block_logdet_h: 0.0,
            block_logdet_s: 0.0,
            s_lambdas: vec![s_lambda.clone()],
            joint_workspace: None,
            kkt_residual: None,
            active_constraints: None,
        };
        let per_block = vec![rho.clone()];
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: false,
            ..BlockwiseFitOptions::default()
        };

        // Derivative callbacks that never supply a drift.
        let no_dh =
            |_direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
        let no_d2h = |_u: &Array1<f64>,
                      _v: &Array1<f64>|
         -> Result<Option<DriftDerivResult>, String> { Ok(None) };

        // The two evaluations below differ in exactly one positional boolean.
        // NOTE(review): it is presumably the Hessian-logdet projection switch
        // (the test's variable names suggest so) — confirm against the
        // signature of `joint_outer_evaluate`.
        let evaluate = |projection_flag: bool| {
            joint_outer_evaluate(
                &inner,
                &specs,
                &per_block,
                &rho,
                &beta,
                JointHessianSource::Dense(h.clone()),
                &ranges,
                3,
                0.0,
                0.0,
                0.0,
                1.0,
                0.0,
                true,
                true,
                false,
                projection_flag,
                EvalMode::ValueAndGradient,
                &options,
                crate::types::RhoPrior::Flat,
                PseudoLogdetMode::Smooth,
                &no_dh,
                None,
                &no_d2h,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
            )
        };

        let projected = evaluate(true).expect("projected outer evaluation succeeds");
        let unprojected = evaluate(false).expect("unprojected outer evaluation succeeds");

        // Closed-form reference built from the projected trace kernel.
        let (_, kernel) = joint_penalty_subspace_trace_parts(
            &JointHessianSource::Dense(h.clone()),
            &ranges,
            std::slice::from_ref(&s_lambda),
            3,
            0.0,
            None,
        )
        .expect("projection kernel builds");
        let projected_trace = kernel
            .expect("rank-deficient penalty has positive subspace")
            .trace_projected_logdet(&s_lambda);
        let expected_gradient =
            0.5 * beta.dot(&fast_av(&s_lambda, &beta)) + 0.5 * projected_trace - 0.5 * 2.0;

        assert_relative_eq!(
            projected.gradient[0],
            expected_gradient,
            epsilon = 1e-12,
            max_relative = 1e-12
        );

        // Post gh#752/#901 contract: the trace kernel is the FULL spectral
        // pseudo-inverse `M⁺ = (H+Sλ)⁺` over range(H+Sλ). On a NONSINGULAR `M`
        // (this fixture) that is exactly `M⁻¹`, so the projected route and the
        // full-space operator route compute the same generalized determinant
        // and the same ρ-trace — the projection must be INVARIANT here.
        // The historical assertion that the two differ encoded the pre-#752
        // range(Sλ) reduction, which dropped the penalty-null likelihood
        // curvature and was itself the bug. The case where the routes truly
        // diverge — a singular `M` whose ker(H+Sλ) the pseudo-logdet must
        // drop — is asserted in
        // `joint_outer_gradient_projected_trace_drops_joint_null`.
        assert_relative_eq!(
            projected.gradient[0],
            unprojected.gradient[0],
            epsilon = 1e-8,
            max_relative = 1e-8
        );
    }

    /// Routing guard for `project_hessian_logdet` on a genuinely singular
    /// `M = H + Sλ`: ker(H) overlaps ker(Sλ), so the projected route has to
    /// discard the unidentified direction (pseudo-logdet plus the `M⁺` trace
    /// kernel over range(M)) and reproduce the exact closed-form gradient. A
    /// full-space `M⁻¹` route has no finite answer on this fixture, which is
    /// the check the nonsingular fixture above cannot provide (there the two
    /// routes coincide by design).
    #[test]
    fn joint_outer_gradient_projected_trace_drops_joint_null() {
        let block_ranges = vec![(0, 3)];
        let log_lambda = array![0.0];
        let coefficients = array![1.0, -1.0, 3.0];
        let penalty = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]];
        // ker(hessian) = span(e3) = ker(penalty), hence M = H + Sλ is singular
        // and e3 is the unidentified direction.
        let hessian = array![[4.0, 0.2, 0.0], [0.2, 9.0, 0.0], [0.0, 0.0, 0.0]];
        // βᵀ Sλ β: feeds both the inner penalty value and the expected gradient.
        let penalty_quadratic = coefficients.dot(&fast_av(&penalty, &coefficients));

        let specs = vec![ParameterBlockSpec {
            name: "surface".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
                1, 3,
            )))),
            offset: Array1::zeros(1),
            penalties: vec![PenaltyMatrix::Dense(penalty.clone())],
            nullspace_dims: vec![1],
            initial_log_lambdas: log_lambda.clone(),
            initial_beta: Some(coefficients.clone()),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }];
        let inner_result = BlockwiseInnerResult {
            block_states: vec![ParameterBlockState {
                beta: coefficients.clone(),
                eta: Array1::zeros(1),
            }],
            active_sets: vec![None],
            log_likelihood: 0.0,
            penalty_value: 0.5 * penalty_quadratic,
            cycles: 1,
            converged: true,
            block_logdet_h: 0.0,
            block_logdet_s: 0.0,
            s_lambdas: vec![penalty.clone()],
            joint_workspace: None,
            kkt_residual: None,
            active_constraints: None,
        };
        let rho_per_block = vec![log_lambda.clone()];
        let fit_options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: false,
            ..BlockwiseFitOptions::default()
        };
        // Both drift callbacks report "no contribution".
        let first_order_drift =
            |_dir: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
        let second_order_drift = |_lhs: &Array1<f64>,
                                  _rhs: &Array1<f64>|
         -> Result<Option<DriftDerivResult>, String> { Ok(None) };

        let evaluation = joint_outer_evaluate(
            &inner_result,
            &specs,
            &rho_per_block,
            &log_lambda,
            &coefficients,
            JointHessianSource::Dense(hessian.clone()),
            &block_ranges,
            3,
            0.0,
            0.0,
            0.0,
            1.0,
            0.0,
            true,
            true,
            false,
            true,
            EvalMode::ValueAndGradient,
            &fit_options,
            crate::types::RhoPrior::Flat,
            PseudoLogdetMode::Smooth,
            &first_order_drift,
            None,
            &second_order_drift,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .expect("projected outer evaluation succeeds on a singular joint Hessian");

        // Rebuild the projection kernel independently and take its trace
        // against the penalty to form the reference gradient.
        let (_, trace_kernel) = joint_penalty_subspace_trace_parts(
            &JointHessianSource::Dense(hessian.clone()),
            &block_ranges,
            std::slice::from_ref(&penalty),
            3,
            0.0,
            None,
        )
        .expect("projection kernel builds");
        let kernel_trace = trace_kernel
            .expect("rank-deficient joint Hessian has a positive subspace")
            .trace_projected_logdet(&penalty);
        // ½ βᵀSλβ + ½ tr(kernel · Sλ) − ½ · 2, the last term using the two
        // nonzero diagonal entries of the penalty.
        let reference_gradient = 0.5 * penalty_quadratic + 0.5 * kernel_trace - 0.5 * 2.0;

        assert!(
            evaluation.objective.is_finite(),
            "pseudo-logdet objective must stay finite when ker(H+Sλ) is dropped"
        );
        assert_relative_eq!(
            evaluation.gradient[0],
            reference_gradient,
            epsilon = 1e-10,
            max_relative = 1e-10
        );
    }

    // Experimental scan documenting that on THIS fixture's geometry the
    // joint_outer_evaluate path does not show divergence between
    // project_hessian_logdet=true and =false at large-scale ρ: the dominant
    // term ½ λ β'Sβ grows linearly in λ regardless of projection, and the trace
    // pair cancels in both routes here. The clustered-PC marginal-slope failure
    // (#808/#787) is a DIFFERENT geometry — a near-collinear penalty-null trend
    // whose likelihood determinant the range(Sλ)-only route drops. That route is
    // now disabled for all marginal-slope families: the project_hessian_logdet
    // flag at every joint_outer_evaluate/_efs call site reads
    // `use_projected_penalty_logdet()` (default true), so value and analytic
    // gradient share the range(H+Sλ) generalized determinant.
    //
    // The final assertion compares the SIGNED gradients at ρ = 10: two routes
    // that agree in magnitude but disagree in sign are not projection
    // invariant, and the test fails loudly if ρ = 10 is ever removed from the
    // scan instead of passing vacuously.
    #[test]
    fn large_scale_rho_scan_joint_outer_evaluate_is_projection_invariant() {
        // Same fixture shape as the rank-deficient projected-trace test,
        // but with H_unpen scaled to data-Hessian magnitude (n ~ 2e5).
        let ranges = vec![(0, 3)];
        let beta = array![1.0, -1.0, 3.0];
        let s_unit: Array2<f64> = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 0.0]];
        let n_scale = 2.0e5_f64;
        let h: Array2<f64> =
            array![[4.0, 0.2, 7.0], [0.2, 9.0, -3.0], [7.0, -3.0, 30.0]].mapv(|v| v * n_scale);

        let no_dh = |_d: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
        let no_d2h = |_u: &Array1<f64>,
                      _v: &Array1<f64>|
         -> Result<Option<DriftDerivResult>, String> { Ok(None) };

        // Loop-invariant: the fit options do not depend on ρ.
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: false,
            ..BlockwiseFitOptions::default()
        };

        eprintln!("\n=== large-scale rho-scan: unprojected vs projected outer gradient ===");
        eprintln!(
            "{:>5}  {:>10}  {:>16}  {:>16}  {:>10}",
            "rho", "lambda", "g_unprojected", "g_projected", "ratio"
        );

        // Signed (unprojected, projected) gradients captured at ρ = 10.
        let mut gradients_at_10: Option<(f64, f64)> = None;

        for &rho_val in &[0.0_f64, 2.0, 4.0, 6.0, 8.0, 10.0] {
            let lam = rho_val.exp();
            let rho = array![rho_val];
            let s_lambda = s_unit.mapv(|v| v * lam);

            let spec = ParameterBlockSpec {
                name: "surface".to_string(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros(
                    (1, 3),
                ))),
                offset: Array1::zeros(1),
                penalties: vec![PenaltyMatrix::Dense(s_unit.clone())],
                nullspace_dims: vec![1],
                initial_log_lambdas: rho.clone(),
                initial_beta: Some(beta.clone()),
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            };
            let specs = vec![spec];
            let inner = BlockwiseInnerResult {
                block_states: vec![ParameterBlockState {
                    beta: beta.clone(),
                    eta: Array1::zeros(1),
                }],
                active_sets: vec![None],
                log_likelihood: 0.0,
                penalty_value: 0.5 * lam * beta.dot(&fast_av(&s_unit, &beta)),
                cycles: 1,
                converged: true,
                block_logdet_h: 0.0,
                block_logdet_s: 0.0,
                s_lambdas: vec![s_lambda.clone()],
                joint_workspace: None,
                kkt_residual: None,
                active_constraints: None,
            };
            let per_block = vec![rho.clone()];

            // One evaluation of the outer objective; the only argument that
            // differs between the two routes is the project_hessian_logdet
            // flag (the fourth boolean).
            let evaluate = |project_hessian_logdet: bool| {
                joint_outer_evaluate(
                    &inner,
                    &specs,
                    &per_block,
                    &rho,
                    &beta,
                    JointHessianSource::Dense(h.clone()),
                    &ranges,
                    3,
                    0.0,
                    0.0,
                    0.0,
                    1.0,
                    0.0,
                    true,
                    true,
                    false,
                    project_hessian_logdet,
                    EvalMode::ValueAndGradient,
                    &options,
                    crate::types::RhoPrior::Flat,
                    PseudoLogdetMode::Smooth,
                    &no_dh,
                    None,
                    &no_d2h,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                )
            };

            // project_hessian_logdet = true (current main behavior)
            let projected = evaluate(true).expect("projected eval ok");
            // project_hessian_logdet = false (the 0.1.92 / pre-fix behavior)
            let unprojected = evaluate(false).expect("unprojected eval ok");

            let g_un = unprojected.gradient[0];
            let g_pr = projected.gradient[0];
            eprintln!(
                "{:>5.1}  {:>10.3e}  {:>16.6e}  {:>16.6e}  {:>10.3e}",
                rho_val,
                lam,
                g_un,
                g_pr,
                g_un.abs() / (g_pr.abs() + 1e-30)
            );
            if rho_val == 10.0 {
                gradients_at_10 = Some((g_un, g_pr));
            }
        }

        // Finding: at this fixture geometry the two routes agree to
        // ~1e-6 relative precision at every ρ in [0, 10].  Both grow
        // linearly in λ (≈ ½ λ β'Sβ + bounded trace contribution).
        // The optimizer-visible blow-up in large-scale therefore cannot be
        // a missing projection in joint_outer_evaluate — it must live
        // in the survival-marginal-slope custom gradient path.
        let (g_un_at_10, g_pr_at_10) =
            gradients_at_10.expect("rho scan must include the rho = 10 bound point");
        // Signed difference: a sign flip between the routes must register as
        // a (large) relative difference, not cancel out.
        let rel_diff = (g_un_at_10 - g_pr_at_10).abs() / g_pr_at_10.abs().max(1e-30);
        assert!(
            rel_diff < 1e-4,
            "projection should be near-invariant on this fixture at rho=10; \
             got g_un={:.6e}, g_pr={:.6e}, rel_diff={:.3e}",
            g_un_at_10,
            g_pr_at_10,
            rel_diff
        );
    }

    // ── Large-scale reproducer for the marginal-slope ρ-saturation
    // failure ────────────────────────────────────────────────────────────
    //
    // Failure being investigated:
    //   outer iter=60, |g|=4.18e13, three of four ρ-coords pinned at the
    //   box bound ±10 (`with_rho_bound(10.0)`). The dominant explicit term
    //   ½λβ'Sβ at large scale (n≈2e5, p≈60, β'Sβ~10⁴, λ=exp(10)≈22k) is
    //   only ~10⁸ — observed gradient is ~10¹³, FIVE orders of magnitude
    //   beyond what the projected-trace kernel cancellation predicts.
    //
    // The existing `large_scale_rho_scan_joint_outer_evaluate_is_projection_invariant`
    // test uses single-block, p=3, nullspace_dims=1, and supplies
    // `compute_dh = Ok(None)`, so it cannot exercise the multi-block /
    // rank-deficient-S handling. The large-scale fit has:
    //   - 3 blocks (time_surface, marginal_surface, logslope_surface)
    //   - 4 penalty coords (time:1, marginal:2 [anisotropic], logslope:1)
    //   - Duchon-shape penalties: large nullspace_dims (d+1=4 for d=3 PCs)
    //     producing rank-deficient S with many zero eigenvalues
    //   - n ~ 2e5 → H_unpen scale ~ n × diag-of-design-Gram
    //   - Realistic `compute_dh(d)` returning the per-coord penalty drift
    //     ∂H/∂ρ_k = λ_k S_k (chained through the direction d)
    //
    // NOTE(review): an earlier revision of this comment said the `Ok(None)`
    // path SKIPS the trace pair entirely, while the contract note inside the
    // test says the penalty drift is still added by the evaluator from
    // `inner.s_lambdas` — confirm which holds.
    //
    // This test reproduces the SHAPE: builds large-scale-dimensioned blocks
    // with rank-deficient Duchon-shape penalties, scales H to large-scale
    // magnitude, takes the idealized β-independent-H path
    // (`compute_dh = Ok(None)`, see the contract note in the body), evaluates
    // `joint_outer_evaluate` at the actual failure ρ point
    // [time=10, marg=10, marg=10, logslope=4.5], and asserts every gradient
    // entry is BOUNDED by a physically reasonable multiple of the dominant
    // ½λβ'Sβ term.
    //
    // If this test passes with reasonable bounds: the bug is NOT in
    //   joint_outer_evaluate itself — it must live in the marginal-slope-
    //   specific drift derivatives (`evaluate_exact_newton_joint_gradient_*`
    //   in survival_marginal_slope.rs) that feed the closure.
    // If this test fails: joint_outer_evaluate has a numerical defect that
    //   surfaces at large scale + Duchon-shape S. We then bisect inside the
    //   evaluator.
    //
    #[test]
    fn large_scale_multiblock_outer_gradient_with_realistic_drift_is_bounded() {
        // LargeScale-realistic dimensions for binary-outcome marginal-slope.
        // Duchon(PC1,PC2,PC3, centers=10, order=1) → p_basis = centers +
        // null_basis(d+1=4) = 14 columns per spatial block, nullspace dim=4.
        // The actual fit has time_surface with a different basis (B-spline
        // along entry/exit age) — we approximate with p_time=10, null=2.
        let p_time = 10usize;
        let p_marg = 14usize;
        let p_logs = 14usize;
        let p_total = p_time + p_marg + p_logs;

        // Block ranges in the joint coefficient vector.
        let ranges = vec![
            (0, p_time),
            (p_time, p_time + p_marg),
            (p_time + p_marg, p_total),
        ];

        // ── Build rank-deficient Duchon-shape penalty matrices.
        // S = U diag(σ) Uᵀ where σ has `nullspace_dims` trailing zeros.
        // U is a deterministic discrete-cosine basis, so the eigenstructure
        // is reproducible without any random input.
        fn build_duchon_shape(p: usize, nullspace: usize, signal_scale: f64) -> Array2<f64> {
            // Diagonal eigenvalue spectrum, geometric decay across the
            // signal subspace then zeros on the nullspace.
            let rank = p - nullspace;
            let mut eigvals = vec![0.0_f64; p];
            for i in 0..rank {
                // 1.0, 0.5, 0.25, ... — physical Duchon penalty spectrum
                // has spectrum decaying like 1/k for high-frequency modes;
                // geometric decay is a faithful caricature.
                eigvals[i] = signal_scale * 0.5_f64.powi(i as i32);
            }
            // Use a deterministic orthogonal basis: discrete cosine basis.
            // U[i,j] = sqrt(2/p) cos(π (i+0.5) j / p) for j>0; U[i,0]=1/√p.
            let mut u = Array2::<f64>::zeros((p, p));
            for i in 0..p {
                u[[i, 0]] = 1.0 / (p as f64).sqrt();
                for j in 1..p {
                    u[[i, j]] = (2.0 / p as f64).sqrt()
                        * (std::f64::consts::PI * (i as f64 + 0.5) * j as f64 / p as f64).cos();
                }
            }
            // S = U diag(eigvals) Uᵀ, accumulated one rank-1 term at a time.
            let mut s = Array2::<f64>::zeros((p, p));
            for k in 0..p {
                if eigvals[k] == 0.0 {
                    continue;
                }
                for i in 0..p {
                    for j in 0..p {
                        s[[i, j]] += eigvals[k] * u[[i, k]] * u[[j, k]];
                    }
                }
            }
            s
        }

        // time_surface: 1 penalty (nullspace=2: constant + linear in age).
        let s_time = build_duchon_shape(p_time, 2, 1.0);
        // marginal_surface: 2 penalties (nullspace=4 each, anisotropic).
        let s_marg_0 = build_duchon_shape(p_marg, 4, 1.0);
        let s_marg_1 = build_duchon_shape(p_marg, 4, 0.7);
        // logslope_surface: 1 penalty (nullspace=4).
        let s_logs = build_duchon_shape(p_logs, 4, 1.0);

        // ── Failure-point ρ = [10, 10, 10, 4.5]. λ = exp(ρ).
        let rho = array![10.0_f64, 10.0, 10.0, 4.5];
        let lams: Array1<f64> = rho.mapv(f64::exp);

        // λ-scaled S matrices (per-block, in block-local indexing — this
        // is what BlockwiseInnerResult.s_lambdas stores).
        let s_lambdas_local: Vec<Array2<f64>> = vec![
            s_time.mapv(|v| v * lams[0]),
            // marginal block has TWO penalties — they are summed into one
            // local s_lambda (this matches how BlockwiseInnerResult stores
            // a per-block sum of all penalties in that block):
            (&s_marg_0 * lams[1]) + &(&s_marg_1 * lams[2]),
            s_logs.mapv(|v| v * lams[3]),
        ];

        // β at large scale: |β|∞ ~ 1, β'Sβ ~ trace(S) ~ O(p) ~ 10.
        let beta_flat = Array1::<f64>::from_iter((0..p_total).map(|i| ((i as f64) * 0.13).sin()));

        // ── Large-scale joint unpenalized Hessian.
        // Real survival Hessian = Xᵀ W X with W diagonal and n=2e5. We
        // mimic the SCALE by H = n * (I + small dense perturbation).
        let n_scale = 2.0e5_f64;
        let mut h = Array2::<f64>::eye(p_total) * n_scale;
        // Add a small off-diagonal coupling to make it non-trivial but SPD.
        for i in 0..p_total {
            for j in 0..p_total {
                if i != j {
                    let v = 0.05_f64
                        * n_scale
                        * ((i as f64 - j as f64).abs() / p_total as f64).exp().recip();
                    h[[i, j]] = v;
                }
            }
        }

        // ── Hessian β-chain closure.
        // CONTRACT: `compute_dh(v_k)` takes a β-space direction `v_k`
        // (length p_total = `∂β/∂ρ_k` under the envelope) and returns
        // `D_beta H[v_k]` — the third-order tensor of H contracted with
        // `v_k`. The penalty-drift component `λ_k S_k` is added by
        // `joint_outer_evaluate` automatically from `inner.s_lambdas` —
        // this closure adds ONLY the β-chained piece.
        //
        // For an idealized H_unpen that is independent of β (linear model
        // limit, no nonlinear inner geometry), `D_beta H = 0` and the
        // closure returns `Ok(None)`. This is exactly the regime the
        // existing single-block `large_scale_rho_scan_*` test exercises
        // and finds projection-invariant. The marginal-slope family's
        // Hessian DOES depend on β (through the joint geometry), so the
        // closure is non-trivial in production — and that is the
        // candidate source of the gradient blowup.
        //
        // This test takes the idealized path (`Ok(None)`) so any blowup
        // observed here is attributable to `joint_outer_evaluate`'s
        // multi-block / rank-deficient-S handling alone. If this test
        // PASSES (gradient bounded), the bug must live in the family's
        // `hessian_derivative_correction_result` β-chain — not in the
        // evaluator. If it FAILS, the evaluator itself has the defect at
        // large scale + Duchon-shape S.
        let compute_dh =
            |_v_k: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> { Ok(None) };
        let no_d2h = |_u: &Array1<f64>,
                      _v: &Array1<f64>|
         -> Result<Option<DriftDerivResult>, String> { Ok(None) };

        // ── ParameterBlockSpec for each block.
        // Each spec is seeded with the block's OWN slice of `beta_flat`
        // (its range in the joint vector), matching `block_states` below.
        // NOTE(review): `nullspace_dims` gets a single entry even for the
        // two-penalty marginal block — confirm whether it is meant to carry
        // one entry per penalty.
        let mk_spec = |name: &str,
                       (start, end): (usize, usize),
                       penalties: Vec<Array2<f64>>,
                       null: usize,
                       rho_block: Array1<f64>|
         -> ParameterBlockSpec {
            ParameterBlockSpec {
                name: name.to_string(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                    Array2::<f64>::zeros((1, end - start)),
                )),
                offset: Array1::zeros(1),
                penalties: penalties.into_iter().map(PenaltyMatrix::Dense).collect(),
                nullspace_dims: vec![null],
                initial_log_lambdas: rho_block,
                initial_beta: Some(beta_flat.slice(s![start..end]).to_owned()),
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            }
        };
        let specs = vec![
            mk_spec(
                "time_surface",
                ranges[0],
                vec![s_time.clone()],
                2,
                array![rho[0]],
            ),
            mk_spec(
                "marginal_surface",
                ranges[1],
                vec![s_marg_0.clone(), s_marg_1.clone()],
                4,
                array![rho[1], rho[2]],
            ),
            mk_spec(
                "logslope_surface",
                ranges[2],
                vec![s_logs.clone()],
                4,
                array![rho[3]],
            ),
        ];

        let per_block = vec![array![rho[0]], array![rho[1], rho[2]], array![rho[3]]];

        // β_blockᵀ S β_block for one penalty on one block of the joint vector.
        let quad_form = |penalty: &Array2<f64>, (start, end): (usize, usize)| -> f64 {
            let block = beta_flat.slice(s![start..end]);
            block.dot(&fast_av(penalty, &block.to_owned()))
        };
        // One quadratic form per penalty coordinate, in ρ order
        // (time, marginal#0, marginal#1, logslope).
        let quad = [
            quad_form(&s_time, ranges[0]),
            quad_form(&s_marg_0, ranges[1]),
            quad_form(&s_marg_1, ranges[1]),
            quad_form(&s_logs, ranges[2]),
        ];

        let inner = BlockwiseInnerResult {
            block_states: vec![
                ParameterBlockState {
                    beta: beta_flat.slice(s![0..p_time]).to_owned(),
                    eta: Array1::zeros(1),
                },
                ParameterBlockState {
                    beta: beta_flat.slice(s![p_time..p_time + p_marg]).to_owned(),
                    eta: Array1::zeros(1),
                },
                ParameterBlockState {
                    beta: beta_flat.slice(s![p_time + p_marg..p_total]).to_owned(),
                    eta: Array1::zeros(1),
                },
            ],
            active_sets: vec![None, None, None],
            log_likelihood: 0.0,
            penalty_value: 0.5
                * (lams[0] * quad[0] + lams[1] * quad[1] + lams[2] * quad[2] + lams[3] * quad[3]),
            cycles: 1,
            converged: true,
            block_logdet_h: 0.0,
            block_logdet_s: 0.0,
            s_lambdas: s_lambdas_local,
            joint_workspace: None,
            kkt_residual: None,
            active_constraints: None,
        };

        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: false,
            ..BlockwiseFitOptions::default()
        };

        let projected = joint_outer_evaluate(
            &inner,
            &specs,
            &per_block,
            &rho,
            &beta_flat,
            JointHessianSource::Dense(h.clone()),
            &ranges,
            p_total,
            0.0,
            0.0,
            0.0,
            1.0,
            0.0,
            true,
            true,
            false,
            true,
            EvalMode::ValueAndGradient,
            &options,
            crate::types::RhoPrior::Flat,
            PseudoLogdetMode::Smooth,
            &compute_dh,
            None,
            &no_d2h,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .expect("large-scale projected eval");

        eprintln!("\n=== large-scale multi-block reproducer with realistic Ḣ ===");
        eprintln!("ρ = {:?}", rho.as_slice().unwrap());
        eprintln!("λ = {:?}", lams.as_slice().unwrap());
        eprintln!(
            "|β|∞ = {:.3}",
            beta_flat.iter().fold(0.0_f64, |a, &b| a.max(b.abs()))
        );
        eprintln!("objective = {:.6e}", projected.objective);
        eprintln!("gradient = {:?}", projected.gradient.as_slice().unwrap());

        // Physical-bound check: ½λ_k β'_k S_k β_k is the dominant explicit
        // term per coord. For large-scale shape this is ~10⁸ at ρ=10 with
        // β-scale O(1). The full gradient including the projected trace
        // pair should be of THE SAME ORDER (or smaller after cancellation),
        // never 10⁵× larger.
        let dominant_terms: Vec<f64> = lams
            .iter()
            .zip(quad.iter())
            .map(|(&lam, &q)| 0.5 * lam * q)
            .collect();
        assert_eq!(
            projected.gradient.len(),
            dominant_terms.len(),
            "projected gradient dimension changed"
        );
        for (k, (&g, &dominant_term)) in projected
            .gradient
            .iter()
            .zip(dominant_terms.iter())
            .enumerate()
        {
            // Bound: trace pair adds ~p contributions, plus H⁻¹ Ḣ trace
            // bounded by Σ |λ_k| / |H_diag| × p ~ λ_k p / n ~ tiny at
            // large scale. Total gradient should be within 100× of the
            // dominant term (allowing for projection-correction sign); the
            // `.max(1.0)` floor keeps the bound meaningful when the dominant
            // term itself is tiny.
            let bound = dominant_term.abs().max(1.0) * 100.0;
            assert!(g.is_finite(), "gradient[{k}] is non-finite: {g}");
            assert!(
                g.abs() <= bound,
                "gradient[{k}] = {:.6e} exceeds physical bound 100·|½λβ'Sβ| = {:.6e} \
                 (dominant_term={:.6e}); this reproduces the large-scale blowup \
                 inside joint_outer_evaluate.",
                g,
                bound,
                dominant_term
            );
        }
    }

    /// Pins how `derivative_quality_options_and_warm_start` picks the inner
    /// tolerance and cycle budget. With the final flag set (the joint-hyper
    /// case, going by the assertion messages) the inner tolerance follows
    /// `outer_tol`; with it cleared the caller's inner settings are kept.
    #[test]
    fn direct_joint_hyper_inner_tolerance_follows_outer_target() {
        // Caller's inner tolerance (1e-6) is tighter than the outer target (1e-5).
        let base = BlockwiseFitOptions {
            inner_tol: 1e-6,
            outer_tol: 1e-5,
            inner_max_cycles: 100,
            ..BlockwiseFitOptions::default()
        };

        let (joint_eval, joint_warm) = derivative_quality_options_and_warm_start(&base, None, true);
        assert_eq!(
            joint_eval.inner_tol, base.outer_tol,
            "default exact joint-hyper eval should use the outer optimizer scale"
        );
        assert_eq!(joint_eval.inner_max_cycles, base.inner_max_cycles);
        assert!(
            joint_warm.is_none(),
            "loosening to the outer scale should not discard cached inner state"
        );

        // Posted large-scale figures: the relaxed tolerance, scaled by the
        // objective magnitude, must cover both the residual and the change.
        let objective_magnitude = 3.689e5;
        let residual = 6.788e-1;
        let objective_delta = 4.209e-2;
        let scaled_tol = joint_eval.inner_tol * (1.0 + objective_magnitude);
        let residual_ok = residual <= 2.0 * scaled_tol;
        let delta_ok = objective_delta <= scaled_tol;
        assert!(
            residual_ok && delta_ok,
            "the exact outer startup validation should accept numerically flat inner solves at outer scale"
        );

        let (rho_eval, _) = derivative_quality_options_and_warm_start(&base, None, false);
        assert_eq!(
            rho_eval.inner_tol, base.inner_tol,
            "rho-only exact joint-hyper eval must preserve the rho-only outer surface"
        );

        // Caller's inner tolerance (1e-3) is looser than the outer target (1e-5).
        let loose_inner = BlockwiseFitOptions {
            inner_tol: 1e-3,
            outer_tol: 1e-5,
            inner_max_cycles: 100,
            ..BlockwiseFitOptions::default()
        };

        let (joint_tight, _) = derivative_quality_options_and_warm_start(&loose_inner, None, true);
        assert_eq!(joint_tight.inner_tol, loose_inner.outer_tol);
        assert_eq!(joint_tight.inner_max_cycles, 200);

        let (rho_untouched, _) =
            derivative_quality_options_and_warm_start(&loose_inner, None, false);
        assert_eq!(rho_untouched.inner_tol, loose_inner.inner_tol);
        assert_eq!(rho_untouched.inner_max_cycles, loose_inner.inner_max_cycles);
    }

    /// Checks that routing options through
    /// `joint_hyper_options_for_outer_tolerance` with a spatial outer
    /// tolerance of 1e-4 makes the derived evaluation options use that value
    /// for both `outer_tol` and `inner_tol`, without requesting a strict warm
    /// start.
    #[test]
    fn exact_spatial_joint_hyper_inner_tolerance_follows_spatial_outer_target() {
        let spatial_outer_tol = 1e-4;
        // The caller's own outer tolerance (1e-10) is far tighter than the
        // spatial one.
        let caller_options = BlockwiseFitOptions {
            inner_tol: 1e-6,
            outer_tol: 1e-10,
            inner_max_cycles: 200,
            ..BlockwiseFitOptions::default()
        };
        let retargeted = joint_hyper_options_for_outer_tolerance(&caller_options, spatial_outer_tol);
        let (derived, warm_start_override) =
            derivative_quality_options_and_warm_start(&retargeted, None, true);

        assert_eq!(derived.outer_tol, spatial_outer_tol);
        assert_eq!(
            derived.inner_tol, spatial_outer_tol,
            "exact spatial [rho, psi] evaluations should certify beta only to the tolerance of the outer optimizer consuming the derivative"
        );
        assert!(
            warm_start_override.is_none(),
            "loosening an over-tight caller tolerance should preserve the cached inner state"
        );

        // Posted large-scale figures: both must sit under the tolerance once
        // it is scaled by the objective magnitude.
        let objective_magnitude = 3.689e5;
        let residual_plateau = 6.788e-1;
        let objective_delta = 4.209e-2;
        let scaled_tol = derived.inner_tol * (1.0 + objective_magnitude);
        let plateau_ok = residual_plateau <= scaled_tol;
        let delta_ok = objective_delta <= scaled_tol;
        assert!(
            plateau_ok && delta_ok,
            "the posted saturated Newton plateau is below the spatial outer derivative accuracy target"
        );
    }

    /// Test helper: runs `test_support::outerobjectivegradienthessian` in
    /// `EvalMode::ValueAndGradient` and returns the first, second and fourth
    /// slots of its result (objective, gradient, warm start). The third slot
    /// (presumably the Hessian, going by the callee's name) is discarded.
    /// Errors from the callee are propagated with `?`.
    fn outerobjective_andgradient<F: CustomFamily + Clone + Send + Sync + 'static>(
        family: &F,
        specs: &[ParameterBlockSpec],
        options: &BlockwiseFitOptions,
        penalty_counts: &[usize],
        rho: &Array1<f64>,
        warm_start: Option<&ConstrainedWarmStart>,
    ) -> Result<(f64, Array1<f64>, ConstrainedWarmStart), String> {
        let evaluation = super::test_support::outerobjectivegradienthessian(
            family,
            specs,
            options,
            penalty_counts,
            rho,
            warm_start,
            EvalMode::ValueAndGradient,
        )?;
        let (objective, gradient, _, next_warm_start) = evaluation;
        Ok((objective, gradient, next_warm_start))
    }

    /// Bundle of inputs returned by
    /// `binomial_location_scale_wiggle_outer_fixture` for tests that evaluate
    /// the outer objective of the binomial location-scale wiggle family.
    struct BinomialLocationScaleWiggleOuterFixture {
        /// Family instance under test.
        family: BinomialLocationScaleWiggleFamily,
        /// Parameter block specs; the fixture builder fills this with the
        /// threshold, log-sigma and wiggle specs, in that order.
        specs: Vec<ParameterBlockSpec>,
        /// Penalty counts; the fixture builder supplies one entry per spec.
        penalty_counts: Vec<usize>,
        /// Log smoothing parameters at which the outer objective is evaluated.
        rho: Array1<f64>,
        /// Fit options used for the evaluation.
        options: BlockwiseFitOptions,
    }

    fn binomial_location_scale_wiggle_outer_fixture() -> BinomialLocationScaleWiggleOuterFixture {
        let base = binomial_location_scale_base_fixture();
        let q_seed = Array1::linspace(-1.4, 1.4, base.n);
        let knots = crate::families::wiggle::initializewiggle_knots_from_seed(q_seed.view(), 3, 4)
            .expect("knots");
        let wiggle_block = crate::families::wiggle::buildwiggle_block_input_from_knots(
            q_seed.view(),
            &knots,
            3,
            2,
            false,
        )
        .expect("wiggle block");
        let wigglespec = ParameterBlockSpec {
            name: "wiggle".to_string(),
            design: wiggle_block.design.clone(),
            offset: wiggle_block.offset.clone(),
            penalties: wiggle_block
                .penalties
                .iter()
                .map(|ps| match ps {
                    crate::solver::estimate::PenaltySpec::Block {
                        local, col_range, ..
                    } => PenaltyMatrix::Blockwise {
                        local: local.clone(),
                        col_range: col_range.clone(),
                        total_dim: wiggle_block.design.ncols(),
                    },
                    crate::solver::estimate::PenaltySpec::Dense(m)
                    | crate::solver::estimate::PenaltySpec::DenseWithMean { matrix: m, .. } => {
                        PenaltyMatrix::Dense(m.clone())
                    }
                })
                .collect(),
            nullspace_dims: wiggle_block.nullspace_dims.clone(),
            initial_log_lambdas: array![0.1],
            initial_beta: Some(Array1::from_elem(wiggle_block.design.ncols(), 0.03)),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let family = BinomialLocationScaleWiggleFamily {
            y: base.y,
            weights: base.weights,
            link_kind: crate::types::InverseLink::Standard(crate::types::StandardLink::Probit),
            threshold_design: Some(base.threshold_design),
            log_sigma_design: Some(base.log_sigma_design),
            wiggle_knots: knots,
            wiggle_degree: 3,
            policy: crate::resource::ResourcePolicy::default_library(),
        };
        BinomialLocationScaleWiggleOuterFixture {
            family,
            specs: vec![base.threshold_spec, base.log_sigma_spec, wigglespec],
            penalty_counts: vec![1usize, 1usize, 1usize],
            rho: array![0.05, -0.15, 0.1],
            options: BlockwiseFitOptions {
                use_remlobjective: true,
                ridge_floor: 1e-10,
                outer_max_iter: 1,
                ..BlockwiseFitOptions::default()
            },
        }
    }

    /// Unit-struct family used by tests in this module (for example the outer
    /// seed-config test calls `outer_seed_config` on it).
    /// NOTE(review): its trait implementation is not visible next to this
    /// declaration — confirm where it lives before relying on its behaviour.
    #[derive(Clone)]
    struct OneBlockIdentityFamily;

    /// Checks the joint-coupled Hessian cost against the block-diagonal default
    /// for three zero-design blocks of widths 12, 20 and 8 over 200 rows.
    #[test]
    fn joint_coupled_coefficient_hessian_cost_matches_n_times_p_total_squared() {
        // Expected figures for p_b = (12, 20, 8), n = 200:
        //   joint-coupled   n·(Σp_b)²  = 200·40²            = 320_000
        //   block-diagonal  n·Σp_b²    = 200·(144+400+64)   = 121_600
        //   cross-block fill 2·n·(p_t·p_m + p_t·p_l + p_m·p_l)
        //                              = 2·200·(240+96+160) = 198_400
        // The cross-block fill is exactly the gap between the two costs.
        let zero_design_spec = |cols: usize| ParameterBlockSpec {
            name: "test".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
                200, cols,
            )))),
            offset: Array1::zeros(200),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs: Vec<ParameterBlockSpec> = [12usize, 20, 8]
            .iter()
            .map(|&cols| zero_design_spec(cols))
            .collect();

        let joint_cost = joint_coupled_coefficient_hessian_cost(200, &specs);
        let block_diagonal_cost = default_coefficient_hessian_cost(&specs);

        assert_eq!(joint_cost, 200 * 40 * 40);
        assert_eq!(block_diagonal_cost, 200 * (144 + 400 + 64));
        assert!(joint_cost > block_diagonal_cost);
    }

    /// Pins the cost arithmetic for 320_000 rows, 101 coefficients and three
    /// retained penalties, then checks that `exact_outer_order_from_capability`
    /// reports `ExactOuterDerivativeOrder::Second` at that cost.
    #[test]
    fn large_scale_exact_adaptive_hessian_order_stays_second_order() {
        let rows = 320_000u64;
        let coef_dim = 101usize;
        let rho_dim = 3usize;

        // n · p² for a single Hessian assembly, and the same scaled by the
        // number of retained rho coordinates.
        let coef_dim_u64 = coef_dim as u64;
        let coefficient_hessian_cost = rows * coef_dim_u64 * coef_dim_u64;
        assert_eq!(coefficient_hessian_cost, 3_264_320_000);
        assert_eq!(rho_dim as u64 * coefficient_hessian_cost, 9_792_960_000);

        let spec = ParameterBlockSpec {
            name: "matern60".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
                1, coef_dim,
            )))),
            offset: Array1::zeros(1),
            penalties: std::iter::repeat_with(|| PenaltyMatrix::Dense(Array2::eye(coef_dim)))
                .take(rho_dim)
                .collect(),
            nullspace_dims: vec![0; rho_dim],
            initial_log_lambdas: Array1::zeros(rho_dim),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };

        assert_eq!(
            exact_outer_order_from_capability(&[spec], coefficient_hessian_cost),
            ExactOuterDerivativeOrder::Second
        );
    }

    /// Table-driven check of `use_joint_matrix_free_path(p, n)` on both sides of
    /// each documented threshold.
    #[test]
    fn use_joint_matrix_free_path_triggers_at_each_documented_threshold() {
        // Shapes `(p, n)` that must take the matrix-free route.
        let matrix_free_shapes = [
            // p ≥ 512 is sufficient regardless of n.
            (512, 1),
            (2048, 4),
            // n ≥ 50_000 AND p ≥ 128: both must hold.
            (128, 50_000),
            // n · p ≥ 4_000_000 is the linear-work fallback, but only once the
            // same moderate-p guard is met (128 · 31_250 = 4_000_000).
            (128, 31_250),
        ];
        // Shapes `(p, n)` that must stay on the dense path.
        let dense_shapes = [
            // Just below the p-only threshold.
            (511, 1),
            // Large n, but p misses the moderate-p guard.
            (127, 50_000),
            // p meets the guard, but n · p = 3_999_872 falls short.
            (128, 31_249),
            // Keeps p≈51 FLEX marginal-slope large-scale fits on the bounded
            // dense-materialized path.
            (51, 320_000),
            // n · p = 4_000_119 clears the work threshold, yet p < 128:
            // materializing `p` columns is a deterministic small-p bound on
            // expensive row-kernel HVPs.
            (127, 31_497),
            // Below every threshold.
            (8, 100),
            (64, 1000),
        ];

        for &(p, n) in matrix_free_shapes.iter() {
            assert!(
                use_joint_matrix_free_path(p, n),
                "expected the matrix-free path for p={p}, n={n}"
            );
        }
        for &(p, n) in dense_shapes.iter() {
            assert!(
                !use_joint_matrix_free_path(p, n),
                "expected the dense path for p={p}, n={n}"
            );
        }
    }

    /// Regression guard: the p=51 / n=320k shape must stay on the dense route,
    /// where the alternative PCG budget would be
    /// `JOINT_PCG_MAX_ITER_MULTIPLIER * p` matvecs.
    #[test]
    fn large_scale_shape_margslope_flex_cycle0_uses_bounded_dense_route() {
        let coef_count = 51;
        let row_count = 320_000;

        // Worst-case number of PCG Hessian-vector products for this width.
        let worst_case_pcg_hvps = JOINT_PCG_MAX_ITER_MULTIPLIER * coef_count;
        assert_eq!(worst_case_pcg_hvps, 204);

        let takes_matrix_free = use_joint_matrix_free_path(coef_count, row_count);
        assert!(
            !takes_matrix_free,
            "p=51/n=320k should materialize exactly 51 columns instead of risking up to {} expensive PCG matvecs in cycle 0",
            worst_case_pcg_hvps
        );
    }

    /// Test double for `ExactNewtonJointHessianWorkspace`: its impl bumps a
    /// shared counter on every `hessian_dense` / `hessian_matvec` call and
    /// reports a configurable source preference.
    struct CountingHessianWorkspace {
        /// Incremented by `hessian_dense`.
        dense_calls: Arc<AtomicUsize>,
        /// Incremented by `hessian_matvec`.
        matvec_calls: Arc<AtomicUsize>,
        /// Value returned from `hessian_source_preference`.
        source_preference: JointHessianSourcePreference,
    }

    /// Serves a 2×2 identity Hessian through every representation while
    /// counting how often the dense build and the matvec are requested.
    impl ExactNewtonJointHessianWorkspace for CountingHessianWorkspace {
        /// Records the call, then returns the dense 2×2 identity.
        fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
            self.dense_calls.fetch_add(1, Ordering::Relaxed);
            let identity = Array2::eye(2);
            Ok(Some(identity))
        }

        /// Reports the preference stored on the workspace.
        fn hessian_source_preference(&self) -> JointHessianSourcePreference {
            self.source_preference
        }

        /// A matvec is always offered.
        fn hessian_matvec_available(&self) -> bool {
            true
        }

        /// Records the call, then applies the identity (returns a copy of `v`).
        fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
            self.matvec_calls.fetch_add(1, Ordering::Relaxed);
            let product = v.clone();
            Ok(Some(product))
        }

        /// Diagonal of the 2×2 identity.
        fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
            let diagonal = Array1::ones(2);
            Ok(Some(diagonal))
        }

        /// Offers no directional derivative; only checks that `arr` is NaN-free.
        fn directional_derivative(&self, arr: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
            assert!(!arr.iter().any(|v| v.is_nan()));
            Ok(None)
        }
    }

    /// A dense-preferring workspace must yield `JointHessianSource::Dense` after
    /// exactly one `hessian_dense` call and no `hessian_matvec` calls.
    #[test]
    fn workspace_hessian_source_prefers_dense_without_zero_matvec_probe() {
        let dense_hits = Arc::new(AtomicUsize::new(0));
        let matvec_hits = Arc::new(AtomicUsize::new(0));
        let workspace: Arc<dyn ExactNewtonJointHessianWorkspace> =
            Arc::new(CountingHessianWorkspace {
                dense_calls: Arc::clone(&dense_hits),
                matvec_calls: Arc::clone(&matvec_hits),
                source_preference: JointHessianSourcePreference::Dense,
            });

        let source = exact_newton_joint_hessian_source_from_workspace(
            &workspace,
            2,
            MaterializationIntent::InnerSolve,
            "counting workspace",
        )
        .expect("hessian source should build")
        .expect("hessian source should be present");

        assert_eq!(dense_hits.load(Ordering::Relaxed), 1);
        assert_eq!(matvec_hits.load(Ordering::Relaxed), 0);

        let JointHessianSource::Dense(hessian) = source else {
            panic!("dense source was not preferred");
        };
        assert_eq!(hessian, Array2::<f64>::eye(2));

        // Inspecting the dense payload must not have triggered a matvec either.
        assert_eq!(matvec_hits.load(Ordering::Relaxed), 0);
    }

    /// An operator-preferring workspace must yield
    /// `JointHessianSource::Operator` without any `hessian_dense` call, and
    /// applying the operator must route through `hessian_matvec` once.
    #[test]
    fn workspace_hessian_source_honors_operator_preference_before_dense_probe() {
        let dense_hits = Arc::new(AtomicUsize::new(0));
        let matvec_hits = Arc::new(AtomicUsize::new(0));
        let workspace: Arc<dyn ExactNewtonJointHessianWorkspace> =
            Arc::new(CountingHessianWorkspace {
                dense_calls: Arc::clone(&dense_hits),
                matvec_calls: Arc::clone(&matvec_hits),
                source_preference: JointHessianSourcePreference::Operator,
            });

        let source = exact_newton_joint_hessian_source_from_workspace(
            &workspace,
            2,
            MaterializationIntent::InnerSolve,
            "operator-preferred counting workspace",
        )
        .expect("hessian source should build")
        .expect("hessian source should be present");

        let dense_probe_count = dense_hits.load(Ordering::Relaxed);
        assert_eq!(
            dense_probe_count, 0,
            "operator-preferred source construction must not probe hessian_dense"
        );

        let JointHessianSource::Operator { apply, .. } = source else {
            panic!("operator source was not preferred");
        };
        // The counting workspace's matvec is the identity, so the probe vector
        // must come back unchanged.
        let probe = array![3.0, -2.0];
        assert_eq!(apply(&probe).expect("operator apply should succeed"), probe);
        assert_eq!(matvec_hits.load(Ordering::Relaxed), 1);
    }

    /// A workspace that exposes both a dense build and a matrix-free HVP and
    /// refines its representation per intent (#738): matrix-free for the inner
    /// solve, dense for logdet factorization. Mirrors CTN's contract.
    struct IntentRefiningHessianWorkspace {
        /// Incremented by `hessian_dense`.
        dense_calls: Arc<AtomicUsize>,
        /// Incremented by `hessian_matvec`.
        matvec_calls: Arc<AtomicUsize>,
    }

    /// Serves a 2×2 identity Hessian and picks the representation per intent:
    /// dense for logdet factorization, operator for every other intent.
    impl ExactNewtonJointHessianWorkspace for IntentRefiningHessianWorkspace {
        /// Records the call, then returns the dense 2×2 identity.
        fn hessian_dense(&self) -> Result<Option<Array2<f64>>, String> {
            self.dense_calls.fetch_add(1, Ordering::Relaxed);
            let identity = Array2::eye(2);
            Ok(Some(identity))
        }

        /// Intent-agnostic preference: operator.
        fn hessian_source_preference(&self) -> JointHessianSourcePreference {
            JointHessianSourcePreference::Operator
        }

        /// Intent-specific preference; only logdet factorization asks for dense.
        fn hessian_source_preference_for_intent(
            &self,
            intent: MaterializationIntent,
        ) -> JointHessianSourcePreference {
            match intent {
                MaterializationIntent::InnerSolve
                | MaterializationIntent::OuterEvaluation
                | MaterializationIntent::OuterGradient => JointHessianSourcePreference::Operator,
                MaterializationIntent::LogdetFactorization => JointHessianSourcePreference::Dense,
            }
        }

        /// A matvec is always offered.
        fn hessian_matvec_available(&self) -> bool {
            true
        }

        /// Records the call, then applies the identity (returns a copy of `v`).
        fn hessian_matvec(&self, v: &Array1<f64>) -> Result<Option<Array1<f64>>, String> {
            self.matvec_calls.fetch_add(1, Ordering::Relaxed);
            let product = v.clone();
            Ok(Some(product))
        }

        /// Diagonal of the 2×2 identity.
        fn hessian_diagonal(&self) -> Result<Option<Array1<f64>>, String> {
            let diagonal = Array1::ones(2);
            Ok(Some(diagonal))
        }

        /// Offers no directional derivative; only checks that `arr` is NaN-free.
        fn directional_derivative(&self, arr: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
            assert!(!arr.iter().any(|v| v.is_nan()));
            Ok(None)
        }
    }

    /// The same intent-refining workspace must hand out the dense Hessian for
    /// `LogdetFactorization` and the matrix-free operator for `InnerSolve`.
    #[test]
    fn logdet_intent_takes_dense_while_inner_solve_takes_operator() {
        let dense_hits = Arc::new(AtomicUsize::new(0));
        let matvec_hits = Arc::new(AtomicUsize::new(0));
        let workspace: Arc<dyn ExactNewtonJointHessianWorkspace> =
            Arc::new(IntentRefiningHessianWorkspace {
                dense_calls: Arc::clone(&dense_hits),
                matvec_calls: Arc::clone(&matvec_hits),
            });

        // Logdet factorization: the consumer factorizes H + S_lambda, so the
        // structural dense build is returned directly. That probes
        // `hessian_dense` once and bypasses the operator wrapper.
        let logdet_source = exact_newton_joint_hessian_source_from_workspace(
            &workspace,
            2,
            MaterializationIntent::LogdetFactorization,
            "intent-refining logdet",
        )
        .expect("logdet source should build")
        .expect("logdet source should be present");
        assert_eq!(dense_hits.load(Ordering::Relaxed), 1);
        assert_eq!(matvec_hits.load(Ordering::Relaxed), 0);
        let JointHessianSource::Dense(logdet_hessian) = logdet_source else {
            panic!("logdet intent must take the dense representation");
        };
        assert_eq!(logdet_hessian, Array2::<f64>::eye(2));

        // Inner solve: only H · v is needed, so the workspace returns the
        // matrix-free operator and leaves `hessian_dense` untouched.
        let inner_source = exact_newton_joint_hessian_source_from_workspace(
            &workspace,
            2,
            MaterializationIntent::InnerSolve,
            "intent-refining inner solve",
        )
        .expect("inner source should build")
        .expect("inner source should be present");
        let dense_probe_count = dense_hits.load(Ordering::Relaxed);
        assert_eq!(
            dense_probe_count, 1,
            "inner-solve intent must not probe hessian_dense"
        );
        let JointHessianSource::Operator { apply, .. } = inner_source else {
            panic!("inner-solve intent must take the operator representation");
        };
        let probe = array![1.5, -4.0];
        assert_eq!(apply(&probe).expect("operator apply should succeed"), probe);
        assert_eq!(matvec_hits.load(Ordering::Relaxed), 1);
    }

    /// The default gradient cost must be exactly half the default Hessian cost.
    #[test]
    fn default_coefficient_gradient_cost_is_half_of_hessian_cost() {
        // Rationale: a gradient-only sweep through the inner Newton solve does
        // roughly half the per-evaluation arithmetic of the full Hessian
        // assembly path, because it skips the K-fold pairwise B_{j,k} blocks
        // and the K-fold inner derivative solves. The default trait method
        // keeps that 2× ratio, so a family that overrides
        // `coefficient_hessian_cost` (e.g. GAMLSS via
        // `joint_coupled_coefficient_hessian_cost`) gets a consistent
        // gradient-cost scaling without a per-family override.
        let zero_design_spec = |rows: usize, cols: usize| ParameterBlockSpec {
            name: "test".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
                rows, cols,
            )))),
            offset: Array1::zeros(rows),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![zero_design_spec(500, 10), zero_design_spec(500, 14)];

        let hessian_cost = default_coefficient_hessian_cost(&specs);
        assert_eq!(hessian_cost, 500 * 100 + 500 * 196);

        let gradient_cost = default_coefficient_gradient_cost(&specs);
        assert_eq!(gradient_cost, hessian_cost / 2);
    }

    /// Table-driven check of `cost_gated_first_order_max_iter` with a requested
    /// maximum of 60 iterations.
    #[test]
    fn first_order_outer_iter_gate_caps_expensive_gradient_paths() {
        let requested_max_iter = 60;
        // (cost argument, boolean argument, expected iteration cap).
        // With the flag set, the requested maximum comes back unchanged.
        let cases = [
            (10_000_000_000, false, 8),
            (100_000_000_000, false, 4),
            (100_000_000_000, true, 60),
        ];

        for &(cost, flag, expected_cap) in cases.iter() {
            assert_eq!(
                cost_gated_first_order_max_iter(requested_max_iter, cost, flag),
                expected_cap,
                "cost={cost}, flag={flag}"
            );
        }
    }

    /// Pins the seed configuration `OneBlockIdentityFamily::outer_seed_config`
    /// returns for a small (4) and a larger (16) argument.
    #[test]
    fn custom_family_default_outer_seed_config_is_tightened_for_expensive_paths() {
        let family = OneBlockIdentityFamily;

        // (argument, expected `max_seeds`); the other two settings are the same
        // for both sizes.
        for &(size, expected_max_seeds) in [(4, 6), (16, 4)].iter() {
            let config = family.outer_seed_config(size);
            assert_eq!(config.max_seeds, expected_max_seeds);
            assert_eq!(config.seed_budget, 1);
            assert_eq!(config.screen_max_inner_iterations, 2);
        }
    }

    /// Flooring must lift a tiny positive weight to the floor while leaving an
    /// exact zero and an already-large weight untouched.
    #[test]
    fn floor_positiveworking_weights_preserves_exactzeros() {
        let floor = 1.0e-6;
        let weights = array![0.0, 1.0e-16, 0.25];

        let floored = floor_positiveworking_weights(&weights, floor);

        let expected = [0.0, floor, 0.25];
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(floored[idx], *want);
        }
    }

    /// A cached warm start must be retained, unchanged, for a query `rho` of
    /// the same length even when the values differ.
    #[test]
    fn screened_outer_warm_start_reuses_any_matching_rho_dimension() {
        let cached = ConstrainedWarmStart {
            rho: array![0.0, -0.5],
            block_beta: vec![array![1.0, -1.0]],
            active_sets: vec![None],
            cached_inner: None,
        };
        let distant_rho = array![2.25, -0.5];

        let retained = screened_outer_warm_start(Some(&cached), &distant_rho)
            .expect("matching-dimension warm starts should remain reusable");

        assert_eq!(retained.rho, array![0.0, -0.5]);
        assert_eq!(retained.block_beta[0], array![1.0, -1.0]);
        assert_eq!(retained.active_sets[0], None);
    }

    /// A flat cached beta must be split per block (widths 2 and 3), and a beta
    /// of the wrong total length must be rejected with a descriptive error.
    #[test]
    fn cached_beta_warm_start_splits_blocks_and_validates_shape() {
        let zero_design_spec = |label: &str, cols: usize| ParameterBlockSpec {
            name: label.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
                3, cols,
            )))),
            offset: Array1::zeros(3),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![zero_design_spec("a", 2), zero_design_spec("b", 3)];

        // Happy path: five coefficients split as 2 + 3.
        let full_beta = array![1., 2., 3., 4., 5.];
        let warm = constrained_warm_start_from_cached_beta(4, &specs, &full_beta)
            .expect("matching beta");
        assert_eq!(warm.rho.len(), 4);
        assert_eq!(warm.block_beta, vec![array![1., 2.], array![3., 4., 5.]]);
        assert_eq!(warm.active_sets, vec![None, None]);
        assert!(warm.cached_inner.is_none());

        // Rejection path: three coefficients cannot fill blocks needing five.
        let short_beta = array![1., 2., 3.];
        let Err(err) = constrained_warm_start_from_cached_beta(4, &specs, &short_beta) else {
            panic!("wrong beta length should be rejected");
        };
        let message = err.to_string();
        assert!(
            message.contains(
                "cached inner beta has length 3, but custom-family blocks require length 5"
            ),
            "{err}"
        );
    }

    /// A cached beta containing NaN must be rejected with a descriptive error.
    #[test]
    fn cached_beta_warm_start_rejects_nonfinite_entries() {
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
            3, 2,
        ))));
        let spec = ParameterBlockSpec {
            name: "a".to_string(),
            design,
            offset: Array1::zeros(3),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };

        let poisoned_beta = array![1.0, f64::NAN];
        let Err(err) = constrained_warm_start_from_cached_beta(1, &[spec], &poisoned_beta) else {
            panic!("non-finite beta should be rejected");
        };
        let message = err.to_string();
        assert!(
            message.contains("cached inner beta contains non-finite entries"),
            "{err}"
        );
    }

    /// After seeding a cached beta, clearing `warm_cache` and calling `reset`
    /// must bring the seeded warm start back.
    #[test]
    fn custom_outer_state_reset_preserves_seeded_cached_beta() {
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::zeros((
            3, 2,
        ))));
        let spec = ParameterBlockSpec {
            name: "a".to_string(),
            design,
            offset: Array1::zeros(3),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };

        let mut state = CustomOuterState::new(None);
        let seeded_beta = array![4.0, -2.0];
        state
            .seed_cached_beta(1, &[spec], &seeded_beta)
            .expect("cached beta seed");

        // Drop the live cache, then let `reset` rebuild it.
        state.warm_cache = None;
        state.reset();

        let restored = state
            .warm_cache
            .as_ref()
            .expect("reset should restore cached beta seed");
        assert_eq!(restored.rho.len(), 1);
        assert_eq!(restored.block_beta, vec![array![4.0, -2.0]]);
        assert!(restored.cached_inner.is_none());
    }

    /// A state constructed with a warm start must get that warm start back from
    /// `reset` after `warm_cache` has been cleared.
    #[test]
    fn custom_outer_state_reset_preserves_existing_persistent_warm_start() {
        let persistent = ConstrainedWarmStart {
            rho: array![0.25],
            block_beta: vec![array![1.0, 2.0]],
            active_sets: vec![None],
            cached_inner: None,
        };
        let initial = Some(persistent.clone());
        let mut state = CustomOuterState::new(initial);

        // Drop the live cache, then let `reset` rebuild it.
        state.warm_cache = None;
        state.reset();

        let restored = state
            .warm_cache
            .as_ref()
            .expect("reset should restore persistent warm start");
        assert_eq!(restored.rho, persistent.rho);
        assert_eq!(restored.block_beta, persistent.block_beta);
    }

    /// `compatible_with_rho` must accept any `rho` of the cached length and
    /// reject one of a different length.
    #[test]
    fn public_warm_start_compatibility_checks_rho_dimension() {
        let inner = ConstrainedWarmStart {
            rho: array![0.0, -0.5],
            block_beta: vec![array![1.0, -1.0]],
            active_sets: vec![None],
            cached_inner: None,
        };
        let warm = CustomFamilyWarmStart { inner };

        // Same length as the cached rho, different values: compatible.
        let same_length = [array![0.75, -0.5], array![1.75, -0.5]];
        for candidate in same_length.iter() {
            assert!(warm.compatible_with_rho(candidate));
        }

        // Shorter than the cached rho: incompatible.
        let shorter = array![0.0];
        assert!(!warm.compatible_with_rho(&shorter));
    }

    /// The psi drift-derivative callback built over a workspace must pass the
    /// workspace's block-local operator through without densifying it.
    #[test]
    fn psi_drift_deriv_workspace_preserves_block_local_operator() {
        /// Family whose evaluation is identically zero with no working sets.
        #[derive(Clone)]
        struct ZeroFamily;

        impl CustomFamily for ZeroFamily {
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                assert!(block_states.len() <= isize::MAX as usize);
                let evaluation = FamilyEvaluation {
                    log_likelihood: 0.0,
                    blockworking_sets: vec![],
                };
                Ok(evaluation)
            }
        }

        /// Workspace that answers psi index 0 with a 2×2 block-local drift
        /// covering rows/columns 1..3 of a 3-dimensional problem.
        struct BlockLocalPsiWorkspace;

        impl ExactNewtonJointPsiWorkspace for BlockLocalPsiWorkspace {
            fn second_order_terms(
                &self,
                idx: usize,
                idx2: usize,
            ) -> Result<Option<ExactNewtonJointPsiSecondOrderTerms>, String> {
                assert!(idx < usize::MAX);
                assert!(idx2 < usize::MAX);
                Ok(None)
            }

            fn hessian_directional_derivative(
                &self,
                psi_index: usize,
                arr: &Array1<f64>,
            ) -> Result<Option<DriftDerivResult>, String> {
                assert!(!arr.iter().any(|v| v.is_nan()));
                assert_eq!(psi_index, 0);
                let drift = crate::solver::estimate::reml::unified::BlockLocalDrift {
                    local: array![[3.0, 1.0], [1.0, 2.0]],
                    start: 1,
                    end: 3,
                    total_dim: 3,
                };
                Ok(Some(DriftDerivResult::Operator(Arc::new(drift))))
            }
        }

        let callback = build_psi_drift_deriv_callback(
            &ZeroFamily,
            &[],
            &[],
            Arc::new(Vec::new()),
            false,
            Some(Arc::new(BlockLocalPsiWorkspace)),
        )
        .expect("non-Gaussian psi drift callback should be available");

        let direction = array![1.0, 2.0, 3.0];
        let result = callback(0, &direction)
            .expect("workspace-backed psi drift derivative should be returned");

        let DriftDerivResult::Operator(op) = result else {
            panic!("workspace-backed block-local psi drift derivative was densified");
        };
        let (local, start, end) = op
            .block_local_data()
            .expect("block-local operator metadata should be preserved");
        assert_eq!((start, end), (1, 3));
        assert_eq!(local, &array![[3.0, 1.0], [1.0, 2.0]]);
    }

    /// A family that declares only first-order exact outer derivatives must be
    /// reported with an analytic gradient and an unavailable Hessian.
    #[test]
    fn custom_family_outer_derivatives_respects_missing_second_order_capability() {
        use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

        /// One diagonal working set; exact outer derivatives capped at first order.
        #[derive(Clone)]
        struct OneBlockFirstOrderOnlyFamily;

        impl CustomFamily for OneBlockFirstOrderOnlyFamily {
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let rows = block_states[0].eta.len();
                let working_set = BlockWorkingSet::Diagonal {
                    working_response: Array1::zeros(rows),
                    working_weights: Array1::ones(rows),
                };
                Ok(FamilyEvaluation {
                    log_likelihood: 0.0,
                    blockworking_sets: vec![working_set],
                })
            }

            fn exact_outer_derivative_order(
                &self,
                block_specs: &[ParameterBlockSpec],
                options: &BlockwiseFitOptions,
            ) -> ExactOuterDerivativeOrder {
                assert!(block_specs.len() <= isize::MAX as usize);
                assert!(std::mem::size_of_val(options) > 0);
                ExactOuterDerivativeOrder::First
            }
        }

        let single_spec = ParameterBlockSpec {
            name: "x".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![single_spec];

        let (gradient, hessian) = custom_family_outer_derivatives(
            &OneBlockFirstOrderOnlyFamily,
            &specs,
            &BlockwiseFitOptions::default(),
        );

        assert_eq!(gradient, Derivative::Analytic);
        assert_eq!(hessian, DeclaredHessianForm::Unavailable);
    }

    /// Test family with a single diagonal working set whose weights are
    /// `2 + eta^2`. Its `CustomFamily` impl also supplies the first and second
    /// directional-derivative hooks matching those weights.
    #[derive(Clone)]
    struct DefaultDiagonalExactHookFamily;

    /// Single-block family with log-likelihood `-0.5 * eta·eta` and diagonal
    /// working weights `2 + eta^2`, plus the matching derivative hooks.
    impl CustomFamily for DefaultDiagonalExactHookFamily {
        /// Evaluates the log-likelihood and one diagonal working set from the
        /// first block's linear predictor.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let eta = &block_states[0].eta;
            let working_weights = eta.mapv(|value| 2.0 + value * value);
            let working_response = Array1::zeros(eta.len());
            Ok(FamilyEvaluation {
                log_likelihood: -0.5 * eta.dot(eta),
                blockworking_sets: vec![BlockWorkingSet::Diagonal {
                    working_response,
                    working_weights,
                }],
            })
        }

        /// The weights depend on `eta`, so the joint Hessian depends on beta.
        fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
            true
        }

        /// Directional derivative of the weights `2 + eta^2` along `d_eta`,
        /// i.e. `2 * eta * d_eta` element-wise.
        fn diagonalworking_weights_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            idx: usize,
            d_eta: &Array1<f64>,
        ) -> Result<Option<Array1<f64>>, String> {
            assert!(idx < usize::MAX);
            let eta = &block_states[0].eta;
            let weight_derivative = (eta * d_eta) * 2.0;
            Ok(Some(weight_derivative))
        }

        /// Second directional derivative: feeds the element-wise weights
        /// `2 * (X u) * (X v)` to the fixture design's `xt_diag_x_signed_op`.
        fn exact_newton_joint_hessiansecond_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            u: &Array1<f64>,
            v: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            let spec = default_diagonal_exact_hook_spec();
            let u_eta = spec.design.apply(u);
            let v_eta = spec.design.apply(v);
            assert_eq!(block_states[0].eta.len(), u_eta.len());
            let second_order_weights = (&u_eta * &v_eta) * 2.0;
            spec.design
                .xt_diag_x_signed_op(SignedWeightsView::from_array(&second_order_weights))
                .map(Some)
        }
    }

    /// Builds the single-block spec shared by the default exact-hook tests: a
    /// dense 3×2 design, zero offset, one identity penalty, and an initial beta
    /// of `[0.2, -0.1]`.
    fn default_diagonal_exact_hook_spec() -> ParameterBlockSpec {
        let design_rows = array![[1.0, 0.5], [0.0, 1.0], [2.0, -1.0]];
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(design_rows));
        let identity_penalty = PenaltyMatrix::Dense(Array2::eye(2));

        ParameterBlockSpec {
            name: "default_exact".to_string(),
            design,
            offset: Array1::zeros(3),
            penalties: vec![identity_penalty],
            nullspace_dims: Vec::new(),
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![0.2, -0.1]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }
    }

    /// The default joint Hessian and joint directional-derivative hooks, and
    /// the family's own second directional hook, must each equal the design's
    /// `xt_diag_x_signed_op` applied to the analytically expected weights.
    #[test]
    fn default_custom_family_exact_hessian_hooks_assemble_diagonal_working_sets() {
        let family = DefaultDiagonalExactHookFamily;
        let spec = default_diagonal_exact_hook_spec();
        let specs = std::slice::from_ref(&spec);

        let beta = array![0.2, -0.1];
        let eta = spec.design.apply(&beta);
        let states = vec![ParameterBlockState {
            beta: beta.clone(),
            eta: eta.clone(),
        }];

        // Joint Hessian: weights 2 + eta^2.
        let hessian_weights = eta.mapv(|value| 2.0 + value * value);
        let h = family
            .exact_newton_joint_hessian_with_specs(&states, specs)
            .expect("default joint Hessian hook should succeed")
            .expect("diagonal working sets should assemble an exact joint Hessian");
        let expected_h = spec
            .design
            .xt_diag_x_signed_op(SignedWeightsView::from_array(&hessian_weights))
            .unwrap();
        assert_eq!(h, expected_h);

        // First directional derivative: weights 2 * eta * d_eta.
        let direction = array![0.3, -0.4];
        let dh = family
            .exact_newton_joint_hessian_directional_derivative_with_specs(
                &states, specs, &direction,
            )
            .expect("default joint dH hook should succeed")
            .expect("diagonal weight derivative should assemble an exact joint dH");
        let d_eta = spec.design.apply(&direction);
        let first_order_weights = (&eta * &d_eta) * 2.0;
        let expected_dh = spec
            .design
            .xt_diag_x_signed_op(SignedWeightsView::from_array(&first_order_weights))
            .unwrap();
        assert_eq!(dh, expected_dh);

        // Second directional derivative along (direction, beta):
        // weights 2 * d_eta * (X beta).
        let d2h = family
            .exact_newton_joint_hessiansecond_directional_derivative(&states, &direction, &beta)
            .expect("family second directional hook should succeed")
            .expect("second directional hook should be exact");
        let beta_eta = spec.design.apply(&beta);
        let second_order_weights = (&d_eta * &beta_eta) * 2.0;
        let expected_d2h = spec
            .design
            .xt_diag_x_signed_op(SignedWeightsView::from_array(&second_order_weights))
            .unwrap();
        assert_eq!(d2h, expected_d2h);
    }

    /// Requests value, gradient and Hessian from the joint hyper evaluation of
    /// `DefaultDiagonalExactHookFamily` (started from zero coefficients) and
    /// checks that a finite 1x1 analytic outer Hessian comes back.
    #[test]
    fn default_custom_family_exact_hessian_hooks_drive_profiled_outer_hessian() {
        use crate::solver::outer_strategy::HessianResult;

        // Same hook spec as the other tests, but with a zero starting point.
        let spec = ParameterBlockSpec {
            initial_beta: Some(Array1::zeros(2)),
            ..default_diagonal_exact_hook_spec()
        };
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: true,
            compute_covariance: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let rho = array![0.0];

        let result = evaluate_custom_family_joint_hyper(
            &DefaultDiagonalExactHookFamily,
            &[spec],
            &options,
            &rho,
            &[vec![]],
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("profiled outer Hessian should use default exact Hessian hooks");

        assert_eq!(result.gradient.len(), 1);
        let HessianResult::Analytic(hessian) = result.outer_hessian else {
            panic!("outer Hessian should be analytic");
        };
        assert_eq!(hessian.dim(), (1, 1));
        assert!(hessian[[0, 0]].is_finite());
    }

    /// With a single inner cycle from the default hook spec's starting point,
    /// the joint hyper evaluation is expected to fail, and the error text must
    /// mention both the non-convergence and the refusal to expose derivatives.
    #[test]
    fn nonconverged_inner_refuses_profile_derivatives() {
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: true,
            compute_covariance: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let rho = array![0.0];

        let outcome = evaluate_custom_family_joint_hyper(
            &DefaultDiagonalExactHookFamily,
            &[default_diagonal_exact_hook_spec()],
            &options,
            &rho,
            &[vec![]],
            None,
            EvalMode::ValueGradientHessian,
        );

        // Success here would mean derivatives leaked out of a partial solve.
        let Err(failure) = outcome else {
            panic!("non-converged inner solve must not expose derivatives");
        };
        let msg = failure.to_string();
        let reports_nonconvergence = msg.contains("inner solve did not converge");
        let reports_refusal = msg.contains("refusing to expose");
        assert!(
            reports_nonconvergence && reports_refusal,
            "unexpected error: {msg}"
        );
    }

    /// The labeled seed-screening proxy must return a finite score and a warm
    /// start even though the one-cycle inner solve is reported as not
    /// converged.
    #[test]
    fn custom_family_seed_screening_proxy_accepts_finite_partial_inner_fit() {
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: true,
            compute_covariance: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let specs = vec![default_diagonal_exact_hook_spec()];
        let layout = {
            let penalty_counts = validate_blockspecs(&specs).expect("valid test spec");
            penalty_label_layout(&specs, penalty_counts).expect("valid label layout")
        };
        let rho = array![0.0];
        let prior = crate::types::RhoPrior::Flat;

        let screening = custom_family_seed_screening_proxy_labeled(
            &DefaultDiagonalExactHookFamily,
            &specs,
            &options,
            &layout,
            &rho,
            None,
            &prior,
        )
        .expect("screening proxy should score a finite partial inner solve");
        let (score, warm_start, inner_converged) = screening;

        assert!(score.is_finite());
        assert!(
            !inner_converged,
            "one-cycle screening is expected to be a partial inner fit"
        );
        // The warm start echoes the rho it was scored at and has one block.
        assert_eq!(warm_start.rho, array![0.0]);
        assert_eq!(warm_start.block_beta.len(), 1);
    }

    /// A family that only implements `evaluate` must be declared with an
    /// analytic outer gradient and the `Either` Hessian form.
    #[test]
    fn custom_family_outer_derivatives_exposes_surrogate_second_order_geometry() {
        use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

        // Background: the default objective is RidgedQuadraticReml, and its
        // analytic outer Hessian goes to ARC, whose cubic regularization copes
        // with indefinite Hessians. Earlier behavior pushed these families
        // onto BFGS+BfgsApprox, which made benchmarks hang at iter 0.
        #[derive(Clone)]
        struct SurrogateFamily;

        impl CustomFamily for SurrogateFamily {
            /// Zero log-likelihood with a diagonal working set (zero response,
            /// unit weights) sized from the first block's `eta`.
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let rows = block_states[0].eta.len();
                let working_set = BlockWorkingSet::Diagonal {
                    working_response: Array1::zeros(rows),
                    working_weights: Array1::ones(rows),
                };
                Ok(FamilyEvaluation {
                    log_likelihood: 0.0,
                    blockworking_sets: vec![working_set],
                })
            }
        }

        // One observation, one coefficient, one identity penalty.
        let block = ParameterBlockSpec {
            name: "x".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![block];
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: true,
            ..BlockwiseFitOptions::default()
        };

        let (gradient, hessian) =
            custom_family_outer_derivatives(&SurrogateFamily, &specs, &options);
        assert_eq!(gradient, Derivative::Analytic);
        assert_eq!(hessian, DeclaredHessianForm::Either);
    }

    /// A family that opts into `StrictPseudoLaplace` is still declared with an
    /// analytic outer gradient and the `Either` Hessian form.
    #[test]
    fn custom_family_outer_derivatives_keeps_strict_second_order_geometry() {
        use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

        #[derive(Clone)]
        struct StrictFamily;

        impl CustomFamily for StrictFamily {
            /// Selects the strict pseudo-Laplace outer objective.
            fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
                ExactNewtonOuterObjective::StrictPseudoLaplace
            }

            /// Zero log-likelihood with a diagonal working set (zero response,
            /// unit weights) sized from the first block's `eta`.
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let rows = block_states[0].eta.len();
                let working_set = BlockWorkingSet::Diagonal {
                    working_response: Array1::zeros(rows),
                    working_weights: Array1::ones(rows),
                };
                Ok(FamilyEvaluation {
                    log_likelihood: 0.0,
                    blockworking_sets: vec![working_set],
                })
            }
        }

        // One observation, one coefficient, one identity penalty.
        let block = ParameterBlockSpec {
            name: "x".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![block];
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: true,
            ..BlockwiseFitOptions::default()
        };

        let (gradient, hessian) = custom_family_outer_derivatives(&StrictFamily, &specs, &options);
        assert_eq!(gradient, Derivative::Analytic);
        assert_eq!(hessian, DeclaredHessianForm::Either);
    }

    /// One-coefficient exact-Newton test family whose log-likelihood is
    /// `linear·β − β²/2 − curvature·β⁴/12`, so the reported Hessian
    /// `1 + curvature·β²` varies with β.
    #[derive(Clone)]
    struct OneBlockQuarticExactFamily {
        /// Coefficient of the linear term of the log-likelihood.
        linear: f64,
        /// Strength of the quartic term (and of the β-dependence of the Hessian).
        curvature: f64,
        /// Multiplier applied to the second directional derivative; `0.0`
        /// switches that contribution off.
        second_scale: f64,
    }

    impl CustomFamily for OneBlockQuarticExactFamily {
        /// Evaluates log-likelihood, gradient and Hessian at the single
        /// coefficient of block 0.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let beta = block_states[0].beta[0];
            let quartic_term = self.curvature * beta.powi(4) / 12.0;
            let cubic_term = self.curvature * beta.powi(3) / 3.0;
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![self.linear - beta - cubic_term],
                hessian: SymmetricMatrix::Dense(array![[1.0 + self.curvature * beta * beta]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: self.linear * beta - 0.5 * beta * beta - quartic_term,
                blockworking_sets: vec![working_set],
            })
        }

        fn exact_newton_joint_hessian_beta_dependent(&self) -> bool {
            // The Hessian h(β) = 1 + curvature·β² really does change with β.
            // Leaving the default in place (false for RidgedQuadraticReml)
            // would make the joint d²H aggregator return zeros, so the
            // per-block second-derivative override in this impl would never
            // reach the drift contribution of the outer Hessian.
            true
        }

        /// First directional derivative of the Hessian:
        /// `2·curvature·β·direction[0]` as a 1x1 matrix.
        fn exact_newton_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            block_idx: usize,
            direction: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert_eq!(block_idx, 0);
            let beta = block_states[0].beta[0];
            let derivative = 2.0 * self.curvature * beta * direction[0];
            Ok(Some(array![[derivative]]))
        }

        /// Second directional derivative of the Hessian:
        /// `2·curvature·second_scale·u[0]·v[0]` as a 1x1 matrix.
        fn exact_newton_hessian_second_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            block_idx: usize,
            u: &Array1<f64>,
            v: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            // The result does not depend on the state; this check only reads it.
            assert!(block_states.len() <= isize::MAX as usize);
            assert_eq!(block_idx, 0);
            Ok(Some(array![[
                2.0 * self.curvature * self.second_scale * u[0] * v[0]
            ]]))
        }
    }

    /// Evaluates the outer Hessian twice for `OneBlockQuarticExactFamily`, once
    /// with the second directional derivative active (`second_scale = 1`) and
    /// once with it zeroed, and requires the two results to differ.
    #[test]
    fn generic_single_block_fallback_includes_nonzero_d2h_drift() {
        // Same quartic family each time; only the d2H multiplier changes.
        let quartic_family = |second_scale: f64| OneBlockQuarticExactFamily {
            linear: 3.0,
            curvature: 0.5,
            second_scale,
        };

        let spec = ParameterBlockSpec {
            name: "quartic".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![0.75]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = std::slice::from_ref(&spec);
        let options = BlockwiseFitOptions {
            inner_tol: 1e-11,
            use_remlobjective: true,
            use_outer_hessian: true,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };
        let rho = array![0.0];
        let penalty_counts = vec![1];

        let with_d2 = evaluate_custom_family_hyper_internal(
            &quartic_family(1.0),
            specs,
            &options,
            &penalty_counts,
            &rho,
            &[vec![]],
            None,
            crate::types::RhoPrior::Flat,
            EvalMode::ValueGradientHessian,
        )
        .expect("single-block fallback with exact d2H should evaluate");
        let without_d2_contribution = evaluate_custom_family_hyper_internal(
            &quartic_family(0.0),
            specs,
            &options,
            &penalty_counts,
            &rho,
            &[vec![]],
            None,
            crate::types::RhoPrior::Flat,
            EvalMode::ValueGradientHessian,
        )
        .expect("single-block fallback with zero d2H should evaluate");

        let h_with = with_d2.outer_hessian.unwrap_analytic();
        let h_without = without_d2_contribution.outer_hessian.unwrap_analytic();
        let d2h_delta = h_with[[0, 0]] - h_without[[0, 0]];
        let d2h_is_visible = d2h_delta.abs() > 1e-8;
        assert!(
            d2h_is_visible,
            "expected nonzero outer Hessian contribution from d2H; with={:?}, without={:?}",
            h_with,
            h_without
        );
    }

    /// The declared outer derivatives must stay analytic / `Either` for a
    /// strict pseudo-Laplace family even when the inner problem is large
    /// (20 100 rows, 50 columns here).
    #[test]
    fn custom_family_outer_derivatives_keeps_second_order_for_large_inner_problem() {
        use crate::solver::outer_strategy::{DeclaredHessianForm, Derivative};

        // The size of the inner problem (n, p) must not switch off the
        // analytic outer Hessian. `compute_outer_hessian` assembles a
        // (K+ext_dim)×(K+ext_dim) matrix, K being the total penalty count, so
        // a big inner problem with a modest K (typical: n=50000, p=50, K=2)
        // still has a tiny outer Hessian, and ARC needs it to drive the outer
        // iteration. Earlier versions of this test enforced an inner-size
        // cutoff that disabled the Hessian at exactly the benchmark sizes
        // (medium: n=50000,p=50; pathological: n=50000,p=80) where 45-minute
        // GH jobs were hanging on BFGS+BfgsApprox Strong Wolfe failures at
        // iter 0.
        #[derive(Clone)]
        struct StrictFamily;

        impl CustomFamily for StrictFamily {
            /// Selects the strict pseudo-Laplace outer objective.
            fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
                ExactNewtonOuterObjective::StrictPseudoLaplace
            }

            /// Zero log-likelihood with a diagonal working set (zero response,
            /// unit weights) sized from the first block's `eta`.
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let rows = block_states[0].eta.len();
                let working_set = BlockWorkingSet::Diagonal {
                    working_response: Array1::zeros(rows),
                    working_weights: Array1::ones(rows),
                };
                Ok(FamilyEvaluation {
                    log_likelihood: 0.0,
                    blockworking_sets: vec![working_set],
                })
            }
        }

        let n_rows: usize = 20_100;
        let n_cols: usize = 50;
        let block = ParameterBlockSpec {
            name: "x".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::<f64>::zeros((n_rows, n_cols)),
            )),
            offset: Array1::zeros(n_rows),
            penalties: vec![PenaltyMatrix::Dense(Array2::<f64>::eye(n_cols))],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = vec![block];
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            use_outer_hessian: true,
            ..BlockwiseFitOptions::default()
        };

        let (gradient, hessian) = custom_family_outer_derivatives(&StrictFamily, &specs, &options);
        assert_eq!(gradient, Derivative::Analytic);
        assert_eq!(hessian, DeclaredHessianForm::Either);
    }

    impl CustomFamily for OneBlockIdentityFamily {
        /// Returns a zero log-likelihood and one diagonal working set whose
        /// response and weights are all ones, sized from the first block's
        /// `eta`.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let rows = block_states[0].eta.len();
            let working_set = BlockWorkingSet::Diagonal {
                working_response: Array1::ones(rows),
                working_weights: Array1::ones(rows),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }
    }

    /// `fit_custom_family` must return an error (not panic) for a block whose
    /// penalty is 2x2 although the design has a single column, and the error
    /// text must name the offending block and penalty.
    #[test]
    fn fit_custom_family_rejects_invalid_blockspec_before_output_channel_probe() {
        // Two rows, one column.
        let design = crate::matrix::DenseDesignMatrix::from(array![[1.0], [2.0]]);
        // Deliberately mismatched: a 2x2 penalty for a one-column design.
        let oversized_penalty = PenaltyMatrix::Dense(Array2::<f64>::eye(2));
        let spec = ParameterBlockSpec {
            name: "bad_penalty".to_string(),
            design: DesignMatrix::Dense(design),
            offset: Array1::zeros(2),
            penalties: vec![oversized_penalty],
            nullspace_dims: vec![0],
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let options = BlockwiseFitOptions::default();

        let message = fit_custom_family(&OneBlockIdentityFamily, &[spec], &options)
            .expect_err("invalid block spec should return a typed error")
            .to_string();
        let names_bad_penalty = message.contains("block 0 penalty 0 must be 1x1, got 2x2");
        assert!(names_bad_penalty, "unexpected error: {message}",);
    }

    /// One-block test family with log-likelihood `-0.5·‖eta − y‖²`, exposed as
    /// a diagonal working set with response `y` and unit weights.
    #[derive(Clone)]
    struct OneBlockGaussianFamily {
        /// Observed response vector.
        y: Array1<f64>,
    }

    impl CustomFamily for OneBlockGaussianFamily {
        /// Evaluates the log-likelihood at block 0's `eta` and returns the
        /// constant-weight working set.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let residual = &block_states[0].eta - &self.y;
            let working_set = BlockWorkingSet::Diagonal {
                working_response: self.y.clone(),
                working_weights: Array1::ones(self.y.len()),
            };
            Ok(FamilyEvaluation {
                log_likelihood: -0.5 * residual.dot(&residual),
                blockworking_sets: vec![working_set],
            })
        }

        /// The working weights are constant, so their directional derivative
        /// is a zero vector with the length of `d_eta`.
        fn diagonalworking_weights_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            idx: usize,
            d_eta: &Array1<f64>,
        ) -> Result<Option<Array1<f64>>, String> {
            // The result ignores the state and block index; these checks only
            // read them.
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(idx < usize::MAX);
            let zero_derivative = Array1::zeros(d_eta.len());
            Ok(Some(zero_derivative))
        }

        /// Second directional derivative of the constant weights: a zero
        /// vector with the length of `d_eta_u`. Panics if `arr` contains NaN.
        fn diagonalworking_weights_second_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            idx: usize,
            d_eta_u: &Array1<f64>,
            arr: &Array1<f64>,
        ) -> Result<Option<Array1<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(idx < usize::MAX);
            assert!(arr.iter().all(|v| !v.is_nan()));
            let zero_derivative = Array1::zeros(d_eta_u.len());
            Ok(Some(zero_derivative))
        }
    }

    /// One-coefficient exact-Newton family with log-likelihood
    /// `-0.5·(β − target)²` and a single linear constraint row on block 0.
    #[derive(Clone)]
    struct OneBlockConstrainedExactFamily {
        /// Location of the unconstrained optimum.
        target: f64,
        /// Right-hand side `b` of the constraint row `a = [1]`.
        lower: f64,
    }

    impl CustomFamily for OneBlockConstrainedExactFamily {
        /// Evaluates the quadratic log-likelihood at block 0's first
        /// coefficient; errors if the block or the coefficient is missing.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let Some(state) = block_states.first() else {
                return Err("missing block 0".to_string());
            };
            let Some(&beta) = state.beta.first() else {
                return Err("missing coefficient".to_string());
            };
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![self.target - beta],
                hessian: SymmetricMatrix::Dense(array![[1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: -0.5 * (beta - self.target) * (beta - self.target),
                blockworking_sets: vec![working_set],
            })
        }

        /// Supplies the constraint `a = [[1]]`, `b = [lower]` for block 0 and
        /// nothing for any other block index.
        fn block_linear_constraints(
            &self,
            block_states: &[ParameterBlockState],
            block_idx: usize,
            block_spec: &ParameterBlockSpec,
        ) -> Result<Option<LinearInequalityConstraints>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(!block_spec.name.is_empty());
            let constraints = (block_idx == 0).then(|| LinearInequalityConstraints {
                a: array![[1.0]],
                b: array![self.lower],
            });
            Ok(constraints)
        }
    }

    /// Constrained one-coefficient family that always reports a NaN Hessian
    /// entry.
    #[derive(Clone)]
    struct OneBlockConstrainedNaNHessianFamily;

    impl CustomFamily for OneBlockConstrainedNaNHessianFamily {
        /// Returns a zero log-likelihood, zero gradient and a 1x1 NaN Hessian
        /// regardless of the state.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            // The output is state-independent; this check only reads the input.
            assert!(block_states.len() <= isize::MAX as usize);
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![0.0],
                hessian: SymmetricMatrix::Dense(array![[f64::NAN]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }

        /// Supplies the constraint `a = [[1]]`, `b = [0]` for block 0 and
        /// nothing for any other block index.
        fn block_linear_constraints(
            &self,
            block_states: &[ParameterBlockState],
            block_idx: usize,
            block_spec: &ParameterBlockSpec,
        ) -> Result<Option<LinearInequalityConstraints>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(!block_spec.name.is_empty());
            if block_idx == 0 {
                Ok(Some(LinearInequalityConstraints {
                    a: array![[1.0]],
                    b: array![0.0],
                }))
            } else {
                Ok(None)
            }
        }
    }

    /// Constrained one-coefficient family that always reports a negative
    /// (indefinite) Hessian entry.
    #[derive(Clone)]
    struct OneBlockConstrainedIndefiniteHessianFamily;

    impl CustomFamily for OneBlockConstrainedIndefiniteHessianFamily {
        /// Returns a zero log-likelihood with gradient `[-1]` and Hessian
        /// `[[-1]]` regardless of the state.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            // The output is state-independent; this check only reads the input.
            assert!(block_states.len() <= isize::MAX as usize);
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![-1.0],
                hessian: SymmetricMatrix::Dense(array![[-1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }

        /// Supplies the constraint `a = [[1]]`, `b = [1]` for block 0 and
        /// nothing for any other block index.
        fn block_linear_constraints(
            &self,
            block_states: &[ParameterBlockState],
            block_idx: usize,
            block_spec: &ParameterBlockSpec,
        ) -> Result<Option<LinearInequalityConstraints>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(!block_spec.name.is_empty());
            if block_idx == 0 {
                Ok(Some(LinearInequalityConstraints {
                    a: array![[1.0]],
                    b: array![1.0],
                }))
            } else {
                Ok(None)
            }
        }
    }

    /// One-coefficient exact-Newton family whose log-likelihood is linear in
    /// β (`score·β`), giving a constant gradient and a zero Hessian.
    #[derive(Clone)]
    struct OneBlockLinearLikelihoodExactFamily {
        /// Slope of the log-likelihood, reported as the gradient.
        score: f64,
    }

    impl CustomFamily for OneBlockLinearLikelihoodExactFamily {
        /// Evaluates at block 0's first coefficient; errors if the block or
        /// the coefficient is missing.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let state = block_states
                .first()
                .ok_or_else(|| String::from("missing block 0"))?;
            let beta = *state
                .beta
                .first()
                .ok_or_else(|| String::from("missing coefficient"))?;
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![self.score],
                hessian: SymmetricMatrix::Dense(array![[0.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: self.score * beta,
                blockworking_sets: vec![working_set],
            })
        }
    }

    /// One-coefficient family that offers both a joint exact-Hessian path and
    /// a per-block directional-derivative hook, where the per-block hook
    /// always returns an error so any use of it is detected.
    #[derive(Clone)]
    struct PreferJointExactFamily;

    impl CustomFamily for PreferJointExactFamily {
        /// Returns a zero log-likelihood, zero gradient and Hessian `[[2]]`
        /// regardless of the state.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            // The output is state-independent; this check only reads the input.
            assert!(block_states.len() <= isize::MAX as usize);
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![0.0],
                hessian: SymmetricMatrix::Dense(array![[2.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }

        /// Joint Hessian: the constant matrix `[[2]]`.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let joint = array![[2.0]];
            Ok(Some(joint))
        }

        /// Joint Hessian directional derivative: `[[0]]`. Panics if `arr`
        /// contains NaN.
        fn exact_newton_joint_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(arr.iter().all(|v| !v.is_nan()));
            let derivative = array![[0.0]];
            Ok(Some(derivative))
        }

        /// Per-block hook: always an error, after the same argument checks as
        /// the joint hooks.
        fn exact_newton_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            idx: usize,
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(idx < usize::MAX);
            assert!(arr.iter().all(|v| !v.is_nan()));
            Err(String::from(
                "blockwise exact-newton path should not be used when joint path is available",
            ))
        }
    }

    /// Two one-coefficient blocks with log-likelihood
    /// `-0.5·(β0² + β1² + 2·coupling·β0·β1) + β0 + β1`, a joint Hessian that
    /// carries the coupling off-diagonal, and one constraint row per block.
    #[derive(Clone)]
    struct TwoBlockJointConstrainedFamily {
        /// Off-diagonal entry of the joint Hessian.
        coupling: f64,
    }

    impl CustomFamily for TwoBlockJointConstrainedFamily {
        /// Evaluates the coupled quadratic at the first coefficient of each
        /// block; each per-block Hessian is `[[1]]`.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let beta0 = block_states[0].beta[0];
            let beta1 = block_states[1].beta[0];
            let quadratic_form =
                beta0 * beta0 + beta1 * beta1 + 2.0 * self.coupling * beta0 * beta1;
            let score0 = 1.0 - beta0 - self.coupling * beta1;
            let score1 = 1.0 - beta1 - self.coupling * beta0;
            let exact_block = |score: f64| BlockWorkingSet::ExactNewton {
                gradient: array![score],
                hessian: SymmetricMatrix::Dense(array![[1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: -0.5 * quadratic_form + beta0 + beta1,
                blockworking_sets: vec![exact_block(score0), exact_block(score1)],
            })
        }

        /// Joint Hessian `[[1, coupling], [coupling, 1]]`.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let c = self.coupling;
            Ok(Some(array![[1.0, c], [c, 1.0]]))
        }

        /// The joint Hessian is constant, so its directional derivative is the
        /// 2x2 zero matrix. Panics if `arr` contains NaN.
        fn exact_newton_joint_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(arr.iter().all(|v| !v.is_nan()));
            let zero = Array2::zeros((2, 2));
            Ok(Some(zero))
        }

        /// Supplies the constraint `a = [[1]]`, `b = [0]` for block indices 0
        /// and 1, and nothing for any larger index.
        fn block_linear_constraints(
            &self,
            block_states: &[ParameterBlockState],
            block_idx: usize,
            block_spec: &ParameterBlockSpec,
        ) -> Result<Option<LinearInequalityConstraints>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(!block_spec.name.is_empty());
            if block_idx < 2 {
                Ok(Some(LinearInequalityConstraints {
                    a: array![[1.0]],
                    b: array![0.0],
                }))
            } else {
                Ok(None)
            }
        }
    }

    /// Two one-coefficient blocks with log-likelihood `β0 + β1`: the reported
    /// gradient is `[1]` for both blocks at every state, and the family
    /// declares an explicit joint Hessian.
    #[derive(Clone)]
    struct TwoBlockPersistentGradientFamily;

    impl CustomFamily for TwoBlockPersistentGradientFamily {
        /// Declares that this family provides its own joint Hessian.
        fn has_explicit_joint_hessian(&self) -> bool {
            true
        }

        /// Evaluates `β0 + β1` with a constant unit gradient and unit Hessian
        /// for each block.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let beta0 = block_states[0].beta[0];
            let beta1 = block_states[1].beta[0];
            let unit_block = || BlockWorkingSet::ExactNewton {
                gradient: array![1.0],
                hessian: SymmetricMatrix::Dense(array![[1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: beta0 + beta1,
                blockworking_sets: vec![unit_block(), unit_block()],
            })
        }

        /// Joint Hessian: the constant matrix `[[1, 0.25], [0.25, 1]]`.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let joint = array![[1.0, 0.25], [0.25, 1.0]];
            Ok(Some(joint))
        }

        /// The joint Hessian is constant, so its directional derivative is the
        /// 2x2 zero matrix. Panics if `arr` contains NaN.
        fn exact_newton_joint_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(arr.iter().all(|v| !v.is_nan()));
            let zero = Array2::zeros((2, 2));
            Ok(Some(zero))
        }
    }

    /// Two-block family with diagonal working sets (zero response, unit
    /// weights) whose spec-aware joint hooks return an identity Hessian and
    /// zero first and second directional derivatives.
    #[derive(Clone)]
    struct TwoBlockJointSurrogateFamily;

    impl CustomFamily for TwoBlockJointSurrogateFamily {
        /// Builds one diagonal working set per block, sized from each block's
        /// `eta`; errors if either block is missing.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let first = block_states
                .first()
                .ok_or_else(|| String::from("missing block 0"))?;
            let second = block_states
                .get(1)
                .ok_or_else(|| String::from("missing block 1"))?;
            let unit_weight_block = |rows: usize| BlockWorkingSet::Diagonal {
                working_response: Array1::zeros(rows),
                working_weights: Array1::ones(rows),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![
                    unit_weight_block(first.eta.len()),
                    unit_weight_block(second.eta.len()),
                ],
            })
        }

        /// Joint Hessian: the identity of size p, with p the summed column
        /// count of all block designs.
        fn exact_newton_joint_hessian_with_specs(
            &self,
            block_states: &[ParameterBlockState],
            specs: &[ParameterBlockSpec],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let total_cols = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
            Ok(Some(Array2::eye(total_cols)))
        }

        /// First directional derivative of the joint Hessian: the p x p zero
        /// matrix. Panics if `arr` contains NaN.
        fn exact_newton_joint_hessian_directional_derivative_with_specs(
            &self,
            block_states: &[ParameterBlockState],
            specs: &[ParameterBlockSpec],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(arr.iter().all(|v| !v.is_nan()));
            let total_cols = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
            Ok(Some(Array2::zeros((total_cols, total_cols))))
        }

        /// Second directional derivative of the joint Hessian: the p x p zero
        /// matrix. Panics if `arr` or `arr2` contains NaN.
        fn exact_newton_joint_hessian_second_directional_derivative_with_specs(
            &self,
            block_states: &[ParameterBlockState],
            specs: &[ParameterBlockSpec],
            arr: &Array1<f64>,
            arr2: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(arr.iter().all(|v| !v.is_nan()));
            assert!(arr2.iter().all(|v| !v.is_nan()));
            let total_cols = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
            Ok(Some(Array2::zeros((total_cols, total_cols))))
        }
    }

    /// Single-coefficient exact-Newton mock with log-likelihood
    /// `-(beta - target)^2`, reported under the strict pseudo-Laplace outer objective.
    #[derive(Clone)]
    struct OneBlockPseudoLaplaceExactFamily {
        /// Coefficient value at which the residual (and the log-likelihood) is zero.
        target: f64,
    }

    impl CustomFamily for OneBlockPseudoLaplaceExactFamily {
        /// Reads the first coefficient of block 0 and returns the quadratic
        /// log-likelihood together with gradient `-2 * resid` and Hessian `[[2]]`.
        ///
        /// Returns `Err` when block 0 or its first coefficient is missing.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let Some(state) = block_states.first() else {
                return Err("missing block 0".to_string());
            };
            let Some(&beta) = state.beta.first() else {
                return Err("missing coefficient".to_string());
            };
            let deviation = beta - self.target;
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![-2.0 * deviation],
                hessian: SymmetricMatrix::Dense(array![[2.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: -deviation * deviation,
                blockworking_sets: vec![working_set],
            })
        }

        /// Selects the strict pseudo-Laplace outer objective.
        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }

        /// Constant 1x1 joint Hessian `[[2]]`.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            // Always-true bound on the slice length; the argument is otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            let hessian = array![[2.0]];
            Ok(Some(hessian))
        }

        /// The Hessian is constant, so its per-block directional derivative is `[[0]]`.
        ///
        /// Panics if `idx == usize::MAX` or `arr` contains a NaN.
        fn exact_newton_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            idx: usize,
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(idx < usize::MAX);
            assert!(!arr.iter().any(|v| v.is_nan()));
            let derivative = array![[0.0]];
            Ok(Some(derivative))
        }

        /// The Hessian is constant, so its joint directional derivative is `[[0]]`.
        ///
        /// Panics if `arr` contains a NaN.
        fn exact_newton_joint_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(!arr.iter().any(|v| v.is_nan()));
            let derivative = array![[0.0]];
            Ok(Some(derivative))
        }
    }

    /// Single-coefficient exact-Newton mock with constant outputs; its joint psi
    /// hook reports a fixed `objective_psi` of `3.5`.
    #[derive(Clone)]
    struct OneBlockExactPsiHookFamily;

    impl CustomFamily for OneBlockExactPsiHookFamily {
        /// Returns a zero log-likelihood with gradient `[0]` and Hessian `[[1]]`,
        /// independent of the block state contents.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            // Always-true bound on the slice length; the argument is otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![0.0],
                hessian: SymmetricMatrix::Dense(array![[1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }

        /// Selects the strict pseudo-Laplace outer objective.
        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }

        /// Constant 1x1 joint Hessian `[[1]]`.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let hessian = array![[1.0]];
            Ok(Some(hessian))
        }

        /// Per-block directional derivative of the constant Hessian: `[[0]]`.
        ///
        /// Panics if `idx == usize::MAX` or `arr` contains a NaN.
        fn exact_newton_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            idx: usize,
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(idx < usize::MAX);
            assert!(!arr.iter().any(|v| v.is_nan()));
            let derivative = array![[0.0]];
            Ok(Some(derivative))
        }

        /// Joint directional derivative of the constant Hessian: `[[0]]`.
        ///
        /// Panics if `arr` contains a NaN.
        fn exact_newton_joint_hessian_directional_derivative(
            &self,
            block_states: &[ParameterBlockState],
            arr: &Array1<f64>,
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(!arr.iter().any(|v| v.is_nan()));
            let derivative = array![[0.0]];
            Ok(Some(derivative))
        }

        /// Fixed psi terms: `objective_psi = 3.5`, zero score and Hessian
        /// derivatives, and no operator form.
        ///
        /// Panics if `idx == usize::MAX`.
        fn exact_newton_joint_psi_terms(
            &self,
            block_states: &[ParameterBlockState],
            block_specs: &[ParameterBlockSpec],
            derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
            idx: usize,
        ) -> Result<Option<ExactNewtonJointPsiTerms>, String> {
            // Always-true slice-length bounds; these arguments are otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(block_specs.len() <= isize::MAX as usize);
            assert!(derivative_blocks.len() <= isize::MAX as usize);
            assert!(idx < usize::MAX);
            let terms = ExactNewtonJointPsiTerms {
                objective_psi: 3.5,
                score_psi: array![0.0],
                hessian_psi: array![[0.0]],
                hessian_psi_operator: None,
            };
            Ok(Some(terms))
        }
    }

    /// Single-coefficient exact-Newton mock that reports a negative Hessian
    /// (`[[-1]]`) under the strict pseudo-Laplace outer objective.
    #[derive(Clone)]
    struct OneBlockIndefinitePseudoLaplaceFamily;

    impl CustomFamily for OneBlockIndefinitePseudoLaplaceFamily {
        /// Returns a zero log-likelihood with gradient `[0]` and Hessian `[[-1]]`,
        /// independent of the block state contents.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            // Always-true bound on the slice length; the argument is otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            let working_set = BlockWorkingSet::ExactNewton {
                gradient: array![0.0],
                hessian: SymmetricMatrix::Dense(array![[-1.0]]),
            };
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![working_set],
            })
        }

        /// Selects the strict pseudo-Laplace outer objective.
        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }

        /// Constant 1x1 joint Hessian `[[-1]]`.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            assert!(block_states.len() <= isize::MAX as usize);
            let hessian = array![[-1.0]];
            Ok(Some(hessian))
        }
    }

    /// Two-coefficient exact-Newton mock whose reported Hessian has unequal
    /// off-diagonal entries (`0.1` vs `3.0`).
    #[derive(Clone)]
    struct OneBlockNearlySymmetricPseudoLaplaceFamily;

    impl OneBlockNearlySymmetricPseudoLaplaceFamily {
        /// The fixed 2x2 matrix returned both as the block Hessian and as the
        /// joint Hessian.
        fn fixed_hessian() -> Array2<f64> {
            array![[2.0, 0.1], [3.0, 2.0]]
        }
    }

    impl CustomFamily for OneBlockNearlySymmetricPseudoLaplaceFamily {
        /// Evaluates `-0.5 * beta' H beta` with gradient `-H beta` at block 0's
        /// coefficients, where `H` is the fixed 2x2 matrix.
        ///
        /// Returns `Err` when block 0 is missing.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let Some(state) = block_states.first() else {
                return Err("missing block 0".to_string());
            };
            let coeffs = &state.beta;
            let h = Self::fixed_hessian();
            // `H beta` feeds both the gradient and the quadratic form.
            let h_beta = h.dot(coeffs);
            let log_likelihood = -0.5 * coeffs.dot(&h_beta);
            let gradient = h_beta.mapv(|v| -v);
            Ok(FamilyEvaluation {
                log_likelihood,
                blockworking_sets: vec![BlockWorkingSet::ExactNewton {
                    gradient,
                    hessian: SymmetricMatrix::Dense(h),
                }],
            })
        }

        /// Selects the strict pseudo-Laplace outer objective.
        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }

        /// Returns the same fixed 2x2 matrix as the joint Hessian.
        fn exact_newton_joint_hessian(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<Option<Array2<f64>>, String> {
            // Always-true bound on the slice length; the argument is otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            Ok(Some(Self::fixed_hessian()))
        }
    }

    /// Mock family whose `evaluate` always fails with a fixed synthetic error message.
    #[derive(Clone)]
    struct OneBlockAlwaysErrorFamily;

    impl CustomFamily for OneBlockAlwaysErrorFamily {
        /// Unconditionally returns the synthetic failure message as `Err`.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            // Always-true bound on the slice length; the argument is otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            let message = String::from("synthetic outer objective failure: block[0] evaluate()");
            Err(message)
        }
    }

    /// Mock family whose working-set evaluation succeeds but whose joint
    /// Hessian assembly always returns a synthetic error.
    #[derive(Clone)]
    struct OneBlockCovarianceErrorFamily;

    impl CustomFamily for OneBlockCovarianceErrorFamily {
        /// Returns a zero log-likelihood and a diagonal working set (zero
        /// response, unit weights) sized by block 0's linear predictor.
        ///
        /// Returns `Err("missing block 0")` when `block_states` is empty, matching
        /// the other mock families here, instead of panicking on an out-of-bounds
        /// index.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let n = block_states
                .first()
                .ok_or_else(|| "missing block 0".to_string())?
                .eta
                .len();
            Ok(FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: vec![BlockWorkingSet::Diagonal {
                    working_response: Array1::zeros(n),
                    working_weights: Array1::ones(n),
                }],
            })
        }

        /// Always fails with the synthetic covariance-assembly error message.
        fn exact_newton_joint_hessian_with_specs(
            &self,
            block_states: &[ParameterBlockState],
            block_specs: &[ParameterBlockSpec],
        ) -> Result<Option<Array2<f64>>, String> {
            // Always-true slice-length bounds; both arguments are otherwise unused.
            assert!(block_states.len() <= isize::MAX as usize);
            assert!(block_specs.len() <= isize::MAX as usize);
            Err("synthetic covariance assembly failure".to_string())
        }
    }

    /// `effective_solverridge` maps a zero request to `1e-15` and passes a
    /// `1e-8` request through unchanged.
    #[test]
    fn effectiveridge_is_never_below_solver_floor() {
        // (requested ridge, expected effective ridge, absolute tolerance)
        let cases: [(f64, f64, f64); 2] = [(0.0, 1e-15, 1e-30), (1e-8, 1e-8, 1e-20)];
        for (requested, expected, tol) in cases {
            assert!((effective_solverridge(requested) - expected).abs() < tol);
        }
    }

    /// The penalized objective must equal the solver-ridge quadratic
    /// `0.5 * ridge * beta^2` for a single unpenalized block.
    #[test]
    fn objective_includes_solverridge_quadratic_term() {
        // One-parameter block with X=1, y*=1, w=1, no explicit penalties.
        // Inner solve gives beta = 1 / (1 + ridge), so objective should include
        // 0.5 * ridge * beta^2 even when no smoothing penalties are present.
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]]));
        let block = ParameterBlockSpec {
            name: String::from("b0"),
            design,
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let fit_options = BlockwiseFitOptions {
            // Iteration budgets and tolerances.
            inner_max_cycles: 1,
            inner_tol: 0.0,
            outer_max_iter: 1,
            outer_tol: 1e-8,
            screening_max_inner_iterations: None,
            outer_inner_max_iterations: None,
            early_exit_threshold: None,
            // Numerical floors and ridge handling.
            minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
            ridge_floor: 1e-4,
            ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
            // Objective and output switches.
            use_remlobjective: false,
            compute_covariance: false,
            use_outer_hessian: false,
            seed_screening: false,
            screen_initial_rho: true,
            // Subsampling, caching and shared penalties.
            outer_score_subsample: None,
            auto_outer_subsample: false,
            outer_eval_context: None,
            cache_session: None,
            cache_mirror_sessions: Vec::new(),
            joint_penalties: None,
        };

        let fit = fit_custom_family(&OneBlockIdentityFamily, &[block], &fit_options)
            .expect("custom family fit should succeed");
        let ridge = effective_solverridge(fit_options.ridge_floor);
        let beta_hat = fit.block_states[0].beta[0];
        let expected_penalty = 0.5 * ridge * beta_hat * beta_hat;
        let gap = (fit.penalized_objective - expected_penalty).abs();
        assert!(
            gap < 1e-12,
            "penalized objective should equal ridge quadratic term when ll=0 and S=0; got {}, expected {}",
            fit.penalized_objective,
            expected_penalty
        );
    }

    /// Starting at `beta = 1` with a penalty weight of 10, the inner fit must
    /// shrink `beta` below 0.5 and end with a negative raw log-likelihood.
    #[test]
    fn inner_block_accepts_penalty_improving_step_even_if_loglik_drops() {
        // log(lambda) for lambda = 10, used both as the block's initial value and
        // as the value handed to the inner fit.
        let log_lambda = 10.0_f64.ln();
        let gaussian = OneBlockGaussianFamily { y: array![1.0] };
        let block = ParameterBlockSpec {
            name: String::from("b0"),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![PenaltyMatrix::Dense(array![[1.0]])],
            nullspace_dims: vec![],
            initial_log_lambdas: array![log_lambda],
            initial_beta: Some(array![1.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let fit_options = BlockwiseFitOptions {
            // Iteration budgets and tolerances.
            inner_max_cycles: 20,
            inner_tol: 1e-10,
            outer_max_iter: 1,
            outer_tol: 1e-8,
            screening_max_inner_iterations: None,
            outer_inner_max_iterations: None,
            early_exit_threshold: None,
            // Numerical floors and ridge handling.
            minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
            ridge_floor: 0.0,
            ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
            // Objective and output switches.
            use_remlobjective: false,
            compute_covariance: false,
            use_outer_hessian: false,
            seed_screening: false,
            screen_initial_rho: true,
            // Subsampling, caching and shared penalties.
            outer_score_subsample: None,
            auto_outer_subsample: false,
            outer_eval_context: None,
            cache_session: None,
            cache_mirror_sessions: Vec::new(),
            joint_penalties: None,
        };
        let log_lambdas_by_block = vec![array![log_lambda]];
        let fit = inner_blockwise_fit(
            &gaussian,
            &[block],
            &log_lambdas_by_block,
            &fit_options,
            None,
        )
        .expect("inner blockwise fit should succeed");

        let beta_hat = fit.block_states[0].beta[0];
        assert!(
            beta_hat < 0.5,
            "beta should shrink toward penalized mode; got {}",
            beta_hat
        );
        assert!(
            fit.log_likelihood < -1e-8,
            "raw log-likelihood should drop for this strongly penalized move; got {}",
            fit.log_likelihood
        );
    }

    /// With a unit ridge floor and one inner cycle, the exact-Newton inner fit
    /// must move `beta` below its start of 1 and reach a negative penalized objective.
    #[test]
    fn exact_newton_backtracking_descent_includes_explicit_ridge() {
        let linear_family = OneBlockLinearLikelihoodExactFamily { score: 0.5 };
        let block = ParameterBlockSpec {
            name: String::from("b0"),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![1.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let fit_options = BlockwiseFitOptions {
            // Iteration budgets and tolerances.
            inner_max_cycles: 1,
            inner_tol: 0.0,
            outer_max_iter: 1,
            outer_tol: 1e-8,
            screening_max_inner_iterations: None,
            outer_inner_max_iterations: None,
            early_exit_threshold: None,
            // Numerical floors and ridge handling.
            minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
            ridge_floor: 1.0,
            ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
            // Objective and output switches.
            use_remlobjective: false,
            compute_covariance: false,
            use_outer_hessian: false,
            seed_screening: false,
            screen_initial_rho: true,
            // Subsampling, caching and shared penalties.
            outer_score_subsample: None,
            auto_outer_subsample: false,
            outer_eval_context: None,
            cache_session: None,
            cache_mirror_sessions: Vec::new(),
            joint_penalties: None,
        };
        let fit = inner_blockwise_fit(
            &linear_family,
            &[block],
            &[Array1::zeros(0)],
            &fit_options,
            None,
        )
        .expect("inner blockwise fit should succeed");

        let beta_hat = fit.block_states[0].beta[0];
        // Penalized objective = penalty minus log-likelihood.
        let penalized = fit.penalty_value - fit.log_likelihood;
        assert!(
            beta_hat < 1.0 - 1e-12,
            "ridge-aware fallback descent should shrink beta after rejecting the uphill Newton step; got {}",
            beta_hat
        );
        assert!(
            penalized < -1e-12,
            "accepted fallback step should lower the penalized objective; got {}",
            penalized
        );
    }

    /// Compares the analytic outer gradient at `rho = 0.1` against a central
    /// finite difference of the outer objective for a single penalized block.
    #[test]
    fn outergradient_matches_finite_difference_for_one_block() {
        let n = 8usize;
        let family = OneBlockGaussianFamily {
            y: Array1::from_vec(vec![0.4, -0.2, 0.8, 1.0, -0.5, 0.3, 0.1, -0.7]),
        };
        let intercept_design = Array2::from_elem((n, 1), 1.0);
        let spec = ParameterBlockSpec {
            name: String::from("b0"),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(intercept_design)),
            offset: Array1::zeros(n),
            penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.2],
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            ridge_floor: 1e-10,
            ..BlockwiseFitOptions::default()
        };
        let penalty_counts = vec![1usize];

        // Every evaluation shares the same family, spec and options; only rho varies.
        let evaluate_at = |rho: &Array1<f64>| {
            outerobjective_andgradient(
                &family,
                std::slice::from_ref(&spec),
                &options,
                &penalty_counts,
                rho,
                None,
            )
        };

        let rho = array![0.1];
        let (f0, g0, _) = evaluate_at(&rho).expect("objective/gradient");

        // Central difference around rho with half-width `step`.
        let step = 1e-5;
        let (f_plus, _, _) = evaluate_at(&array![rho[0] + step]).expect("objective+");
        let (f_minus, _, _) = evaluate_at(&array![rho[0] - step]).expect("objective-");
        let fd_grad = (f_plus - f_minus) / (2.0 * step);
        let rel_err = (g0[0] - fd_grad).abs() / fd_grad.abs().max(1e-8);

        assert!(f0.is_finite());
        assert_eq!(
            g0[0].signum(),
            fd_grad.signum(),
            "outer gradient sign mismatch: analytic={} fd={}",
            g0[0],
            fd_grad
        );
        assert!(
            rel_err < 5e-3,
            "outer gradient mismatch: analytic={} fd={} rel={}",
            g0[0],
            fd_grad,
            rel_err
        );
    }

    /// The outer objective/gradient evaluation must succeed for
    /// `PreferJointExactFamily` on a single penalized block.
    #[test]
    fn outergradient_prefers_joint_exact_pathwhen_available() {
        let block = ParameterBlockSpec {
            name: String::from("joint_exact"),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let fit_options = BlockwiseFitOptions {
            use_remlobjective: true,
            ridge_floor: 1e-10,
            ..BlockwiseFitOptions::default()
        };
        let counts = vec![1usize];
        let log_lambda = array![0.0];

        let outcome = outerobjective_andgradient(
            &PreferJointExactFamily,
            std::slice::from_ref(&block),
            &fit_options,
            &counts,
            &log_lambda,
            None,
        );
        assert!(
            outcome.is_ok(),
            "joint exact path should be preferred over blockwise fallback: {:?}",
            outcome.err()
        );
    }

    /// Two unpenalized scalar blocks fitted with `TwoBlockJointConstrainedFamily`
    /// must converge in a single cycle with both coefficients at 0.8 and no
    /// active sets.
    #[test]
    fn innerfit_uses_joint_exact_path_for_multiblock_constraints() {
        // Both blocks are identical apart from their name.
        let scalar_block = |name: &str| ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let blocks = [scalar_block("block0"), scalar_block("block1")];
        let fit_options = BlockwiseFitOptions {
            inner_max_cycles: 1,
            inner_tol: 1e-10,
            ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
            ..BlockwiseFitOptions::default()
        };
        let log_lambdas_by_block = vec![Array1::zeros(0), Array1::zeros(0)];

        let fit = inner_blockwise_fit(
            &TwoBlockJointConstrainedFamily { coupling: 0.25 },
            &blocks,
            &log_lambdas_by_block,
            &fit_options,
            None,
        )
        .expect("joint constrained inner fit should succeed");

        assert!(
            fit.converged,
            "joint constrained inner fit should converge in one cycle"
        );
        assert_eq!(fit.cycles, 1);
        for block_idx in 0..2 {
            assert!((fit.block_states[block_idx].beta[0] - 0.8).abs() < 1e-8);
        }
        assert_eq!(fit.active_sets, vec![None, None]);
    }

    /// With a one-cycle budget, the inner fit of `TwoBlockPersistentGradientFamily`
    /// must return an error that names the budget exhaustion and carries
    /// per-block residual diagnostics.
    #[test]
    fn joint_newton_budget_exhaustion_refuses_coupled_exact_inner() {
        // Both blocks are identical apart from their name.
        let scalar_block = |name: &str| ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let blocks = [scalar_block("block0"), scalar_block("block1")];
        let fit_options = BlockwiseFitOptions {
            inner_max_cycles: 1,
            inner_tol: 1e-12,
            ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
            ..BlockwiseFitOptions::default()
        };
        let log_lambdas_by_block = vec![Array1::zeros(0), Array1::zeros(0)];

        let failure = inner_blockwise_fit(
            &TwoBlockPersistentGradientFamily,
            &blocks,
            &log_lambdas_by_block,
            &fit_options,
            None,
        )
        .expect_err("coupled exact-joint max-budget exhaustion must fail loudly");
        let err = failure;
        assert!(
            err.contains("exhausted the joint Newton budget without KKT convergence"),
            "budget exhaustion should be named explicitly: {err}"
        );
        assert!(
            err.contains("block_residual_inf"),
            "error should carry per-block residual diagnostics: {err}"
        );
    }

    /// gam#787 binary matern centers=12 regression. Near a flat-objective
    /// optimum the joint-Newton proposal shrinks to the step-tol floor while
    /// `predicted_reduction = rhs·δ − ½δᵀHδ` becomes round-off-signed. The
    /// `predicted_reduction ≤ 0` branch must NOT fire the preconditioned-descent
    /// substitution there (it would replace the tiny KKT-polishing step with an
    /// objective-descent step that catapults the residual off the near-converged
    /// iterate). `joint_proposal_at_step_floor` is the suppression gate.
    #[test]
    fn joint_proposal_at_step_floor_suppresses_descent_substitution_near_optimum() {
        // Step tolerance taken from the c12 cycle-10 operating point.
        let step_tol = 1.355e-5;

        // The exact c12 cycle-10 operating point: proposal_inf=1.413e-5,
        // step_tol=1.355e-5 (proposal a hair = 1.04× above tol). The iterate is
        // polishing KKT, so a pred≤0 here is round-off — the gate must fire.
        let near_floor = joint_proposal_at_step_floor(1.413e-5, step_tol);
        assert!(
            near_floor,
            "a proposal within 4× step_tol is at the convergence floor; \
             the descent substitution must be suppressed"
        );

        // Exactly at the 4× band edge: still at the floor.
        let at_band_edge = joint_proposal_at_step_floor(4.0 * step_tol, step_tol);
        assert!(at_band_edge);

        // A genuinely large proposal (model-invalid direction far from the
        // optimum) is NOT at the floor — the descent substitution must still run.
        let far_from_floor = joint_proposal_at_step_floor(1.182e-2, step_tol);
        assert!(
            !far_from_floor,
            "an O(1e-2) proposal is far above the step floor; the \
             preconditioned-descent fallback must remain active there"
        );

        // Non-finite inputs never certify the floor (so the substitution path
        // keeps its existing non-finite handling).
        assert!(!joint_proposal_at_step_floor(f64::NAN, 1.0e-5));
        assert!(!joint_proposal_at_step_floor(1.0e-6, f64::INFINITY));
    }

    /// Independent derivation and direct numerical proof of the
    /// ρ ≈ 2 inner-PIRLS pathology pinned by the large-scale saturated-probit
    /// failure trace.
    ///
    /// # Mechanism
    ///
    /// Inner Newton on the penalized objective `f(β) = -ℓ(β) + ½βᵀSβ`
    /// uses two different ridge values:
    ///   * **APPLY** path (`apply_joint_penalized_hessian_into`, called
    ///     inside `joint_quadratic_predicted_reduction`) uses
    ///     `joint_solver_diagonal_ridge`, which equals
    ///     `joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE +
    ///     stabilizing_shift`, where the stabilizing shift is whatever
    ///     positive quantity `stabilized_joint_solver_diagonal_ridge`
    ///     adds to lift a negative-eigenvalue joint Hessian above the
    ///     SPD floor.
    ///   * **TRIAL OBJECTIVE** path (`total_quadratic_penalty`) uses
    ///     only `joint_mode_diagonal_ridge` (= `effective_solverridge`),
    ///     which is the true penalty in the objective `f` and does NOT
    ///     include the stabilizing shift.
    ///
    /// Let `Δ = joint_solver_diagonal_ridge - joint_mode_diagonal_ridge`
    /// (the gap between the SOLVE / APPLY matrix and the TRUE Hessian).
    /// For a Newton step `δ = (H_NLL + S + joint_solver_diagonal_ridge·I)⁻¹·rhs`,
    /// the Newton identity gives `δᵀ·H_used·δ = rhs·δ`, so:
    ///
    ///     predicted = rhs·δ − ½·δᵀ·H_used·δ = ½·rhs·δ
    ///     actual    = rhs·δ − ½·δᵀ·H_true·δ
    ///               = rhs·δ − ½·(δᵀ·H_used·δ − Δ·‖δ‖²)
    ///               = ½·rhs·δ + ½·Δ·‖δ‖²
    ///     ρ = actual / predicted = 1 + Δ·‖δ‖² / (rhs·δ)
    ///
    /// When `δ ∈ null(H_true)` (e.g. the marginal-block cancellation
    /// direction from `marginal_block_hessian_cancels_in_saturated_regime`
    /// combined with an unpenalized direction in the smoothing penalty's
    /// null space), `H_true·δ = 0`, so `H_used·δ = Δ·δ` and therefore
    /// `rhs = Δ·δ`, giving `rhs·δ = Δ·‖δ‖²`. Substituting:
    ///
    ///     ρ = 1 + Δ·‖δ‖² / (Δ·‖δ‖²) = 2  EXACTLY.
    ///
    /// This is independent of `Δ`, of the data size, and of `‖δ‖` — it
    /// is a structural consequence of "SOLVE/APPLY add a stabilizing
    /// shift that TRIAL OBJECTIVE doesn't see" combined with "Newton
    /// step lies in the null space of the true Hessian".
    ///
    /// # Test
    ///
    /// We construct a 3D synthetic case with H_NLL indefinite (one
    /// negative eigenvalue, mimicking the entry-survival concave term),
    /// `S = 0`, and `joint_mode_diagonal_ridge = 0` (i.e. the policy
    /// does NOT include the ridge in the objective). The stabilizing
    /// shift lifts the negative eigenvalue to the SPD floor; the Newton
    /// step lies in the formerly-near-null direction; predicted and
    /// actual are computed by the exact same routines the inner solver
    /// uses; ρ comes out to exactly 2.0 to floating-point precision.
    #[test]
    fn ridge_stabilization_gap_produces_exact_rho_two_in_null_direction() {
        // Synthetic 3D joint Hessian with the structure of the
        // saturated-probit failure case at large scale:
        //   - dim 0: indefinite contribution (eigenvalue −1) from the
        //     concave entry-survival term `+w·log Φ(−η₀)`. This triggers
        //     the SPD stabilizer in the solver.
        //   - dim 1: positive contribution (+1) from a non-saturated
        //     coefficient direction.
        //   - dim 2: ZERO from the marginal-block Hessian cancellation
        //     proven separately in `marginal_block_hessian_cancels_in_saturated_regime`.
        //     This is the saturating direction that sits in null(H_true).
        let h_nll = array![[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]];
        let source = JointHessianSource::Dense(h_nll.clone());
        let ranges = vec![(0, 3)];
        // Smoothing penalty `S` is zero in the saturating direction
        // (dim 2) — mirrors the duchon-smooth polynomial null space
        // containing constants/linears.
        let s_lambdas = vec![Array2::<f64>::zeros((3, 3))];

        // Stabilized solver ridge: should add ~1.0 to lift the
        // -1 eigenvalue to the SPD floor (~ridge_floor).
        let base = JOINT_TRACE_STABILITY_RIDGE;
        let ridge_floor = 1.0e-12_f64;
        let joint_mode_diagonal_ridge = 0.0_f64; // policy: ridge NOT in objective
        // `stabilized_joint_solver_diagonal_ridge` consults the family only
        // for `use_exact_newton_strict_spd`, which defaults to false; we
        // simulate that branch by computing the shift directly via
        // `exact_newton_stabilizing_shift`.
        let mut lhs = h_nll.clone();
        add_joint_penalty_to_matrix(&mut lhs, &ranges, &s_lambdas, base, None);
        let shift = exact_newton_stabilizing_shift(&lhs, ridge_floor)
            .expect("indefinite Hessian must yield a positive stabilizing shift");
        assert!(
            shift > 0.9,
            "shift should lift the -1 eigenvalue; got {shift}"
        );
        let joint_solver_diagonal_ridge = base + shift;
        let big_delta = joint_solver_diagonal_ridge - joint_mode_diagonal_ridge;

        // True Hessian (what TRIAL OBJECTIVE sees):
        //   H_true = H_NLL + S + joint_mode_diagonal_ridge·I
        //          = diag(-1, 1, 0)
        //   ⇒ dim 2 is a null direction of H_true.
        // Used Hessian (what SOLVE / APPLY uses):
        //   H_used = H_NLL + S + joint_solver_diagonal_ridge·I
        //          = diag(-1+Δ, 1+Δ, Δ)   where Δ ≈ 1.0
        //   ⇒ dim 2 has curvature Δ (purely from the stabilizing shift,
        //     which fires because dim 0 is negative).
        // rhs aimed entirely in dim 2 puts the Newton step in null(H_true).
        let rhs = array![0.0_f64, 0.0, 1.0];
        let h_used_22 = 0.0 + joint_solver_diagonal_ridge;
        let delta = array![0.0, 0.0, rhs[2] / h_used_22];

        // Compute hpen_delta via the SAME helper the inner solver uses.
        let mut hpen_delta = Array1::<f64>::zeros(3);
        apply_joint_penalized_hessian_into(
            &source,
            &ranges,
            &s_lambdas,
            joint_solver_diagonal_ridge,
            &delta,
            &mut hpen_delta,
            None,
        )
        .expect("apply joint penalized hessian must succeed");

        // Predicted = the exact formula the inner solver uses.
        let predicted = joint_quadratic_predicted_reduction(&rhs, &hpen_delta, &delta);

        // Actual (true) reduction: f(β=0) − f(β+δ) for the true objective
        //   f(β) = ½·βᵀ·H_NLL·β + ½·βᵀ·S·β + ½·joint_mode_diagonal_ridge·‖β‖² + bᵀ·β
        // taking β_start = 0 and using the Newton identity for the truth:
        //   actual = rhs·δ − ½·δᵀ·H_true·δ
        // where H_true = H_NLL + S + joint_mode_diagonal_ridge·I.
        let mut h_true_delta = Array1::<f64>::zeros(3);
        apply_joint_penalized_hessian_into(
            &source,
            &ranges,
            &s_lambdas,
            joint_mode_diagonal_ridge,
            &delta,
            &mut h_true_delta,
            None,
        )
        .expect("apply true (un-stabilized) hessian must succeed");
        let actual = rhs.dot(&delta) - 0.5 * delta.dot(&h_true_delta);

        let rho = actual / predicted;
        eprintln!(
            "[rho-2 proof] Δ = {big_delta:.6e}, rhs·δ = {rd:.6e}, Δ·‖δ‖² = {dn:.6e}, predicted = {predicted:.6e}, actual = {actual:.6e}, ρ = {rho:.10}",
            rd = rhs.dot(&delta),
            dn = big_delta * delta.dot(&delta),
        );

        // ρ must be EXACTLY 2 to floating-point precision (not just "close to 2").
        // This is the structural fingerprint of the SOLVE/APPLY-vs-OBJECTIVE
        // ridge-stabilization gap in the saturated regime.
        assert!(
            (rho - 2.0).abs() <= 1e-10,
            "ρ should be EXACTLY 2 when Newton step lies in null(H_true) with stabilizing-shift gap; got {rho}",
        );

        // Sanity: the identity rhs·δ = Δ·‖δ‖² must hold (this is the
        // mathematical core of why ρ = 2 specifically and not 1.5 or 3).
        let rhs_dot_delta = rhs.dot(&delta);
        let delta_sq_times_big_delta = big_delta * delta.dot(&delta);
        assert!(
            (rhs_dot_delta - delta_sq_times_big_delta).abs() <= 1e-10 * rhs_dot_delta.abs(),
            "Newton-identity null-space condition: rhs·δ ({rhs_dot_delta}) should equal Δ·‖δ‖² ({delta_sq_times_big_delta})",
        );

        // And ρ = 2 holds AT ALL MAGNITUDES of δ — verify by scaling rhs:
        for scale in [0.001_f64, 0.029, 1.0, 988.0] {
            let scaled_rhs = &rhs * scale;
            let scaled_delta = &delta * scale;
            let mut scaled_hpen = Array1::<f64>::zeros(3);
            apply_joint_penalized_hessian_into(
                &source,
                &ranges,
                &s_lambdas,
                joint_solver_diagonal_ridge,
                &scaled_delta,
                &mut scaled_hpen,
                None,
            )
            .expect("apply scaled");
            let scaled_predicted =
                joint_quadratic_predicted_reduction(&scaled_rhs, &scaled_hpen, &scaled_delta);
            let mut scaled_h_true_delta = Array1::<f64>::zeros(3);
            apply_joint_penalized_hessian_into(
                &source,
                &ranges,
                &s_lambdas,
                joint_mode_diagonal_ridge,
                &scaled_delta,
                &mut scaled_h_true_delta,
                None,
            )
            .expect("apply scaled true");
            let scaled_actual =
                scaled_rhs.dot(&scaled_delta) - 0.5 * scaled_delta.dot(&scaled_h_true_delta);
            let scaled_rho = scaled_actual / scaled_predicted;
            assert!(
                (scaled_rho - 2.0).abs() <= 1e-10,
                "ρ invariance under step rescaling broke at scale {scale}: got {scaled_rho}",
            );
        }
    }

    #[test]
    fn joint_solver_ridge_stabilizes_dense_indefinite_coupled_hessian() {
        // The dense coupled Hessian [[1, 2], [2, 1]] has eigenvalues -1 and 3.
        // With all-zero penalties the stabilized solver ridge therefore has to
        // exceed 1, and the ridged matrix has to be positive definite.
        let ranges = vec![(0, 1), (1, 2)];
        let s_lambdas = vec![Array2::zeros((1, 1)), Array2::zeros((1, 1))];
        let source = JointHessianSource::Dense(array![[1.0, 2.0], [2.0, 1.0]]);
        let ridge = stabilized_joint_solver_diagonal_ridge(
            &TwoBlockJointConstrainedFamily { coupling: 2.0 },
            &source,
            &ranges,
            &s_lambdas,
            JOINT_TRACE_STABILITY_RIDGE,
            1e-12,
            None,
        );
        assert!(
            ridge > 1.0,
            "dense joint solver ridge should lift the negative eigenvalue; got {ridge}"
        );

        // Take ownership of the dense matrix and apply penalties + ridge to it.
        let JointHessianSource::Dense(mut stabilized) = source else {
            panic!("dense joint solver fixture must use a dense Hessian source")
        };
        add_joint_penalty_to_matrix(&mut stabilized, &ranges, &s_lambdas, ridge, None);

        // Closed-form smallest eigenvalue of a symmetric 2x2 matrix
        // [[a, b], [b, d]]:  ½·(a + d − √((a − d)² + 4b²)).
        let a = stabilized[[0, 0]];
        let b = stabilized[[0, 1]];
        let d = stabilized[[1, 1]];
        let discriminant = (a - d).powi(2) + 4.0 * b.powi(2);
        let min_eval = 0.5 * (a + d - discriminant.sqrt());
        assert!(
            min_eval > 0.0,
            "stabilized dense joint Hessian should be SPD; min_eval={min_eval}"
        );
    }

    #[test]
    fn outergradient_uses_joint_surrogate_formultiblock_diagonal_family() {
        // The two blocks differ only by name: a 2-row all-ones single-column
        // design, one identity penalty, zero offset and zero starting values.
        let unit_block = |label: &str| ParameterBlockSpec {
            name: label.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
                [1.0],
                [1.0]
            ])),
            offset: array![0.0, 0.0],
            penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let specs = [unit_block("block0"), unit_block("block1")];

        let rho = array![0.0, 0.0];
        let penalty_counts = vec![1usize, 1usize];
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            ridge_floor: 1e-10,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };

        // Only success is checked here; the returned values are not inspected.
        let result = outerobjective_andgradient(
            &TwoBlockJointSurrogateFamily,
            &specs,
            &options,
            &penalty_counts,
            &rho,
            None,
        );
        assert!(
            result.is_ok(),
            "default joint multi-block surrogate path should succeed without blockwise dW callbacks: {:?}",
            result.err()
        );
    }

    #[test]
    fn exact_newton_pseudo_laplace_objective_uses_logdet_h_without_logdet_s() {
        // Single unpenalized coefficient on a 1x1 design.
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };
        let spec = ParameterBlockSpec {
            name: "pseudo_laplace".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let family = OneBlockPseudoLaplaceExactFamily { target: 1.5 };
        let fit = fit_custom_family(&family, &[spec], &options)
            .expect("pseudo-laplace exact-newton fit");

        // The fitted objective must equal ½·ln 2.
        // NOTE(review): presumably this is ½·log|H| for the family's Hessian
        // with no penalty log-determinant term — the family is defined outside
        // this view, confirm there.
        let expected = 0.5 * 2.0_f64.ln();
        let gap = (fit.penalized_objective - expected).abs();
        assert!(
            gap < 1e-8,
            "pseudo-Laplace objective mismatch: got {}, expected {}",
            fit.penalized_objective,
            expected
        );
    }

    #[test]
    fn exact_newton_joint_psi_hook_can_supply_fixed_beta_termswithout_quadratic_spsi() {
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };
        // One unpenalized coefficient, so there are no rho coordinates at all.
        let spec = ParameterBlockSpec {
            name: "psi_hook".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        // A single psi coordinate whose explicit design/penalty derivatives are
        // all zero or absent; the expected gradient below is nonetheless 3.5.
        let psi_derivative = CustomFamilyBlockPsiDerivative {
            penalty_index: None,
            x_psi: Array2::zeros((1, 1)),
            s_psi: Array2::zeros((1, 1)),
            s_psi_components: None,
            s_psi_penalty_components: None,
            x_psi_psi: None,
            s_psi_psi: None,
            s_psi_psi_components: None,
            s_psi_psi_penalty_components: None,
            implicit_operator: None,
            implicit_axis: 0,
            implicit_group_id: None,
        };
        let empty_rho = Array1::zeros(0);
        let result = evaluate_custom_family_joint_hyper(
            &OneBlockExactPsiHookFamily,
            &[spec],
            &options,
            &empty_rho,
            &[vec![psi_derivative]],
            None,
            EvalMode::ValueAndGradient,
        )
        .expect("joint hyper eval with exact joint psi hook");

        assert_eq!(result.gradient.len(), 1);
        let psi_gradient = result.gradient[0];
        assert!(
            (psi_gradient - 3.5).abs() < 1e-12,
            "expected family-supplied joint psi term, got {}",
            result.gradient[0]
        );
    }

    #[test]
    fn pseudo_laplace_exact_newton_rejects_indefinite_hessian() {
        // #748. A 1×1 joint coefficient Hessian H = -1 is indefinite, which
        // signals a genuine defect: mis-signed / non-convex curvature, or a β
        // that is not at the inner block optimum. The strict pseudo-Laplace
        // REML logdet has to REJECT such a ρ-trial rather than hide it.
        //
        // Previously the path evaluated `log|H + δI|` with δ escalated to 10
        // (H + δI = [[9]], logdet = log 9) and let the fit "succeed", while
        // the analytic REML gradient still used `tr((H+S_λ)⁻¹·)` on the
        // un-ridged H — value and gradient belonged to different objectives.
        // Rejection is the honest outcome: the outer optimizer backs off
        // instead of optimizing a biased, δ-shifted surface. So this fit must
        // now return an error where it used to return a masked result.
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };
        let spec = ParameterBlockSpec {
            name: "indefinite".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };

        let err = fit_custom_family(&OneBlockIndefinitePseudoLaplaceFamily, &[spec], &options)
            .expect_err(
                "strict pseudo-Laplace must reject the indefinite Hessian H=[[-1]], not δ-ridge mask it",
            )
            .to_string();

        // Either wording is accepted as naming the indefiniteness.
        let names_indefiniteness = err.contains("indefinite") || err.contains("below -tol");
        assert!(
            names_indefiniteness,
            "rejection error should name the indefiniteness; got: {err}",
        );
    }

    #[test]
    fn auto_determinant_mode_is_exact_full_logdet_policy() {
        // Evaluate the same matrix under both policies; the two
        // log-determinants must agree to 1e-12.
        let h = array![[6.0, 0.8, 0.1], [0.8, 4.5, 0.4], [0.1, 0.4, 3.2]];
        let logdet_under = |policy: RidgePolicy| stable_logdet_with_ridge_policy(&h, 1e-8, policy);

        let exact = logdet_under(RidgePolicy::explicit_stabilization_full_exact())
            .expect("exact logdet");
        let auto = logdet_under(RidgePolicy::explicit_stabilization_full()).expect("auto logdet");

        let gap = (auto - exact).abs();
        assert!(gap < 1e-12, "auto={auto}, exact={exact}");
    }

    #[test]
    fn indefinite_hessian_uses_smooth_regularized_logdet() {
        // H = diag(-1, 2) is indefinite.
        //
        // Former behaviour: the -1 direction was silently dropped from the
        // logdet with a warning, and repeated occurrences escalated to an EFS
        // abort (first-order fallback marker).
        //
        // Current behaviour: each eigenvalue enters through the smooth
        // regularizer r_ε(σ) = ½(σ + √(σ² + 4ε²)). Nothing is dropped, nothing
        // escalates, and the value agrees with what the downstream
        // `DenseSpectralOperator` gradient differentiates — removing the
        // cost/gradient mismatch that broke BFGS line search.
        let ridge = 1e-12_f64;
        let eigenvalues = [-1.0_f64, 2.0];
        let h = array![[-1.0, 0.0], [0.0, 2.0]];

        let logdet = stable_logdet_with_ridge_policy(
            &h,
            ridge,
            RidgePolicy::explicit_stabilization_pospart(),
        )
        .expect("smooth-regularized logdet must be finite for indefinite H");
        assert!(
            logdet.is_finite(),
            "smooth-regularized logdet should be finite, got {logdet}"
        );

        // Reference: apply the same formula to the eigenvalues of H + ridge·I.
        // spectral_epsilon floors at √(eps_mach) ≈ 1.5e-8 for p=2, so ε ≫ ridge
        // and the ridge is absorbed into ε; the target is Σ log r_ε(σ_j).
        let eps = spectral_epsilon(&eigenvalues).max(ridge.max(1e-14));
        // The 1e-12 shift of each eigenvalue is negligible relative to ε.
        let expected: f64 = eigenvalues
            .iter()
            .map(|&sigma| spectral_regularize(sigma + ridge, eps).ln())
            .sum();
        assert!(
            (logdet - expected).abs() < 1e-10,
            "logdet={logdet}, expected={expected}"
        );
    }

    #[test]
    fn pseudo_laplace_exact_newton_symmetrizes_nearly_symmetrichessian() {
        // Two unpenalized coefficients observed through an identity design.
        let identity_design = array![[1.0, 0.0], [0.0, 1.0]];
        let spec = ParameterBlockSpec {
            name: "nearly_symmetric".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(identity_design)),
            offset: array![0.0, 0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0, 0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };

        // The fit has to succeed and produce a finite objective.
        let fit = fit_custom_family(
            &OneBlockNearlySymmetricPseudoLaplaceFamily,
            &[spec],
            &options,
        )
        .expect("nearly symmetric pseudo-laplace Hessian should be accepted after symmetrization");
        let objective_is_finite = fit.penalized_objective.is_finite();
        assert!(
            objective_is_finite,
            "expected finite pseudo-laplace objective, got {}",
            fit.penalized_objective
        );
    }

    #[test]
    fn outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active() {
        crate::solver::visualizer::init_logging();
        let fixture = binomial_location_scale_wiggle_outer_fixture();
        let BinomialLocationScaleWiggleOuterFixture {
            family,
            specs,
            penalty_counts,
            rho,
            options: base_options,
        } = fixture;
        // The `EPS·|cost|/h` noise floor used further down is only meaningful
        // when PIRLS converges to f64 precision. With HardPseudo and
        // σ_min~1e-10 the default 1e-6 inner residual is amplified into ~1e-7
        // of cost slack, which pushes both estimators above the
        // machine-precision floor — hence the tightened inner solve.
        let options = BlockwiseFitOptions {
            inner_tol: 1e-12,
            inner_max_cycles: 500,
            ..base_options
        };

        let (f0, g0, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho, None)
                .expect("objective/gradient");
        assert!(f0.is_finite());
        assert_eq!(g0.len(), rho.len());

        // Noise floor for the FD-vs-analytic comparison.
        //
        // At a rank-deficient optimum (σ_min(H) ≲ ε_machine) the outer REML
        // gradient is a DIFFERENCE of two nearly equal O(1) terms,
        // ½ λ_k (H⁺[k,k] − S⁺[k,k]), so the true gradient is close to zero.
        // The estimator `(f_p − f_m)/(2h)` then mostly measures round-off in
        // the cost: each cost value is uncertain by ~EPS · |cost| and the
        // symmetric difference scales that by 1/(2h), giving a floor of about
        // `EPS · |cost| / h` on |gfd|. Below it, `|gfd|`, `|g0|` and
        // `sign(gfd)` describe arithmetic noise, not the derivative.
        //
        // Here `|cost| ~ 6` and `h = 1e-5`, so the floor is ~1.3e-10
        // (≈ f64::EPSILON · 6 / 1e-5). It is rounded up to a value derived
        // from the problem scale, and a coordinate where BOTH |g0| and |gfd|
        // sit below the floor counts as a pass: the assertion is about the
        // TRUE derivative, and one smaller than the noise is indistinguishable
        // from zero, so its sign is not a correctness property.
        let h = 1e-5;
        let cost_magnitude = f0.abs().max(1.0);
        let noise_floor = (10.0 * f64::EPSILON * cost_magnitude / h).max(1e-9);

        for k in 0..rho.len() {
            let mut rho_plus = rho.clone();
            rho_plus[k] += h;
            let (f_plus, _, _) = outerobjective_andgradient(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_plus,
                None,
            )
            .expect("objective+");

            let mut rho_minus = rho.clone();
            rho_minus[k] -= h;
            let (f_minus, _, _) = outerobjective_andgradient(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_minus,
                None,
            )
            .expect("objective-");

            let analytic = g0[k];
            let gfd = (f_plus - f_minus) / (2.0 * h);

            // Both estimates are inside the noise band: nothing to compare.
            if analytic.abs() < noise_floor && gfd.abs() < noise_floor {
                continue;
            }

            assert_eq!(
                analytic.signum(),
                gfd.signum(),
                "outer LAML gradient sign mismatch at {}: analytic={} fd={} noise_floor={:.3e}",
                k,
                analytic,
                gfd,
                noise_floor,
            );
            let rel = (analytic - gfd).abs() / gfd.abs().max(noise_floor);
            assert!(
                rel < 2e-2,
                "outer LAML gradient mismatch at {}: analytic={} fd={} rel={} noise_floor={:.3e}",
                k,
                analytic,
                gfd,
                rel,
                noise_floor,
            );
        }
    }

    #[test]
    fn rho_only_outer_objective_matches_joint_hyper_when_psi_is_empty() {
        let fixture = binomial_location_scale_wiggle_outer_fixture();
        let BinomialLocationScaleWiggleOuterFixture {
            family,
            specs,
            penalty_counts,
            rho,
            options,
        } = fixture;

        // Path 1: the rho-only outer evaluation with value, gradient and Hessian.
        let (rho_only_obj, rho_only_grad, rho_only_hessian, _) =
            super::test_support::outerobjectivegradienthessian(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho,
                None,
                EvalMode::ValueGradientHessian,
            )
            .expect("rho-only outer objective");

        // Path 2: the joint hyper evaluation, given one empty psi-derivative
        // list per block.
        let derivative_blocks = vec![Vec::<CustomFamilyBlockPsiDerivative>::new(); specs.len()];
        let joint_result = evaluate_custom_family_joint_hyper(
            &family,
            &specs,
            &options,
            &rho,
            &derivative_blocks,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("joint hyper objective with empty psi");

        // Objective values must coincide.
        let objective_gap = (rho_only_obj - joint_result.objective).abs();
        assert!(
            objective_gap < 1e-12,
            "objective mismatch: rho-only={} joint={}",
            rho_only_obj,
            joint_result.objective
        );

        // Gradients must coincide entry by entry.
        assert_eq!(rho_only_grad.len(), joint_result.gradient.len());
        let max_grad_diff = rho_only_grad
            .iter()
            .zip(joint_result.gradient.iter())
            .map(|(a, b)| (a - b).abs())
            .fold(0.0_f64, |worst, gap| worst.max(gap));
        assert!(
            max_grad_diff < 1e-12,
            "gradient mismatch: max diff={}",
            max_grad_diff
        );

        // Hessians must coincide entry by entry once the joint one is densified.
        let rho_only_hessian = rho_only_hessian.expect("rho-only outer Hessian");
        let joint_hessian = joint_result
            .outer_hessian
            .materialize_dense()
            .expect("joint outer Hessian should materialize")
            .expect("joint outer Hessian");
        assert_eq!(rho_only_hessian.dim(), joint_hessian.dim());
        let max_hessian_diff = rho_only_hessian
            .iter()
            .zip(joint_hessian.iter())
            .map(|(a, b)| (a - b).abs())
            .fold(0.0_f64, |worst, gap| worst.max(gap));
        assert!(
            max_hessian_diff < 1e-12,
            "outer Hessian mismatch: max diff={}",
            max_hessian_diff
        );
    }

    /// Common setup for the probit binomial location-scale outer-derivative
    /// tests (`outer_laml*_binomial_location_scale_*`).
    ///
    /// Builds two intercept-only blocks, `threshold` and `log_sigma`, each with
    /// an `n × 1` all-ones design, a single 1×1 identity penalty and the given
    /// starting coefficient, where `n = y.len()`. Observation weights are all 1.
    ///
    /// Returns `(family, specs, penalty_counts, options)`; the callers differ
    /// only in `y` and in the two initial betas.
    fn binomial_location_scale_outer_fixture(
        y: Array1<f64>,
        threshold_initial_beta: f64,
        log_sigma_initial_beta: f64,
    ) -> (
        BinomialLocationScaleFamily,
        Vec<ParameterBlockSpec>,
        Vec<usize>,
        BlockwiseFitOptions,
    ) {
        let n = y.len();

        // The two blocks are identical apart from their name and start value.
        let intercept_block = |label: &str, start: f64| ParameterBlockSpec {
            name: label.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::from_elem(
                (n, 1),
                1.0,
            ))),
            offset: Array1::zeros(n),
            penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![start]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let thresholdspec = intercept_block("threshold", threshold_initial_beta);
        let log_sigmaspec = intercept_block("log_sigma", log_sigma_initial_beta);

        // The family carries its own copies of the two block designs.
        let family = BinomialLocationScaleFamily {
            y,
            weights: Array1::from_elem(n, 1.0),
            link_kind: crate::types::InverseLink::Standard(crate::types::StandardLink::Probit),
            threshold_design: Some(thresholdspec.design.clone()),
            log_sigma_design: Some(log_sigmaspec.design.clone()),
            policy: crate::resource::ResourcePolicy::default_library(),
        };

        let options = BlockwiseFitOptions {
            use_remlobjective: true,
            ridge_floor: 1e-10,
            outer_max_iter: 1,
            ..BlockwiseFitOptions::default()
        };
        (
            family,
            vec![thresholdspec, log_sigmaspec],
            vec![1usize, 1usize],
            options,
        )
    }

    #[test]
    fn outer_lamlgradient_diagonal_binomial_location_scale_matchesfd() {
        // Balanced alternating responses, zero starting coefficients, rho = 0.
        let y = array![0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0];
        let rho = array![0.0, 0.0];
        let (family, specs, penalty_counts, options) =
            binomial_location_scale_outer_fixture(y, 0.0, 0.0);

        let (f0, g0, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho, None)
                .expect("objective/gradient");
        assert!(f0.is_finite());
        assert_eq!(g0.len(), rho.len());

        // Central finite difference per rho coordinate. A coordinate passes if
        // it agrees with the analytic value within `tol` absolutely or
        // relatively; the sign is only compared when the absolute gap is
        // at least `tol`.
        let h = 1e-5;
        let tol = 2e-3;
        for k in 0..rho.len() {
            let mut rho_plus = rho.clone();
            rho_plus[k] += h;
            let (f_plus, _, _) = outerobjective_andgradient(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_plus,
                None,
            )
            .expect("objective+");

            let mut rho_minus = rho.clone();
            rho_minus[k] -= h;
            let (f_minus, _, _) = outerobjective_andgradient(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_minus,
                None,
            )
            .expect("objective-");

            let analytic = g0[k];
            let gfd = (f_plus - f_minus) / (2.0 * h);
            let abs_err = (analytic - gfd).abs();
            let rel_err = abs_err / gfd.abs().max(1e-8);
            if abs_err >= tol {
                assert_eq!(
                    analytic.signum(),
                    gfd.signum(),
                    "outer diagonal LAML gradient sign mismatch at {}: analytic={} fd={}",
                    k,
                    analytic,
                    gfd
                );
            }
            assert!(
                abs_err < tol || rel_err < tol,
                "outer diagonal LAML gradient mismatch at {}: analytic={} fd={} abs={} rel={}",
                k,
                analytic,
                gfd,
                abs_err,
                rel_err
            );
        }
    }

    #[test]
    fn outer_lamlgradient_diagonal_binomial_location_scale_hard_case_matchesfd() {
        // Nine unbalanced responses, non-zero starting coefficients and a
        // non-zero rho.
        let y = array![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0];
        let rho = array![0.15, -0.25];
        let (family, specs, penalty_counts, options) =
            binomial_location_scale_outer_fixture(y, 0.2, -0.1);

        let (f0, g0, _) =
            outerobjective_andgradient(&family, &specs, &options, &penalty_counts, &rho, None)
                .expect("objective/gradient");
        assert!(f0.is_finite());
        assert_eq!(g0.len(), rho.len());

        // Central finite difference per rho coordinate. A coordinate passes if
        // it agrees with the analytic value within `tol` absolutely or
        // relatively; the sign is only compared when the absolute gap is
        // at least `tol`.
        let h = 1e-5;
        let tol = 2e-3;
        for k in 0..rho.len() {
            let mut rho_plus = rho.clone();
            rho_plus[k] += h;
            let (f_plus, _, _) = outerobjective_andgradient(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_plus,
                None,
            )
            .expect("objective+");

            let mut rho_minus = rho.clone();
            rho_minus[k] -= h;
            let (f_minus, _, _) = outerobjective_andgradient(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_minus,
                None,
            )
            .expect("objective-");

            let analytic = g0[k];
            let gfd = (f_plus - f_minus) / (2.0 * h);
            let abs_err = (analytic - gfd).abs();
            let rel_err = abs_err / gfd.abs().max(1e-8);
            if abs_err >= tol {
                assert_eq!(
                    analytic.signum(),
                    gfd.signum(),
                    "outer diagonal hard-case LAML gradient sign mismatch at {}: analytic={} fd={}",
                    k,
                    analytic,
                    gfd
                );
            }
            assert!(
                abs_err < tol || rel_err < tol,
                "outer diagonal hard-case LAML gradient mismatch at {}: analytic={} fd={} abs={} rel={}",
                k,
                analytic,
                gfd,
                abs_err,
                rel_err
            );
        }
    }

    #[test]
    fn outer_lamlhessian_joint_exact_binomial_location_scale_matchesfd() {
        // y is deliberately asymmetric (6 ones / 4 zeros). With a balanced 5/5
        // vector, probit-link symmetry pins β̂_threshold at 0 and the joint
        // observed Hessian becomes block-diagonal in (threshold, log_sigma) at
        // the inner mode. The off-diagonal outer LAML Hessian entries are then
        // ~1e-11, under the central-FD noise floor (≈ pirls_tol / h) for
        // h=1e-5, so FD-vs-analytic agreement could not be enforced. An
        // asymmetric y yields β̂_threshold ≠ 0, which couples the (β_0, β_1)
        // blocks through the observed-information weights and makes all four
        // entries checkable.
        let y = array![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0];
        let rho = array![0.1, -0.2];
        let (family, specs, penalty_counts, options) =
            binomial_location_scale_outer_fixture(y, 0.15, -0.05);

        let (_, _, analytic_hessian, _) = super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            &rho,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("objective/gradient/hessian");
        let h0 = analytic_hessian.expect("analytic outer Hessian should be available");
        assert_eq!(h0.nrows(), rho.len());
        assert_eq!(h0.ncols(), rho.len());

        // Column l of the FD Hessian is the central difference of the analytic
        // gradient along rho[l].
        let h = 1e-5;
        for l in 0..rho.len() {
            let mut rho_plus = rho.clone();
            rho_plus[l] += h;
            let (_, grad_plus, _, _) = super::test_support::outerobjectivegradienthessian(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_plus,
                None,
                EvalMode::ValueAndGradient,
            )
            .expect("objective/gradient +");

            let mut rho_minus = rho.clone();
            rho_minus[l] -= h;
            let (_, grad_minus, _, _) = super::test_support::outerobjectivegradienthessian(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho_minus,
                None,
                EvalMode::ValueAndGradient,
            )
            .expect("objective/gradient -");

            for k in 0..rho.len() {
                let analytic = h0[[k, l]];
                let hfd = (grad_plus[k] - grad_minus[k]) / (2.0 * h);
                let abs_err = (analytic - hfd).abs();
                let rel = abs_err / hfd.abs().max(1e-7);
                // Signs are compared unless both entries are essentially zero.
                if analytic.abs().max(hfd.abs()) > 1e-10 {
                    assert_eq!(
                        analytic.signum(),
                        hfd.signum(),
                        "outer Hessian sign mismatch at ({k},{l}): analytic={} fd={}",
                        analytic,
                        hfd
                    );
                }
                assert!(
                    abs_err < 1e-8 || rel < 2e-2,
                    "outer Hessian mismatch at ({k},{l}): analytic={} fd={} abs={} rel={}",
                    analytic,
                    hfd,
                    abs_err,
                    rel
                );
            }
        }

        // The analytic Hessian itself must be symmetric (strict lower triangle
        // compared against its mirror entry).
        let lower_triangle = (0..h0.nrows()).flat_map(|i| (0..i).map(move |j| (i, j)));
        for (i, j) in lower_triangle {
            let asym = (h0[[i, j]] - h0[[j, i]]).abs();
            assert!(
                asym < 1e-8,
                "outer Hessian not symmetric at ({i},{j}): {asym}"
            );
        }
    }

    /// The block-weighted solve must produce the same coefficients whether the
    /// design matrix is supplied in dense or in sparse storage.
    #[test]
    fn block_solve_sparse_matches_dense() {
        let x_dense = array![
            [1.0, 0.0, 2.0],
            [0.0, 3.0, 0.0],
            [4.0, 0.0, 5.0],
            [0.0, 6.0, 0.0]
        ];
        let y_star = array![1.0, -1.0, 0.5, 2.0];
        let w = array![1.0, 0.5, 2.0, 1.5];
        let s_lambda = Array2::<f64>::eye(3) * 0.1;

        // Build the sparse copy from the structural non-zeros of the dense design.
        let triplets: Vec<_> = x_dense
            .indexed_iter()
            .filter_map(|((row, col), &value)| {
                (value != 0.0).then(|| Triplet::new(row, col, value))
            })
            .collect();
        let x_sparse = SparseColMat::try_new_from_triplets(4, 3, &triplets)
            .expect("sparse matrix build should succeed");

        // Both storage formats go through the identical solve configuration.
        let solve = |design: DesignMatrix, failure: &str| {
            solve_blockweighted_system(
                &design,
                &y_star,
                &w,
                &s_lambda,
                1e-12,
                RidgePolicy::explicit_stabilization_pospart(),
            )
            .expect(failure)
        };

        let beta_dense = solve(
            DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(x_dense.clone())),
            "dense solve should succeed",
        );
        let beta_sparse = solve(DesignMatrix::from(x_sparse), "sparse solve should succeed");

        for (j, &dense_coef) in beta_dense.iter().enumerate() {
            let sparse_coef = beta_sparse[j];
            let gap = (dense_coef - sparse_coef).abs();
            assert!(
                gap < 1e-10,
                "dense/sparse mismatch at {}: {} vs {}",
                j,
                dense_coef,
                sparse_coef
            );
        }
    }

    /// Compares the analytic outer Hessian against central finite differences
    /// of the outer gradient on the binomial location-scale fixture, checking
    /// both the sign and the magnitude of every entry.
    #[test]
    fn outer_lamlhessian_joint_exact_binomial_location_scale_hard_case_matchesfd() {
        let response = Array1::from_vec(vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]);
        let (family, specs, penalty_counts, options) =
            binomial_location_scale_outer_fixture(response, 0.2, -0.1);
        let rho = array![0.15, -0.25];
        let dim = rho.len();

        let (_, _, analytic_opt, _) = super::test_support::outerobjectivegradienthessian(
            &family,
            &specs,
            &options,
            &penalty_counts,
            &rho,
            None,
            EvalMode::ValueGradientHessian,
        )
        .expect("objective/gradient/hessian");
        let analytic = analytic_opt.expect("analytic outer Hessian should be available");
        assert_eq!(analytic.nrows(), dim);
        assert_eq!(analytic.ncols(), dim);

        // Gradient-only evaluation at a perturbed smoothing-parameter vector.
        let gradient_at = |rho_eval: &Array1<f64>, failure: &str| {
            let (_, gradient, _, _) = super::test_support::outerobjectivegradienthessian(
                &family,
                &specs,
                &options,
                &penalty_counts,
                rho_eval,
                None,
                EvalMode::ValueAndGradient,
            )
            .expect(failure);
            gradient
        };

        let step = 1e-5;
        for l in 0..dim {
            let mut rho_plus = rho.clone();
            let mut rho_minus = rho.clone();
            rho_plus[l] += step;
            rho_minus[l] -= step;
            let grad_plus = gradient_at(&rho_plus, "objective/gradient +");
            let grad_minus = gradient_at(&rho_minus, "objective/gradient -");

            for k in 0..dim {
                let exact = analytic[[k, l]];
                let hfd = (grad_plus[k] - grad_minus[k]) / (2.0 * step);
                let abs_err = (exact - hfd).abs();
                let rel = abs_err / hfd.abs().max(1e-7);

                // Signs are only compared when at least one value is clearly non-zero.
                let sign_is_meaningful = exact.abs().max(hfd.abs()) > 1e-10;
                if sign_is_meaningful {
                    assert_eq!(
                        exact.signum(),
                        hfd.signum(),
                        "hard-case outer Hessian sign mismatch at ({k},{l}): analytic={} fd={}",
                        exact,
                        hfd
                    );
                }

                let within_tolerance = abs_err < 1e-8 || rel < 2e-2;
                assert!(
                    within_tolerance,
                    "hard-case outer Hessian mismatch at ({k},{l}): analytic={} fd={} abs={} rel={}",
                    exact,
                    hfd,
                    abs_err,
                    rel
                );
            }
        }
    }

    /// Solves a block system whose penalty carries a tiny negative diagonal
    /// entry on a coefficient the design never touches, and checks that the
    /// solve still returns finite, sensible coefficients.
    #[test]
    fn block_solve_falls_backwhen_llt_rejects_indefinite_system() {
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 0.0]
        ]));
        let working_response = array![2.0, 0.0];
        let weights = array![1.0, 1.0];
        let penalty = array![[0.0, 0.0], [0.0, -1e-12]];

        let solution = solve_blockweighted_system(
            &design,
            &working_response,
            &weights,
            &penalty,
            1e-12,
            RidgePolicy::explicit_stabilization_pospart(),
        )
        .expect("fallback solve should succeed");

        let all_finite = solution.iter().all(|v| v.is_finite());
        assert!(all_finite);

        // The only informative observation pins the first coefficient at 2.
        let identified_gap = (solution[0] - 2.0).abs();
        assert!(identified_gap < 1e-10, "unexpected solved coefficient");

        let unidentified_size = solution[1].abs();
        assert!(
            unidentified_size < 1e-8,
            "null-space coefficient should stay near zero"
        );
    }

    /// Fits a one-coefficient exact-Newton family configured with `target: 0.0`
    /// and `lower: 1.0`, starting from 1.5, and expects the fitted coefficient
    /// to land on the lower bound.
    #[test]
    fn exact_newton_block_enforces_linear_constraints() {
        let family = OneBlockConstrainedExactFamily {
            target: 0.0,
            lower: 1.0,
        };
        let specs = [ParameterBlockSpec {
            name: "exact_block".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![1.5]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }];
        let options = BlockwiseFitOptions::default();

        let fit = fit_custom_family(&family, &specs, &options)
            .expect("constrained exact-newton fit");

        let beta = fit.block_states[0].beta[0];
        let distance_to_bound = (beta - 1.0).abs();
        assert!(
            distance_to_bound < 1e-8,
            "expected constrained optimum at lower bound, got {beta}"
        );
    }

    /// Axis-aligned rows `a_ij * x_j >= b_i` are expected to collapse into
    /// per-coefficient lower bounds `b_i / a_ij`. Coefficient 0 is bounded by
    /// rows 0 (0.25) and 2 (0.5); the assertions expect the larger bound and
    /// its row index to be reported.
    #[test]
    fn extract_simple_lower_bounds_accepts_axis_aligned_rows() {
        let a = array![[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]];
        let b = array![0.25, 1.0, 1.5];
        let constraints = LinearInequalityConstraints { a, b };

        let extracted = extract_simple_lower_bounds(&constraints, 2)
            .expect("lower-bound extraction should succeed");
        let bounds = extracted.expect("axis-aligned rows should map to lower bounds");

        for coeff in 0..2 {
            assert_relative_eq!(bounds.lower_bounds[coeff], 0.5, epsilon = 1e-12);
        }
        assert_eq!(bounds.coeff_to_row, vec![Some(2), Some(1)]);
    }

    /// A row that touches two coefficients at once is not a simple bound, so
    /// extraction must succeed but report `None`.
    #[test]
    fn extract_simple_lower_bounds_rejects_coupled_rows() {
        let a = array![[1.0, 1.0]];
        let b = array![0.0];
        let constraints = LinearInequalityConstraints { a, b };

        let extracted = extract_simple_lower_bounds(&constraints, 2)
            .expect("lower-bound extraction should not error on valid shapes");

        assert!(
            extracted.is_none(),
            "coupled rows must stay on the generic linear-constraint path"
        );
    }

    /// Drives a single constrained exact-Newton update with a negative (1x1)
    /// Hessian and expects the step to end on the bound `x >= 1` with that
    /// constraint reported as active.
    #[test]
    fn constrained_exact_newton_indefinite_hessian_uses_stabilized_delta_solve() {
        let spec = ParameterBlockSpec {
            name: "exact_block".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![1.5]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let states = vec![ParameterBlockState {
            beta: array![1.5],
            eta: array![1.5],
        }];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![1.0],
        };

        // Working quantities handed to the updater: gradient -1, Hessian -1.
        let gradient = array![-1.0];
        let hessian = SymmetricMatrix::Dense(array![[-1.0]]);
        let updater = ExactNewtonBlockUpdater {
            gradient: &gradient,
            hessian: &hessian,
        };

        let s_lambda = Array2::zeros((1, 1));
        let options = BlockwiseFitOptions::default();
        let context = BlockUpdateContext {
            family: &OneBlockConstrainedIndefiniteHessianFamily,
            states: &states,
            spec: &spec,
            block_idx: 0,
            s_lambda: &s_lambda,
            options: &options,
            linear_constraints: Some(&constraints),
            cached_active_set: None,
        };

        let update = updater
            .compute_update_step(&context)
            .expect("indefinite constrained exact-newton update should be stabilized");

        assert_relative_eq!(update.beta_new_raw[0], 1.0, epsilon = 1e-12);
        assert_eq!(update.active_set, Some(vec![0]));
    }

    /// Minimizes 0.5 * x^2 - rhs * x with rhs = 1 subject to 0 <= x <= 0.1,
    /// starting on the lower bound.
    ///
    /// At x = 0 the active-set KKT solve yields lambda_sys = +1 for the lower
    /// bound (lambda_true = -lambda_sys), so that bound has to be released and
    /// the iterate must travel to the upper bound.
    #[test]
    fn quadratic_linear_constraints_release_positive_kkt_systemmultiplier() {
        let hessian = array![[1.0]];
        let rhs = array![1.0];
        let beta_start = array![0.0];
        // Row 0 encodes x >= 0, row 1 encodes -x >= -0.1 (i.e. x <= 0.1).
        let constraints = LinearInequalityConstraints {
            a: array![[1.0], [-1.0]],
            b: array![0.0, -0.1],
        };

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            None,
        );
        let (beta, active) = solved.expect("constrained quadratic solve should succeed");

        let gap_to_upper = (beta[0] - 0.1).abs();
        assert!(
            gap_to_upper <= 1e-10,
            "expected constrained optimum at upper bound 0.1, got {}",
            beta[0]
        );
        assert_eq!(active.len(), 1);
    }

    /// A strictly inactive row whose normal is almost orthogonal to the step
    /// direction must not interfere: the solver is expected to return the
    /// unconstrained solution (1, 0) with an empty active set.
    #[test]
    fn quadratic_linear_constraints_ignore_near_tangential_inactiverows() {
        let hessian = array![[1.0, 0.0], [0.0, 1.0]];
        let rhs = array![1.0, 0.0];
        let beta_start = array![0.0, 0.0];
        // -1e-16 * x + y >= -1 holds with slack at the start and at (1, 0).
        let constraints = LinearInequalityConstraints {
            a: array![[-1e-16, 1.0]],
            b: array![-1.0],
        };

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            None,
        );
        let (beta, active) =
            solved.expect("near-tangential inactive row should not block the quadratic step");

        let x_gap = (beta[0] - 1.0).abs();
        assert!(
            x_gap <= 1e-12,
            "expected unconstrained x-solution of 1.0, got {}",
            beta[0]
        );
        let y_size = beta[1].abs();
        assert!(y_size <= 1e-12, "expected zero y-solution, got {}", beta[1]);
        assert!(active.is_empty(), "no row should become active");
    }

    /// Starts 1e-9 inside the feasible region while passing row 0 as the
    /// initial active set, and expects the solution to sit exactly on the
    /// boundary x = 0 with row 0 still active.
    #[test]
    fn quadratic_linear_constraints_projectwarm_activerows_back_to_boundary() {
        let hessian = array![[2.0]];
        let rhs = array![0.0];
        let beta_start = array![1e-9];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![0.0],
        };
        let warm_active: &[usize] = &[0];

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            Some(warm_active),
        );
        let (beta, active) =
            solved.expect("constrained quadratic solve should project back to the boundary");

        assert_relative_eq!(beta[0], 0.0, epsilon = 1e-14);
        assert_eq!(active, vec![0]);
    }

    /// Three rows in R^2, all marked active at the start:
    ///
    ///   x1 >= 0,  x2 >= 0,  (1 + eps) * x1 + x2 >= 0   (eps = 1e-14)
    ///
    /// The third row is (up to eps) the sum of the first two, which makes the
    /// naive KKT system ill-conditioned. The rank-reducing compression is
    /// expected to drop the redundant row so the QP converges cleanly.
    ///
    /// The objective is 0.5 * ||x - (-1, -1)||^2, so the constrained optimum
    /// is the origin.
    #[test]
    fn quadratic_linear_constraints_handles_near_dependent_rows() {
        let hessian = Array2::eye(2);
        // Unconstrained minimizer is (-1, -1), outside the feasible region.
        let rhs = array![-1.0, -1.0];
        let beta_start = array![0.0, 0.0];
        let eps = 1e-14;
        let constraints = LinearInequalityConstraints {
            a: array![[1.0, 0.0], [0.0, 1.0], [1.0 + eps, 1.0]],
            b: array![0.0, 0.0, 0.0],
        };
        let all_rows_active: &[usize] = &[0, 1, 2];

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            Some(all_rows_active),
        );
        let (beta, active) = solved.expect("near-dependent constraint QP should converge");

        let at_origin = beta[0].abs() <= 1e-10 && beta[1].abs() <= 1e-10;
        assert!(
            at_origin,
            "expected optimum at origin, got ({}, {})",
            beta[0],
            beta[1]
        );
        let n_active = active.len();
        assert!(
            n_active <= 2,
            "at most 2 independent constraints should remain active, got {}",
            n_active
        );
    }

    /// Rows 0 and 1 are redundant copies of the same lower bound and compress
    /// into a single active KKT row. When that merged row is released, both
    /// original constraint ids have to leave the active set (not positions in
    /// a transient active vector), leaving only the upper bound (row 2).
    #[test]
    fn quadratic_linear_constraints_release_merged_constraint_group_by_id() {
        let hessian = array![[1.0]];
        let rhs = array![1.0];
        let beta_start = array![0.0];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0], [2.0], [-1.0]],
            b: array![0.0, 0.0, -0.1],
        };
        let initially_active: &[usize] = &[0, 1];

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            Some(initially_active),
        );
        let (beta, active) =
            solved.expect("merged active constraint group should release cleanly");

        let gap_to_upper = (beta[0] - 0.1).abs();
        assert!(
            gap_to_upper <= 1e-10,
            "expected constrained optimum at upper bound 0.1, got {}",
            beta[0]
        );
        assert_eq!(active, vec![2]);
    }

    /// Same redundant lower-bound pair as the by-id release test, but the
    /// initial active set is passed in the unsorted order `[2, 0, 1]`. The
    /// result must still be x = 0.1 with only row 2 active.
    #[test]
    fn quadratic_linear_constraints_release_merged_group_with_unsorted_active_positions() {
        let hessian = array![[1.0]];
        let rhs = array![1.0];
        let beta_start = array![0.0];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0], [2.0], [-1.0]],
            b: array![0.0, 0.0, -0.1],
        };
        let unsorted_active: &[usize] = &[2, 0, 1];

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            Some(unsorted_active),
        );
        let (beta, active) = solved
            .expect("merged active group release should handle unsorted active positions");

        let gap_to_upper = (beta[0] - 0.1).abs();
        assert!(
            gap_to_upper <= 1e-10,
            "expected constrained optimum at upper bound 0.1, got {}",
            beta[0]
        );
        assert_eq!(active, vec![2]);
    }

    /// Four positively scaled copies of the bound x >= 0 are all marked
    /// active. The solver must accept the degenerate boundary point x = 0 and
    /// report at most one representative row as active.
    #[test]
    fn quadratic_linear_constraints_accept_boundary_kkt_after_rank_reduction() {
        let hessian = array![[2.0]];
        let rhs = array![0.0];
        let beta_start = array![1e-9];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0], [1.0 + 1e-13], [2.0], [3.0]],
            b: array![0.0, 0.0, 0.0, 0.0],
        };
        let every_row: &[usize] = &[0, 1, 2, 3];

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            Some(every_row),
        );
        let (beta, active) = solved.expect("degenerate boundary KKT point should be accepted");

        assert_relative_eq!(beta[0], 0.0, epsilon = 1e-14);
        let has_single_representative = active.len() <= 1;
        assert!(
            has_single_representative,
            "rank-reduced boundary solution should keep at most one representative, got {:?}",
            active
        );
    }

    /// With an all-zero Hessian and a single active row the KKT matrix is
    /// singular. The solve must still return a finite answer, leave the
    /// iterate at the origin, and keep row 0 active.
    #[test]
    fn quadratic_linear_constraints_singular_kkt_uses_pseudoinverse_fallback() {
        let hessian = Array2::<f64>::zeros((2, 2));
        let rhs = array![0.0, 0.0];
        let beta_start = array![0.0, 0.0];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0, 1.0]],
            b: array![0.0],
        };
        let initially_active: &[usize] = &[0];

        let solved = solve_quadratic_with_linear_constraints(
            &hessian,
            &rhs,
            &beta_start,
            &constraints,
            Some(initially_active),
        );
        let (beta, active) = solved
            .expect("singular KKT system should fall back to a finite pseudoinverse solve");

        let all_finite = beta.iter().all(|value| value.is_finite());
        assert!(all_finite);
        for coord in 0..2 {
            assert_relative_eq!(beta[coord], 0.0, epsilon = 1e-14);
        }
        assert_eq!(active, vec![0]);
    }

    /// The third row equals the sum of the first two, so rank reduction must
    /// keep two rows while still accounting for all three constraint ids.
    #[test]
    fn rank_reduce_drops_exactly_dependent_row() {
        let rows = array![[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 0.0],];
        let rhs = array![0.0, 0.0, 0.0];
        let groups = vec![vec![0], vec![1], vec![2]];

        let (a_out, b_out, member_constraint_ids_out, _) =
            crate::solver::active_set::rank_reduce_rows_pivoted_qr_with_dependence(
                rows, rhs, groups,
            );

        let kept_rows = a_out.nrows();
        assert_eq!(
            kept_rows, 2,
            "should keep 2 independent rows, got {}",
            kept_rows
        );
        assert_eq!(b_out.len(), 2);

        // The dropped row's id must survive inside one of the remaining groups.
        let total_constraint_ids: usize = member_constraint_ids_out
            .iter()
            .fold(0, |count, group| count + group.len());
        assert_eq!(
            total_constraint_ids, 3,
            "all original constraint ids must be preserved"
        );
    }

    /// Three rows in R^2 can have rank at most 2: the first two rows span the
    /// plane and the third is their sum. The reduction must therefore return
    /// two rows while keeping every original constraint id.
    #[test]
    fn rank_reduce_preserves_full_rank_matrix() {
        let rows = array![[1.0, 0.0], [0.0, 1.0], [1.0, 1.0],];
        let rhs = array![0.0, 0.0, 0.0];
        let groups = vec![vec![0], vec![1], vec![2]];

        let (a_out, b_out, member_constraint_ids_out, _) =
            crate::solver::active_set::rank_reduce_rows_pivoted_qr_with_dependence(
                rows, rhs, groups,
            );

        assert_eq!(a_out.nrows(), 2);
        assert_eq!(b_out.len(), 2);

        let total_constraint_ids: usize = member_constraint_ids_out
            .iter()
            .fold(0, |count, group| count + group.len());
        assert_eq!(total_constraint_ids, 3);
    }

    /// Runs one constrained exact-Newton update with a NaN Hessian from a
    /// point that already sits on the bound `x >= 0`. The update is expected
    /// to succeed, leave the coefficient at 0, and report the bound as active.
    #[test]
    fn constrained_exact_newton_nan_hessian_returns_feasible_noop_instead_of_failing() {
        let spec = ParameterBlockSpec {
            name: "exact_block".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![[1.0]])),
            offset: array![0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let states = vec![ParameterBlockState {
            beta: array![0.0],
            eta: array![0.0],
        }];
        let constraints = LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![0.0],
        };

        // Zero gradient paired with a non-finite Hessian.
        let gradient = array![0.0];
        let hessian = SymmetricMatrix::Dense(array![[f64::NAN]]);
        let updater = ExactNewtonBlockUpdater {
            gradient: &gradient,
            hessian: &hessian,
        };

        let s_lambda = Array2::zeros((1, 1));
        let options = BlockwiseFitOptions::default();
        let context = BlockUpdateContext {
            family: &OneBlockConstrainedNaNHessianFamily,
            states: &states,
            spec: &spec,
            block_idx: 0,
            s_lambda: &s_lambda,
            options: &options,
            linear_constraints: Some(&constraints),
            cached_active_set: None,
        };

        let update = updater
            .compute_update_step(&context)
            .expect("constrained exact-newton NaN Hessian should produce a no-op update");

        assert_relative_eq!(update.beta_new_raw[0], 0.0, epsilon = 1e-14);
        assert_eq!(update.active_set, Some(vec![0]));
    }

    /// A single penalty makes the outer rho optimizer run. When the family's
    /// evaluation always fails, the fit error must carry the real evaluation
    /// error rather than an opaque line-search failure.
    #[test]
    fn outerobjective_failure_context_is_preserved() {
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0],
            [1.0]
        ]));
        let spec = ParameterBlockSpec {
            name: "err_block".to_string(),
            design,
            offset: array![0.0, 0.0],
            penalties: vec![PenaltyMatrix::Dense(Array2::eye(1))],
            nullspace_dims: vec![],
            initial_log_lambdas: array![0.0],
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let options = BlockwiseFitOptions {
            outer_max_iter: 3,
            ..BlockwiseFitOptions::default()
        };

        let Err(err) = fit_custom_family(&OneBlockAlwaysErrorFamily, &[spec], &options) else {
            panic!("fit should fail when family evaluate always errors")
        };

        let rendered = err.to_string();
        let keeps_root_cause = rendered.contains(
            "last objective error: synthetic outer objective failure: block[0] evaluate()",
        );
        assert!(
            keeps_root_cause,
            "expected preserved root-cause context in error, got: {err}"
        );
    }

    /// With covariance computation requested and a family whose covariance
    /// assembly fails, the fit as a whole must fail and surface that cause.
    #[test]
    fn fit_fails_when_requested_covariance_cannot_be_computed() {
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0],
            [1.0]
        ]));
        let spec = ParameterBlockSpec {
            name: "cov_block".to_string(),
            design,
            offset: array![0.0, 0.0],
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let options = BlockwiseFitOptions {
            use_remlobjective: false,
            compute_covariance: true,
            ..BlockwiseFitOptions::default()
        };

        let Err(err) = fit_custom_family(&OneBlockCovarianceErrorFamily, &[spec], &options)
        else {
            panic!("fit should fail when covariance computation fails")
        };

        let rendered = err.to_string();
        let names_root_cause = rendered.contains("synthetic covariance assembly failure");
        assert!(
            names_root_cause,
            "expected covariance root cause in fit error, got: {err}"
        );
    }

    // Exact analytic Hessians must be finite. Non-finite Hessians are rejected
    // loudly instead of being masked by a surrogate update.

    /// Two-block test family whose second (log_sigma) block reports an
    /// exact-Newton Hessian with NaN in its leading diagonal entry. This stands
    /// in for the overflow of exp(eta_sigma) that can occur during
    /// location-scale fitting.
    #[derive(Clone)]
    struct TwoBlockNaNHessianFamily;

    impl CustomFamily for TwoBlockNaNHessianFamily {
        /// Returns a well-behaved diagonal working set for block 0 (mu) and an
        /// exact-Newton working set with a poisoned Hessian for block 1.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let mu_state = &block_states[0];
            let sigma_state = &block_states[1];
            let n_obs = mu_state.eta.len();
            let n_coef = sigma_state.beta.len();

            // Identity everywhere except the (0, 0) entry, which is NaN.
            let mut poisoned = Array2::<f64>::eye(n_coef);
            poisoned[[0, 0]] = f64::NAN;

            let eta_sum_sq = mu_state.eta.iter().map(|&v| v * v).sum::<f64>();

            let mu_block = BlockWorkingSet::Diagonal {
                working_response: Array1::zeros(n_obs),
                working_weights: Array1::ones(n_obs),
            };
            let sigma_block = BlockWorkingSet::ExactNewton {
                gradient: Array1::zeros(n_coef),
                hessian: SymmetricMatrix::Dense(poisoned),
            };

            Ok(FamilyEvaluation {
                log_likelihood: -0.5 * eta_sum_sq,
                blockworking_sets: vec![mu_block, sigma_block],
            })
        }
    }

    /// Control family: the same two-block layout as the NaN variant, but the
    /// second block reports a finite identity Hessian.
    #[derive(Clone)]
    struct TwoBlockFiniteHessianFamily;

    impl CustomFamily for TwoBlockFiniteHessianFamily {
        /// Block 0 gets a unit-weight diagonal working set; block 1 gets the
        /// exact-Newton quantities of the quadratic -0.5 * ||beta||^2.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let mu_state = &block_states[0];
            let sigma_state = &block_states[1];
            let n_obs = mu_state.eta.len();
            let n_coef = sigma_state.beta.len();

            let sigma_beta = &sigma_state.beta;
            let beta_sum_sq: f64 = sigma_beta.iter().map(|&b| b * b).sum();
            let eta_sum_sq = mu_state.eta.iter().map(|&v| v * v).sum::<f64>();

            let mu_block = BlockWorkingSet::Diagonal {
                working_response: Array1::zeros(n_obs),
                working_weights: Array1::ones(n_obs),
            };
            let sigma_block = BlockWorkingSet::ExactNewton {
                gradient: sigma_beta.mapv(|b| -b),
                hessian: SymmetricMatrix::Dense(Array2::eye(n_coef)),
            };

            Ok(FamilyEvaluation {
                log_likelihood: -0.5 * eta_sum_sq - 0.5 * beta_sum_sq,
                blockworking_sets: vec![mu_block, sigma_block],
            })
        }
    }

    /// NaN-Hessian family that additionally selects the
    /// `StrictPseudoLaplace` outer objective. Per the original note, that
    /// objective takes the strict-SPD path and skips the eigendecomposition in
    /// compute_update_step.
    #[derive(Clone)]
    struct TwoBlockNaNHessianPseudoLaplaceFamily;

    impl CustomFamily for TwoBlockNaNHessianPseudoLaplaceFamily {
        /// Delegates to [`TwoBlockNaNHessianFamily`] so both families share
        /// the same poisoned evaluation.
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let delegate = TwoBlockNaNHessianFamily;
            delegate.evaluate(block_states)
        }

        /// Selects the strict pseudo-Laplace outer objective.
        fn exact_newton_outerobjective(&self) -> ExactNewtonOuterObjective {
            ExactNewtonOuterObjective::StrictPseudoLaplace
        }
    }

    /// Builds the unpenalized two-block layout shared by the NaN/finite
    /// Hessian tests: a one-column "mu" block followed by a two-column
    /// "log_sigma" block, each with an all-ones design over `n` rows, zero
    /// offsets, and zero initial coefficients.
    fn make_two_block_specs(n: usize) -> Vec<ParameterBlockSpec> {
        // The two blocks differ only in their name and column count.
        let build_block = |name: &str, n_cols: usize| ParameterBlockSpec {
            name: name.to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::from_elem((n, n_cols), 1.0),
            )),
            offset: Array1::zeros(n),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(Array1::zeros(n_cols)),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };

        vec![build_block("mu", 1), build_block("log_sigma", 2)]
    }

    /// Exact Newton Hessians are part of the mathematical contract: a NaN in
    /// a block Hessian means the family derivative is invalid. The inner fit
    /// must reject it with an explicit non-finite-Hessian error instead of
    /// hiding it behind a conservative eigendecomposition fallback.
    #[test]
    fn exact_newton_nan_hessian_fails_loudly_before_eigendecomposition() {
        let specs = make_two_block_specs(4);
        let per_block_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];
        let options = BlockwiseFitOptions {
            inner_max_cycles: 1,
            use_remlobjective: false,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };

        let err = inner_blockwise_fit(
            &TwoBlockNaNHessianFamily,
            &specs,
            &per_block_log_lambdas,
            &options,
            None,
        )
        .expect_err("NaN exact Hessian must fail loudly");

        let names_nonfinite_hessian =
            err.contains("smooth-regularized logdet Hessian contains non-finite entry");
        assert!(
            names_nonfinite_hessian,
            "expected explicit non-finite Hessian error, got: {err}"
        );
    }

    /// Control for the NaN-Hessian failure test: the identical block layout
    /// and solver options with a finite Hessian must succeed, which isolates
    /// the NaN as the trigger rather than the layout, penalty structure, or
    /// solver configuration.
    #[test]
    fn exact_newton_finite_hessian_succeeds_where_nan_hessian_fails() {
        let specs = make_two_block_specs(4);
        let per_block_log_lambdas = vec![Array1::zeros(0), Array1::zeros(0)];
        let options = BlockwiseFitOptions {
            inner_max_cycles: 1,
            use_remlobjective: false,
            compute_covariance: false,
            ..BlockwiseFitOptions::default()
        };

        let failure = inner_blockwise_fit(
            &TwoBlockFiniteHessianFamily,
            &specs,
            &per_block_log_lambdas,
            &options,
            None,
        )
        .err();

        assert!(
            failure.is_none(),
            "inner fit should succeed with finite Hessian: {:?}",
            failure
        );
    }

    /// Passing a NaN component must make `checked_penalizedobjective` return
    /// an error that names the non-finite objective.
    #[test]
    fn checked_penalizedobjective_rejects_non_finite_values() {
        let outcome = checked_penalizedobjective(-1.0, 0.5, f64::NAN, "test objective");
        let err = outcome.expect_err("non-finite objective should fail loudly");

        let names_cause = err.contains("non-finite penalized objective");
        assert!(names_cause, "unexpected error: {err}");
    }

    #[test]
    fn exact_newton_dh_closure_rejects_non_finite_directional_derivative() {
        // Single-block family: finite 1x1 joint Hessian, but a directional
        // derivative of that Hessian that is always NaN.
        #[derive(Clone)]
        struct OneBlockNonFiniteJointDhFamily;

        impl CustomFamily for OneBlockNonFiniteJointDhFamily {
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let Some(first) = block_states.first() else {
                    return Err("missing block 0".to_string());
                };
                let beta = &first.beta;
                // Quadratic log-likelihood -0.5 * beta'beta with gradient -beta.
                Ok(FamilyEvaluation {
                    log_likelihood: -0.5 * beta.dot(beta),
                    blockworking_sets: vec![BlockWorkingSet::ExactNewton {
                        gradient: beta.mapv(|v| -v),
                        hessian: SymmetricMatrix::Dense(array![[1.0]]),
                    }],
                })
            }

            fn exact_newton_joint_hessian(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<Option<Array2<f64>>, String> {
                assert!(block_states.len() <= isize::MAX as usize);
                let finite_hessian = array![[1.0]];
                Ok(Some(finite_hessian))
            }

            fn exact_newton_joint_hessian_directional_derivative(
                &self,
                block_states: &[ParameterBlockState],
                arr: &Array1<f64>,
            ) -> Result<Option<Array2<f64>>, String> {
                assert!(block_states.len() <= isize::MAX as usize);
                assert!(arr.iter().all(|v| !v.is_nan()));
                // The poisoned derivative the closure under test must reject.
                let poisoned = array![[f64::NAN]];
                Ok(Some(poisoned))
            }
        }

        let family = OneBlockNonFiniteJointDhFamily;

        // One unpenalized coefficient on a 2x1 all-ones design.
        let design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(Array2::from_elem(
            (2, 1),
            1.0,
        )));
        let specs = vec![ParameterBlockSpec {
            name: "beta".to_string(),
            design,
            offset: Array1::zeros(2),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: Some(array![0.0]),
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        }];
        let states = vec![ParameterBlockState {
            beta: array![0.0],
            eta: Array1::zeros(2),
        }];

        let flat_beta = array![0.0];
        let synced = synchronized_states_from_flat_beta(&family, &specs, &states, &flat_beta)
            .expect("sync states for exact_newton_dh_closure");
        let synced_states = Arc::new(synced);

        let compute_dh =
            exact_newton_dh_closure(&family, synced_states, &specs, 1, false, 1.0, None);
        let direction = array![1.0];
        let err = compute_dh(&direction).expect_err("non-finite dH should fail loudly");
        assert!(err.contains("non-finite"), "unexpected error: {err}");
    }

    #[test]
    fn nan_propagating_min_detects_nan_eigenvalues() {
        use crate::faer_ndarray::FaerEigh;

        // Guards the NaN-propagating minimum: `f64::min` silently ignores a
        // NaN operand, so a plain min-fold would hide NaN eigenvalues.
        let mut mat = Array2::<f64>::eye(3);
        for idx in [[1usize, 0], [0, 1]] {
            mat[idx] = f64::NAN;
        }

        // A failed decomposition needs no check here: per the original note,
        // the fallback chain in compute_update_step catches it and applies a
        // conservative ridge.
        if let Ok((evals, _)) = FaerEigh::eigh(&mat, faer::Side::Lower) {
            // Same NaN-propagating reduction as the production fold, written
            // as an explicit loop.
            let mut new_min = f64::INFINITY;
            for &eigenvalue in evals.iter() {
                new_min = if new_min.is_nan() || eigenvalue.is_nan() {
                    f64::NAN
                } else {
                    new_min.min(eigenvalue)
                };
            }
            assert!(
                !new_min.is_finite(),
                "NaN-propagating min should detect NaN eigenvalues, got {new_min}"
            );
        }
    }

    #[test]
    fn multiblock_generic_outer_fallback_returns_error_instead_of_panicking() {
        // A two-block family that offers no joint outer path must be rejected
        // with a descriptive error; a panic anywhere in the evaluation fails
        // this test.
        let options = BlockwiseFitOptions {
            outer_max_iter: 1,
            use_remlobjective: true,
            ..BlockwiseFitOptions::default()
        };
        let specs = make_two_block_specs(4);
        let family = TwoBlockFiniteHessianFamily;
        let rho = Array1::zeros(0);
        let penalty_counts = vec![0usize; 2];

        let evaluate_outer = || {
            super::test_support::outerobjectivegradienthessian(
                &family,
                &specs,
                &options,
                &penalty_counts,
                &rho,
                None,
                EvalMode::ValueGradientHessian,
            )
        };
        let caught = std::panic::catch_unwind(std::panic::AssertUnwindSafe(evaluate_outer));

        let outcome = caught.expect("multi-block outer fallback must return an error, not panic");
        let Err(failure) = outcome else {
            panic!("multi-block family without a joint path should fail loudly")
        };
        let err = failure.to_string();
        assert!(
            err.contains("multi-block families must provide a joint outer path"),
            "unexpected error: {err}"
        );
    }

    #[test]
    fn pseudo_laplace_path_skips_eigendecomposition_avoiding_nan_crash() {
        // SUFFICIENCY: per the original note, the PseudoLaplace path goes
        // through strict_solve_spd rather than eigendecomposition-based
        // ridging. With a NaN Hessian the fit may still fail downstream, but
        // never with the eigendecomposition error. That pins the
        // eigendecomposition call as the failure point specific to
        // QuadraticReml families.
        let options = BlockwiseFitOptions {
            compute_covariance: false,
            use_remlobjective: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let per_block_log_lambdas = vec![Array1::zeros(0); 2];
        let specs = make_two_block_specs(4);
        let result = inner_blockwise_fit(
            &TwoBlockNaNHessianPseudoLaplaceFamily,
            &specs,
            &per_block_log_lambdas,
            &options,
            None,
        );
        // `Ok` is acceptable: strict_solve_spd might produce NaN betas that do
        // not trigger a hard error. Only the error text is constrained.
        if let Err(msg) = &result {
            assert!(
                !msg.contains("exact-newton eigendecomposition failed"),
                "PseudoLaplace path should NOT hit eigendecomposition; \
                     got eigendecomposition error anyway: {msg}"
            );
        }
    }

    /// Regression check for the terminal eigen-floor fallback of
    /// `strict_solve_spd_with_lm_continuation`.
    ///
    /// The input is strongly negative-definite, with `|λ_min|` beyond the LM
    /// δ-ridge schedule's terminal δ (≈ ε · trace_scale · 10¹⁶), so ridging
    /// alone cannot rescue Cholesky. The fallback must then return a finite
    /// solution equal to `Q diag(1/Λ̃) Qᵀ rhs`, with
    /// `Λ̃_i = max(Λ_i, ε λ_max)`.
    #[test]
    fn strict_solve_spd_falls_back_to_eigen_floor_on_indefinite_matrix() {
        // δ schedule from `delta0 = max(ε·tr/p, 1e-12)`, growth 10×, 16 steps.
        // With `tr = 4·1e30` we get `delta0 ≈ ε·1e30 ≈ 2.2e14`; terminal δ at
        // escalation 16 is `2.2e14 · 1e16 = 2.2e30`. Set `λ_min ≈ -1e32` to
        // outpace the schedule and force the eigen-floor branch.
        let p = 4usize;
        let h = Array2::<f64>::from_shape_fn((p, p), |(row, col)| {
            if row == col {
                -1e32 - (row as f64) * 1e30
            } else if row + col == 1 {
                // Symmetric coupling between coordinates 0 and 1 only.
                5e29
            } else {
                0.0
            }
        });
        let rhs = array![1e30, -5e29, 2.5e29, 7.5e29];

        let (x, stats) = strict_solve_spd_with_lm_continuation(&h, &rhs)
            .expect("eigen-floor fallback must succeed on the negative-definite matrix");
        assert!(
            stats.escalations > 16,
            "expected eigen-floor terminal fallback (escalations > MAX_ESCALATIONS), got {}",
            stats.escalations,
        );
        for v in x.iter().copied() {
            assert!(
                v.is_finite(),
                "eigen-floor solve returned non-finite component {v}"
            );
        }

        // Rebuild the floored spectral solve by hand and compare per component.
        let mut sym = h.clone();
        symmetrize_dense_in_place(&mut sym);
        let (evals, evecs) = FaerEigh::eigh(&sym, Side::Lower).expect("eigh");
        let max_abs_eval = evals.iter().map(|ev| ev.abs()).fold(0.0_f64, f64::max);
        let eps_floor = (CUSTOM_FAMILY_EVAL_FLOOR * max_abs_eval).max(1e-300);
        let mut want = Array1::<f64>::zeros(p);
        for k in 0..p {
            // Projection of rhs onto eigenvector k, scaled by the floored
            // inverse eigenvalue.
            let q_t_rhs = (0..p).fold(0.0_f64, |acc, i| acc + evecs[[i, k]] * rhs[i]);
            let scaled = q_t_rhs / evals[k].max(eps_floor);
            for (i, slot) in want.iter_mut().enumerate() {
                *slot += evecs[[i, k]] * scaled;
            }
        }
        for (i, (&expected, &actual)) in want.iter().zip(x.iter()).enumerate() {
            let tol = 1e-9 * expected.abs().max(1.0) + 1e-9;
            assert!(
                (expected - actual).abs() <= tol,
                "eigen-floor solve component {i}: want={:.6e}, got={:.6e}",
                expected,
                actual,
            );
        }
    }

    // ---------- eta_backup heterogeneous-shape regression tests ----------
    //
    // Regression note: a previous `inner_blockwise_fit` implementation
    // reused a single `eta_backup` buffer across blocks during line search.
    // With heterogeneous eta lengths (e.g. survival time block = 3n,
    // threshold/log-sigma = n), that buffer could be left at the wrong
    // shape for the next block update and trigger an ndarray broadcast
    // panic:
    //   "could not broadcast array from shape: [n] to: [3n]"

    /// Two-block test family with mismatched linear-predictor lengths: block 0
    /// must arrive with an eta of length 3n and block 1 with an eta of length
    /// n. Both blocks report ExactNewton working sets with an identity Hessian
    /// and gradient `-beta + 0.1`. A nonzero gradient is what pushes the
    /// Newton step past tol and into the line-search path that previously
    /// mishandled heterogeneous eta buffer shapes.
    #[derive(Clone)]
    struct HeterogeneousEtaLengthFamily {
        n: usize,
    }

    impl CustomFamily for HeterogeneousEtaLengthFamily {
        fn evaluate(
            &self,
            block_states: &[ParameterBlockState],
        ) -> Result<FamilyEvaluation, String> {
            let (big, small) = (&block_states[0], &block_states[1]);
            assert_eq!(big.eta.len(), 3 * self.n, "block 0 eta must be 3n");
            assert_eq!(small.eta.len(), self.n, "block 1 eta must be n");

            // Identity-Hessian working set whose gradient is -beta + 0.1.
            let newton_set = |state: &ParameterBlockState| BlockWorkingSet::ExactNewton {
                gradient: state.beta.mapv(|b| -b + 0.1),
                hessian: SymmetricMatrix::Dense(Array2::eye(state.beta.len())),
            };

            // Simple quadratic log-likelihood in both linear predictors.
            let log_likelihood = -0.5 * big.eta.dot(&big.eta) - 0.5 * small.eta.dot(&small.eta);
            Ok(FamilyEvaluation {
                log_likelihood,
                blockworking_sets: vec![newton_set(big), newton_set(small)],
            })
        }
    }

    /// Builds the two unpenalized block specs used by the heterogeneous-eta
    /// tests: "big_block" with 3n design rows (mimics survival time block
    /// stacking) and "small_block" with n design rows (mimics a
    /// threshold/log-sigma block). Each has two all-ones columns.
    fn make_heterogeneous_eta_specs(n: usize) -> Vec<ParameterBlockSpec> {
        // One unpenalized block: all-ones `rows x 2` dense design, zero
        // offset, all-ones initial coefficients.
        fn ones_block(name: &str, rows: usize) -> ParameterBlockSpec {
            let cols: usize = 2;
            ParameterBlockSpec {
                name: name.to_string(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                    Array2::from_elem((rows, cols), 1.0),
                )),
                offset: Array1::zeros(rows),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: Some(Array1::from_elem(cols, 1.0)),
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            }
        }

        vec![ones_block("big_block", 3 * n), ones_block("small_block", n)]
    }

    /// Baseline for the heterogeneous-eta regression tests: when both blocks
    /// share one eta length, the old shape-mismatch failure mode cannot
    /// arise, so the inner fit must complete.
    #[test]
    fn uniform_eta_lengths_do_not_panic() {
        #[derive(Clone)]
        struct UniformEtaFamily;

        impl CustomFamily for UniformEtaFamily {
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let (first, second) = (&block_states[0], &block_states[1]);
                // Identity-Hessian working set whose gradient is -beta + 0.1.
                let newton_set = |state: &ParameterBlockState| BlockWorkingSet::ExactNewton {
                    gradient: state.beta.mapv(|b| -b + 0.1),
                    hessian: SymmetricMatrix::Dense(Array2::eye(state.beta.len())),
                };
                let log_likelihood =
                    -0.5 * first.eta.dot(&first.eta) - 0.5 * second.eta.dot(&second.eta);
                Ok(FamilyEvaluation {
                    log_likelihood,
                    blockworking_sets: vec![newton_set(first), newton_set(second)],
                })
            }
        }

        // Unpenalized block on an all-ones `rows x 2` design with all-ones
        // initial coefficients.
        fn uniform_block(name: &str, rows: usize) -> ParameterBlockSpec {
            ParameterBlockSpec {
                name: name.to_string(),
                design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                    Array2::from_elem((rows, 2), 1.0),
                )),
                offset: Array1::zeros(rows),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: Some(Array1::from_elem(2, 1.0)),
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            }
        }

        // Both blocks have n rows, so no shape mismatch is possible.
        let n = 10;
        let specs = vec![uniform_block("block_a", n), uniform_block("block_b", n)];
        let options = BlockwiseFitOptions {
            compute_covariance: false,
            use_remlobjective: false,
            inner_max_cycles: 3,
            ..BlockwiseFitOptions::default()
        };
        let per_block = vec![Array1::zeros(0); 2];

        // Must NOT panic: with uniform eta lengths the eta_backup buffer stays
        // compatible with every block's eta after mem::swap.
        let result = inner_blockwise_fit(&UniformEtaFamily, &specs, &per_block, &options, None);
        assert!(
            result.is_ok(),
            "uniform eta lengths should not panic: {result:?}"
        );
    }

    /// Regression guard: a multi-cycle inner fit over blocks with eta lengths
    /// 3n and n must run to completion. Older code could panic with
    /// "could not broadcast array from shape: [n] to: [3n]" because of the
    /// eta_backup swap bug.
    #[test]
    fn heterogeneous_eta_lengths_inner_fit_completes() {
        let n = 10;
        let options = BlockwiseFitOptions {
            compute_covariance: false,
            use_remlobjective: false,
            inner_max_cycles: 3,
            ..BlockwiseFitOptions::default()
        };
        let per_block = vec![Array1::zeros(0); 2];
        let specs = make_heterogeneous_eta_specs(n);
        let family = HeterogeneousEtaLengthFamily { n };

        let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
        if result.is_err() {
            panic!("inner fit should complete: {result:?}");
        }
    }

    /// SUFFICIENCY (single-cycle): one inner cycle is already enough to
    /// exercise the heterogeneous eta lengths, and it must finish without a
    /// panic.
    #[test]
    fn heterogeneous_eta_single_cycle_completes() {
        let n = 10;
        let options = BlockwiseFitOptions {
            compute_covariance: false,
            use_remlobjective: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let per_block = vec![Array1::zeros(0); 2];
        let specs = make_heterogeneous_eta_specs(n);
        let family = HeterogeneousEtaLengthFamily { n };

        let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
        if result.is_err() {
            panic!("single-cycle inner fit should complete: {result:?}");
        }
    }

    /// Regression guard: if every block's step is <= tol, the line-search
    /// path is skipped for every block, so the fit should stay safe even with
    /// heterogeneous eta lengths.
    #[test]
    fn heterogeneous_eta_no_panic_when_all_blocks_converged() {
        // Family whose gradients are identically zero in both blocks.
        #[derive(Clone)]
        struct AllConvergedFamily {
            n: usize,
        }

        impl CustomFamily for AllConvergedFamily {
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let (big, small) = (&block_states[0], &block_states[1]);
                assert_eq!(big.eta.len(), 3 * self.n);
                assert_eq!(small.eta.len(), self.n);

                // Zero gradient with identity Hessian, so there is no step.
                let stationary_set = |state: &ParameterBlockState| {
                    let width = state.beta.len();
                    BlockWorkingSet::ExactNewton {
                        gradient: Array1::zeros(width),
                        hessian: SymmetricMatrix::Dense(Array2::eye(width)),
                    }
                };
                let log_likelihood =
                    -0.5 * big.eta.dot(&big.eta) - 0.5 * small.eta.dot(&small.eta);
                Ok(FamilyEvaluation {
                    log_likelihood,
                    blockworking_sets: vec![stationary_set(big), stationary_set(small)],
                })
            }
        }

        let n = 10;
        let family = AllConvergedFamily { n };
        let mut specs = make_heterogeneous_eta_specs(n);
        for spec in specs.iter_mut().take(2) {
            spec.initial_beta = Some(Array1::zeros(2));
        }
        let options = BlockwiseFitOptions {
            compute_covariance: false,
            use_remlobjective: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let per_block = vec![Array1::zeros(0); 2];

        // All blocks converged → step=0 → `continue` before swap →
        // eta_backup never participates → no broadcast panic.
        let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
        assert!(
            result.is_ok(),
            "should not panic when all blocks are converged: {result:?}"
        );
    }

    /// Regression guard: the fit must complete when the big block is already
    /// stationary and only the second (smaller) block takes a step. Earlier
    /// code could still panic here after reusing an oversized eta_backup
    /// buffer across blocks.
    #[test]
    fn heterogeneous_eta_completes_when_only_small_block_steps() {
        #[derive(Clone)]
        struct OnlySmallBlockStepsFamily {
            n: usize,
        }

        impl CustomFamily for OnlySmallBlockStepsFamily {
            fn evaluate(
                &self,
                block_states: &[ParameterBlockState],
            ) -> Result<FamilyEvaluation, String> {
                let (big, small) = (&block_states[0], &block_states[1]);
                assert_eq!(big.eta.len(), 3 * self.n);
                assert_eq!(small.eta.len(), self.n);
                let big_width = big.beta.len();
                let small_width = small.beta.len();

                let log_likelihood =
                    -0.5 * big.eta.dot(&big.eta) - 0.5 * small.eta.dot(&small.eta);
                // Block 0: converged, step=0.
                let converged_big = BlockWorkingSet::ExactNewton {
                    gradient: Array1::zeros(big_width),
                    hessian: SymmetricMatrix::Dense(Array2::eye(big_width)),
                };
                // Block 1: nontrivial step from the gradient -beta + 0.1.
                let stepping_small = BlockWorkingSet::ExactNewton {
                    gradient: small.beta.mapv(|b| -b + 0.1),
                    hessian: SymmetricMatrix::Dense(Array2::eye(small_width)),
                };
                Ok(FamilyEvaluation {
                    log_likelihood,
                    blockworking_sets: vec![converged_big, stepping_small],
                })
            }
        }

        let n = 10;
        let family = OnlySmallBlockStepsFamily { n };
        let mut specs = make_heterogeneous_eta_specs(n);
        // Start block 0 at its optimum so that only block 1 moves.
        specs[0].initial_beta = Some(Array1::zeros(2));
        let options = BlockwiseFitOptions {
            compute_covariance: false,
            use_remlobjective: false,
            inner_max_cycles: 1,
            ..BlockwiseFitOptions::default()
        };
        let per_block = vec![Array1::zeros(0); 2];

        let result = inner_blockwise_fit(&family, &specs, &per_block, &options, None);
        assert!(
            result.is_ok(),
            "fit should complete when only small block steps: {result:?}"
        );
    }

    /// Exercises the KKT-aware projection performed by
    /// `projected_stationarity_inf_norm` directly.
    ///
    /// Cases pinned here:
    ///   (i)   no constraints: the result is the plain inf-norm of the residual;
    ///   (ii)  active lower bound with a multiplier-signed residual
    ///         (`β_j == lb_j`, `residual_j > 0`): the coordinate is dropped;
    ///   (iii) active lower bound with a wrong-signed residual
    ///         (`residual_j < 0`): the coordinate still counts;
    ///   (iv)  interior coordinate: always counts, whatever the residual sign.
    ///
    /// These are the convergence semantics the joint-Newton loop relies on: a
    /// genuine constrained-KKT optimum scores zero, while infeasibility and
    /// interior non-stationarity stay visible.
    #[test]
    fn projected_stationarity_inf_norm_respects_kkt_multipliers() {
        assert!(file!().ends_with(".rs"));

        // (i) Unconstrained: the largest absolute residual entry is returned.
        let beta_free = array![1.0, 2.0, -0.5];
        let residual_free = array![0.3, -0.1, 0.2];
        let norm_free = projected_stationarity_inf_norm(&residual_free, &beta_free, None, None);
        assert_relative_eq!(norm_free, 0.3_f64, epsilon = 1e-12);

        // (ii) β_0 sits on its lower bound with residual_0 > 0, i.e. a KKT
        // multiplier. The projection drops it, leaving only the interior
        // entry (-0.1). The single-row constraint β_0 ≥ 0 exercises that
        // branch in isolation: no row pins β_1, so |-0.1| = 0.1 stays in the
        // norm.
        let beta_active = array![0.0, 2.0];
        let residual_active = array![0.5, -0.1];
        let single = LinearInequalityConstraints {
            a: array![[1.0, 0.0]],
            b: array![0.0],
        };
        let norm_single_row =
            projected_stationarity_inf_norm(&residual_active, &beta_active, Some(&single), None);
        assert_relative_eq!(norm_single_row, 0.1_f64, epsilon = 1e-12);

        let projected_residual = projected_linear_constraint_stationarity_vector(
            &residual_active,
            &beta_active,
            &single,
            None,
        )
        .expect("active lower-bound projection should succeed");
        assert_relative_eq!(projected_residual[0], 0.0_f64, epsilon = 1e-10);
        assert_relative_eq!(projected_residual[1], -0.1_f64, epsilon = 1e-12);

        // Two-row form with an explicitly unconstrained row (b = -inf): row 0
        // gives β_0 a finite lower bound of 0, and row 1 gives β_1 lb = -inf
        // via b/a. `lb.is_finite() == false` sends β_1 to the "no lower
        // bound" branch, while the active-bound drop still fires on coord 0,
        // so the result equals the single-row case: 0.1. This documents that
        // the projection's per-coord `lb.is_finite()` gate is what makes the
        // unconstrained-coord case work — NOT rejection of the whole
        // constraint set by `extract_simple_lower_bounds`.
        let constraints_lb0 = LinearInequalityConstraints {
            a: array![[1.0, 0.0], [0.0, 1.0]],
            b: array![0.0, f64::NEG_INFINITY], // only β_0 has a finite lower bound
        };
        let norm_two_row = projected_stationarity_inf_norm(
            &residual_active,
            &beta_active,
            Some(&constraints_lb0),
            None,
        );
        assert_relative_eq!(norm_two_row, 0.1_f64, epsilon = 1e-12);

        // (iii) β_j is on its bound but the residual points the WRONG way:
        // residual_j < 0 violates dual feasibility λ_j ≥ 0, meaning the bound
        // should release. The coordinate must stay in the norm so the
        // optimizer cannot declare convergence on an infeasible multiplier.
        let scalar_bound = LinearInequalityConstraints {
            a: array![[1.0]],
            b: array![0.0],
        };
        let beta_on_bound = array![0.0];
        let residual_wrong_sign = array![-0.2];
        let norm_wrong_sign = projected_stationarity_inf_norm(
            &residual_wrong_sign,
            &beta_on_bound,
            Some(&scalar_bound),
            None,
        );
        assert_relative_eq!(norm_wrong_sign, 0.2_f64, epsilon = 1e-12);

        // (iv) An interior coordinate with a valid lower bound keeps
        // contributing to the norm regardless of the residual sign.
        let beta_interior = array![1.5];
        let residual_interior = array![0.4];
        let norm_interior = projected_stationarity_inf_norm(
            &residual_interior,
            &beta_interior,
            Some(&scalar_bound),
            None,
        );
        assert_relative_eq!(norm_interior, 0.4_f64, epsilon = 1e-12);
    }

    /// Pins the semantics of the constrained-stationary certificate.
    ///
    /// The certificate combines three local signals taken from the most
    /// recent accepted Newton step:
    ///
    ///   1. `linearized_rel = ‖g + Hδ‖∞ / (1 + ‖g‖∞)` ≥ 0.5
    ///      — the linear solve left most of `g` in place; the unreduced part
    ///        lives in the constraint-active subspace and IS a Lagrange
    ///        multiplier, not a defect of the solve.
    ///
    ///   2. `scalar_model_relative_error()` ≤ 1e-3
    ///      — the local quadratic Newton model matches the observed objective
    ///        change to roundoff, so the Hessian and gradient are correct at
    ///        this β. This rules out genuine model mismatch posing as a
    ///        multiplier.
    ///
    ///   3. `|Δobjective|` ≤ `objective_tol`
    ///      — the objective has stopped moving.
    ///
    /// The numbers reproduce the large-scale survival-marginal-slope failure:
    /// `old_kkt ≈ 8.6e5`, `linearized_next ≈ 8.6e5`, `actual ≈ pred ≈ 1.6e-2`.
    #[test]
    fn joint_newton_math_constrained_stationary_signature_matches_aou_failure() {
        let diagnostic = JointNewtonMathDiagnostic {
            old_kkt_inf: 8.613e5,
            linearized_next_kkt_inf: 8.580e5,
            predicted_reduction: 1.589e-2,
            actual_reduction: 1.589e-2,
            trust_ratio: 1.000,
            step_inf: 1.270e-2,
            proposal_inf: 1.270e-2,
        };

        // Signal 1: the linearized solve neutralised <1% of g, which is the
        // Lagrange multiplier pattern rather than a defect of the solve.
        let kkt_scale = 1.0 + diagnostic.old_kkt_inf;
        let linearized_rel = diagnostic.linearized_next_kkt_inf / kkt_scale;
        assert!(
            linearized_rel >= 0.5,
            "large-scale exit has linearized_rel = {:.3e}, must be >= 0.5 for the \
             constrained-stationary certificate to fire",
            linearized_rel,
        );

        // Signal 2: the scalar Newton model is correct to roundoff, so the
        // Hessian and gradient are fine.
        let model_relerr = diagnostic.scalar_model_relative_error();
        assert!(
            model_relerr <= 1e-3,
            "large-scale exit has scalar_model_relerr = {:.3e}, must be <= 1e-3 \
             (model agrees with actual ⇒ residual is a real multiplier)",
            model_relerr,
        );

        // Signal 3: the objective change is at obj_tol scale. With
        // |obj| ~ 3.5e5 and inner_tol ~ 1e-6, obj_tol ≈ 0.348, and the
        // observed Δobj ≈ 1.6e-2.
        let objective_tol = 1e-6 * (1.0 + 3.484783e5_f64);
        let objective_change = 1.589e-2_f64;
        assert!(
            objective_change <= objective_tol,
            "large-scale exit has |Δobj| = {:.3e}, must be <= obj_tol {:.3e}",
            objective_change,
            objective_tol,
        );
    }

    /// Replays the post-diagnostic large-scale trace. Taken alone, the scalar
    /// Newton model and the objective plateau look like a
    /// constrained-stationary point, yet the projected KKT residual is
    /// hundreds of times above tolerance and the accepted Newton step is
    /// still macroscopic. That is an ordinary in-progress Newton cycle, not a
    /// terminal certificate.
    #[test]
    fn constrained_stationary_certificate_keeps_iterating_when_step_is_large() {
        let diagnostic = JointNewtonMathDiagnostic {
            old_kkt_inf: 2.708e4,
            linearized_next_kkt_inf: 2.707e4,
            predicted_reduction: 3.421e-1,
            actual_reduction: 3.421e-1,
            trust_ratio: 1.0,
            step_inf: 2.891e-2,
            proposal_inf: 2.891e-2,
        };
        let step_tol = 1.2e-5;
        let residual_tol = 2.707e-2;
        let residual = 8.102;
        let objective_tol = 3.479e-1;
        let objective_change = 3.421e-1;

        // The three non-step conditions that made 0.1.126 reject a seed as
        // soon as the objective change touched tolerance.
        let kkt_scale = 1.0 + diagnostic.old_kkt_inf;
        let linearized_rel = diagnostic.linearized_next_kkt_inf / kkt_scale;
        assert!(linearized_rel >= 0.5);
        assert!(diagnostic.scalar_model_relative_error() <= 1e-3);
        assert!(objective_change <= objective_tol);
        assert!(diagnostic.step_inf > step_tol);

        // The projected residual still forbids accepting convergence, while
        // the large step forbids terminal refusal, so the loop must continue.
        assert!(residual > residual_tol);
        let decision = constrained_stationary_certificate_decision(
            &diagnostic,
            objective_change,
            objective_tol,
            step_tol,
            None,
            residual,
            residual_tol,
        );
        assert_eq!(decision, ConstrainedStationaryCertificate::NotCandidate);
    }

    /// Exercises `residual_in_steady_geometric_descent` on four residual
    /// histories: a geometric descent (must be recognised), a flat/oscillating
    /// plateau, a single-drop window, and a too-short window (all rejected).
    #[test]
    fn residual_steady_geometric_descent_distinguishes_converging_from_plateau() {
        use std::collections::VecDeque;
        // Collects the listed residuals, in order, into the window type the
        // predicate consumes.
        let history = |residuals: &[f64]| residuals.iter().copied().collect::<VecDeque<f64>>();

        // gam#787 duchon centers≥20: the logslope block was converging
        // geometrically (~0.33×/cycle), yet `linearized_rel ≥ 0.5` together
        // with a flat objective sent it into the plateau-refusal break a few
        // cycles short of tolerance. The steady-descent guard has to keep such
        // a run iterating.
        let geometric = history(&[6.985e-4, 2.388e-4, 7.987e-5, 2.597e-5]);
        assert!(
            residual_in_steady_geometric_descent(&geometric),
            "a steadily ~0.33x/cycle descending residual must be recognized as converging"
        );

        // Genuine multiplier/null plateau: flat, oscillating, above tolerance.
        let flat = history(&[2.066e0, 2.063e0, 2.066e0, 2.063e0]);
        assert!(
            !residual_in_steady_geometric_descent(&flat),
            "a flat/oscillating residual plateau must NOT be treated as converging"
        );

        // One lucky drop at the end of an otherwise flat window does not count.
        let one_drop = history(&[2.0e0, 2.0e0, 1.0e-3]);
        assert!(
            !residual_in_steady_geometric_descent(&one_drop),
            "a single-cycle drop must not be mistaken for steady descent"
        );

        // Not enough cycles to judge steadiness at all.
        let too_short = history(&[1.0e-3, 3.0e-4]);
        assert!(
            !residual_in_steady_geometric_descent(&too_short),
            "fewer than the window of cycles must not assert steady descent"
        );
    }

    /// With the accepted step below `step_tol`, the certificate's verdict is
    /// decided by the projected residual: `Accept` inside the 4x band around
    /// `residual_tol`, `RefusePhantomMultiplier` beyond it.
    #[test]
    fn constrained_stationary_certificate_refuses_only_when_step_is_exhausted() {
        let objective_change = 3.421e-1;
        let objective_tol = 3.479e-1;
        let step_tol = 1.0e-6;
        let residual_tol = 2.707e-2;
        let exhausted = JointNewtonMathDiagnostic {
            old_kkt_inf: 2.708e4,
            linearized_next_kkt_inf: 2.707e4,
            predicted_reduction: 3.421e-1,
            actual_reduction: 3.421e-1,
            trust_ratio: 1.0,
            step_inf: 2.891e-7,
            proposal_inf: 2.891e-7,
        };
        // Everything but the projected residual is held fixed across the cases.
        let verdict_at = |residual| {
            constrained_stationary_certificate_decision(
                &exhausted,
                objective_change,
                objective_tol,
                step_tol,
                None,
                residual,
                residual_tol,
            )
        };

        // Within the certification band (`residual <= 4x residual_tol`, the
        // documented gam#797 conditioning/round-off allowance) a fully
        // stationary iterate is accepted.
        assert_eq!(
            verdict_at(residual_tol),
            ConstrainedStationaryCertificate::Accept,
        );
        // Still inside 4x: a residual a hair above 1x has to stay accepted,
        // because the active-projected residual genuinely floors just above
        // the scale-relative tolerance.
        assert_eq!(
            verdict_at(residual_tol + 1.0e-12),
            ConstrainedStationaryCertificate::Accept,
        );
        // Past the 4x band the residual is too large to be a conditioning
        // floor; the certificate must refuse the phantom multiplier instead of
        // faking convergence.
        assert_eq!(
            verdict_at(4.0 * residual_tol + 1.0e-6),
            ConstrainedStationaryCertificate::RefusePhantomMultiplier,
        );
    }

    /// Negative case: a genuinely non-stationary state must not match the
    /// certificate's pattern. The numbers describe a linear solve that
    /// successfully neutralises g (Newton making real progress on an
    /// unconstrained problem); the test checks that the certificate's
    /// `linearized_rel >= 0.5` precondition is not met for them.
    #[test]
    fn joint_newton_math_unconstrained_progress_does_not_match_certificate() {
        // Unconstrained Newton: the linearized residual drops from 1e3 to 1e-9.
        let progressing = JointNewtonMathDiagnostic {
            old_kkt_inf: 1.0e3,
            linearized_next_kkt_inf: 1.0e-9,
            predicted_reduction: 5.0e-1,
            actual_reduction: 5.0e-1,
            trust_ratio: 1.0,
            step_inf: 1.0e-1,
            proposal_inf: 1.0e-1,
        };
        let remaining_rel = progressing.linearized_next_kkt_inf / (1.0 + progressing.old_kkt_inf);
        assert!(
            remaining_rel < 0.5,
            "unconstrained Newton must have linearized_rel < 0.5 (was {:.3e})",
            remaining_rel,
        );
    }

    /// Checks the projected stationarity norm against one coupled constraint
    /// row `a = [1, 1]`, `b = 1`: a residual `[3, 3]` at a point with
    /// `a·β = b` projects to zero, while the sign-flipped residual, or the same
    /// residual at a point with `a·β = 1.5`, keeps its full magnitude of 3.
    #[test]
    fn projected_stationarity_inf_norm_projects_coupled_linear_kkt_multipliers() {
        assert!(file!().ends_with(".rs"));
        let coupled_row = LinearInequalityConstraints {
            a: array![[1.0, 1.0]],
            b: array![1.0],
        };
        let beta_on_row = array![0.25, 0.75];
        let beta_off_row = array![0.75, 0.75];
        let multiplier_like = array![3.0, 3.0];
        let flipped = array![-3.0, -3.0];

        // Residual along the constraint row at the active point: fully
        // absorbed, both as a norm and component-wise.
        let absorbed_inf = projected_stationarity_inf_norm(
            &multiplier_like,
            &beta_on_row,
            Some(&coupled_row),
            None,
        );
        assert_relative_eq!(absorbed_inf, 0.0_f64, epsilon = 1e-10);
        let absorbed_vec = projected_linear_constraint_stationarity_vector(
            &multiplier_like,
            &beta_on_row,
            &coupled_row,
            None,
        )
        .expect("coupled active projection should succeed");
        for idx in 0..2 {
            assert_relative_eq!(absorbed_vec[idx], 0.0_f64, epsilon = 1e-10);
        }

        // Same direction with the opposite sign must stay visible.
        let flipped_inf =
            projected_stationarity_inf_norm(&flipped, &beta_on_row, Some(&coupled_row), None);
        assert_relative_eq!(flipped_inf, 3.0_f64, epsilon = 1e-12);

        // Away from the constraint row nothing is projected out.
        let interior_inf = projected_stationarity_inf_norm(
            &multiplier_like,
            &beta_off_row,
            Some(&coupled_row),
            None,
        );
        assert_relative_eq!(interior_inf, 3.0_f64, epsilon = 1e-12);
    }

    /// One 2-coefficient block with an identity design, zero penalty and a
    /// single coupled constraint row `[1, 1]`, `b = 1`, evaluated at
    /// `β = [0.25, 0.75]`. A residual along the constraint row must project to
    /// zero in both the scalar norm and the IFT residual vector; the
    /// opposite-signed gradient must keep its magnitude of 4.
    #[test]
    fn joint_stationarity_from_gradient_projects_coupled_linear_constraints() {
        assert!(file!().ends_with(".rs"));
        let identity_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 1.0]
        ]));
        let block_spec = ParameterBlockSpec {
            name: "coupled".to_string(),
            design: identity_design,
            offset: array![0.0, 0.0],
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let block_state = ParameterBlockState {
            beta: array![0.25, 0.75],
            eta: array![0.25, 0.75],
        };
        let block_constraints = [Some(LinearInequalityConstraints {
            a: array![[1.0, 1.0]],
            b: array![1.0],
        })];
        let zero_penalty = vec![Array2::<f64>::zeros((2, 2))];

        // residual = S beta - gradient = [4, 4] = A_active^T lambda with
        // lambda = 4: a valid constrained KKT point, which must not show up as
        // a large free-gradient residual.
        let multiplier_residual = array![4.0, 4.0];
        let kkt_gradient = -&multiplier_residual;

        let projected_inf = exact_newton_joint_stationarity_inf_norm_from_gradient(
            &kkt_gradient,
            std::slice::from_ref(&block_state),
            std::slice::from_ref(&block_spec),
            &zero_penalty,
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            &block_constraints,
            None,
        )
        .expect("stationarity projection should succeed");
        assert_relative_eq!(projected_inf, 0.0_f64, epsilon = 1e-10);

        let ift_residual = exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
            &kkt_gradient,
            std::slice::from_ref(&block_spec),
            std::slice::from_ref(&block_state),
            &zero_penalty,
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            &block_constraints,
            None,
        )
        .expect("KKT residual assembly should succeed")
        .expect("exact-gradient path should produce residual");
        for idx in 0..2 {
            assert_relative_eq!(ift_residual.as_array()[idx], 0.0_f64, epsilon = 1e-10);
        }

        // A wrong-signed normal residual means the active constraint wants to
        // release; that is not convergence and has to remain visible. Using
        // the multiplier residual itself as the gradient flips the sign.
        let released_inf = exact_newton_joint_stationarity_inf_norm_from_gradient(
            &multiplier_residual,
            std::slice::from_ref(&block_state),
            std::slice::from_ref(&block_spec),
            &zero_penalty,
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            &block_constraints,
            None,
        )
        .expect("stationarity projection should succeed");
        assert_relative_eq!(released_inf, 4.0_f64, epsilon = 1e-12);
    }

    /// Supplies a cached joint gradient built as `S β - r` for a known `r` and
    /// checks that the IFT KKT residual returned is exactly `r`. The family
    /// passed in is `OneBlockAlwaysErrorFamily`, so (per the `expect` message)
    /// the call can only succeed if the cached-gradient path never evaluates
    /// the family.
    #[test]
    fn kkt_residual_uses_cached_joint_gradient_without_re_evaluating_family() {
        assert!(file!().ends_with(".rs"));
        let identity_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 1.0]
        ]));
        let block_spec = ParameterBlockSpec {
            name: "cached-gradient".to_string(),
            design: identity_design,
            offset: array![0.0, 0.0],
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let block_state = ParameterBlockState {
            beta: array![2.0, -1.0],
            eta: array![2.0, -1.0],
        };
        let penalty = Array2::<f64>::eye(2);
        let target_residual = array![0.25, -0.5];
        // Choose the gradient so that S β - gradient equals the target.
        let gradient_cache = penalty.dot(&block_state.beta) - &target_residual;

        let kkt = exact_newton_joint_kkt_residual_for_ift_from_cached_gradient(
            &OneBlockAlwaysErrorFamily,
            std::slice::from_ref(&block_spec),
            std::slice::from_ref(&block_state),
            std::slice::from_ref(&penalty),
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            None,
            Some(&gradient_cache),
        )
        .expect("cached gradient path should not call family.evaluate()")
        .expect("cached gradient should produce a KKT residual");

        for idx in 0..2 {
            assert_relative_eq!(
                kkt.as_array()[idx],
                target_residual[idx],
                epsilon = 1e-12
            );
        }
    }

    /// With `S = diag(2, 3)`, `β = [10, -4]` and gradient `[19.5, -12.25]`,
    /// the unconstrained projected stationarity vector must be the penalized
    /// residual `[0.5, 0.25]` (the value `S β - gradient` takes here), not the
    /// raw gradient.
    #[test]
    fn projected_stationarity_vector_uses_penalized_residual_not_raw_score() {
        let identity_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(array![
            [1.0, 0.0],
            [0.0, 1.0]
        ]));
        let block_spec = ParameterBlockSpec {
            name: "score-cancellation".to_string(),
            design: identity_design,
            offset: array![0.0, 0.0],
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let block_state = ParameterBlockState {
            beta: array![10.0, -4.0],
            eta: array![10.0, -4.0],
        };
        let penalty = array![[2.0, 0.0], [0.0, 3.0]];
        let score = array![19.5, -12.25];

        let stationarity = exact_newton_joint_projected_stationarity_vector_from_gradient(
            &score,
            std::slice::from_ref(&block_state),
            std::slice::from_ref(&block_spec),
            std::slice::from_ref(&penalty),
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            &[None],
            None,
        )
        .expect("projected stationarity residual should assemble");

        // 2·10 - 19.5 and 3·(-4) - (-12.25).
        assert_relative_eq!(stationarity[0], 0.5, epsilon = 1e-12);
        assert_relative_eq!(stationarity[1], 0.25, epsilon = 1e-12);
    }

    /// Every product and row-chunk accessor of `ZeroPsiDerivativeOperator`
    /// must return exact zeros of the right shape for an `(n, p) = (17, 5)`
    /// operator, and the operator must not offer a materializable view.
    #[test]
    fn zero_psi_derivative_operator_acts_as_zero_map() {
        let (n, p) = (17usize, 5usize);
        let zero_op = ZeroPsiDerivativeOperator::new(n, p);

        assert_eq!(zero_op.n_data(), n);
        assert_eq!(zero_op.p_out(), p);

        // Deliberately non-zero probes so that a non-zero map would show.
        let coef_probe: Array1<f64> = (0..p).map(|k| 1.0 + k as f64).collect();
        let data_probe: Array1<f64> = (0..n).map(|k| 1.0 - 0.5 * k as f64).collect();

        // First-order products.
        let forward = zero_op
            .forward_mul(0, &coef_probe.view())
            .expect("forward_mul");
        assert_eq!(forward.len(), n);
        assert!(forward.iter().all(|&entry| entry == 0.0));

        let transposed = zero_op
            .transpose_mul(0, &data_probe.view())
            .expect("transpose_mul");
        assert_eq!(transposed.len(), p);
        assert!(transposed.iter().all(|&entry| entry == 0.0));

        // Second-order diagonal products.
        let forward_diag = zero_op
            .forward_mul_second_diag(0, &coef_probe.view())
            .expect("forward_mul_second_diag");
        assert_eq!(forward_diag.len(), n);
        assert!(forward_diag.iter().all(|&entry| entry == 0.0));

        let transposed_diag = zero_op
            .transpose_mul_second_diag(0, &data_probe.view())
            .expect("transpose_mul_second_diag");
        assert_eq!(transposed_diag.len(), p);
        assert!(transposed_diag.iter().all(|&entry| entry == 0.0));

        // Second-order cross products.
        let forward_cross = zero_op
            .forward_mul_second_cross(0, 1, &coef_probe.view())
            .expect("forward_mul_second_cross");
        assert_eq!(forward_cross.len(), n);
        assert!(forward_cross.iter().all(|&entry| entry == 0.0));

        let transposed_cross = zero_op
            .transpose_mul_second_cross(0, 1, &data_probe.view())
            .expect("transpose_mul_second_cross");
        assert_eq!(transposed_cross.len(), p);
        assert!(transposed_cross.iter().all(|&entry| entry == 0.0));

        // Row-chunk accessors: the shape follows the requested row window.
        let rows_first = zero_op.row_chunk_first(0, 3..7).expect("row_chunk_first");
        assert_eq!(rows_first.dim(), (4, p));
        assert!(rows_first.iter().all(|&entry| entry == 0.0));

        let rows_diag = zero_op
            .row_chunk_second_diag(0, 0..n)
            .expect("row_chunk_second_diag");
        assert_eq!(rows_diag.dim(), (n, p));
        assert!(rows_diag.iter().all(|&entry| entry == 0.0));

        let rows_cross = zero_op
            .row_chunk_second_cross(0, 1, 1..3)
            .expect("row_chunk_second_cross");
        assert_eq!(rows_cross.dim(), (2, p));
        assert!(rows_cross.iter().all(|&entry| entry == 0.0));

        // The in-place row accessor must overwrite the pre-filled buffer.
        let mut row_buffer = Array1::from_elem(p, 9.5);
        zero_op
            .row_vector_first_into(0, 4, row_buffer.view_mut())
            .expect("row_vector_first_into");
        assert!(row_buffer.iter().all(|&entry| entry == 0.0));

        // No dense materialization may be advertised: production hot paths
        // depend on that to avoid forming an (n, p) buffer.
        assert!(zero_op.as_materializable().is_none());
    }

    /// At large scale (n=320 000, p=101) one dense `Array2::zeros((n, p))` for
    /// an unused ψ-derivative slot costs ≈ 0.24 GiB, and the spatial-adaptive
    /// baseline used to allocate one per ψ coordinate (≈ 1.4 GiB of
    /// guaranteed-zero memory at six coordinates). A `(0, 0)` shape sentinel
    /// with no implicit operator must still resolve to `PsiDesignMap::Zero`
    /// carrying the requested dimensions, so callers get exact-zero semantics
    /// with O(1) memory.
    #[test]
    fn spatial_adaptive_zero_xpsi_uses_zero_map_without_dense_allocation() {
        let (n, p) = (320_000usize, 101usize);
        let sentinel = CustomFamilyBlockPsiDerivative {
            penalty_index: None,
            x_psi: Array2::<f64>::zeros((0, 0)),
            s_psi: Array2::<f64>::zeros((0, 0)),
            s_psi_components: None,
            s_psi_penalty_components: None,
            x_psi_psi: None,
            s_psi_psi: None,
            s_psi_psi_components: None,
            s_psi_psi_penalty_components: None,
            implicit_operator: None,
            implicit_axis: 0,
            implicit_group_id: None,
        };
        let policy = ResourcePolicy::default_library();
        let resolved = resolve_custom_family_x_psi_map(
            &sentinel,
            n,
            p,
            0..n,
            "spatial-adaptive zero sentinel",
            &policy,
        )
        .expect("resolve x_psi map for (0, 0)-sentinel deriv");

        if let PsiDesignMap::Zero { nrows, ncols } = &resolved {
            assert_eq!(*nrows, n);
            assert_eq!(*ncols, p);
        } else {
            panic!(
                "(0, 0) x_psi sentinel must resolve to PsiDesignMap::Zero, got {:?}",
                std::mem::discriminant(&resolved)
            );
        }
    }

    /// A ψ-derivative whose implicit operator is a `ZeroPsiDerivativeOperator`
    /// must resolve to first- and second-order design maps whose products and
    /// row chunks are exact zeros of the expected shape.
    #[test]
    fn zero_psi_derivative_operator_resolves_to_zero_design_map() {
        let (n, p) = (12usize, 4usize);
        let implicit_zero: Arc<dyn CustomFamilyPsiDerivativeOperator> =
            Arc::new(ZeroPsiDerivativeOperator::new(n, p));
        let zero_deriv = CustomFamilyBlockPsiDerivative {
            penalty_index: None,
            x_psi: Array2::<f64>::zeros((0, 0)),
            s_psi: Array2::<f64>::zeros((0, 0)),
            s_psi_components: None,
            s_psi_penalty_components: None,
            x_psi_psi: None,
            s_psi_psi: None,
            s_psi_psi_components: None,
            s_psi_psi_penalty_components: None,
            implicit_operator: Some(implicit_zero),
            implicit_axis: 0,
            implicit_group_id: None,
        };
        let policy = ResourcePolicy::default_library();
        let coef_probe: Array1<f64> = (0..p).map(|k| 1.0 + k as f64).collect();

        // First-order map: forward product and a 3-row window.
        let first_map = resolve_custom_family_x_psi_map(&zero_deriv, n, p, 0..n, "zero", &policy)
            .expect("resolve x_psi map");
        let first_forward = first_map
            .forward_mul(coef_probe.view())
            .expect("forward_mul map");
        assert_eq!(first_forward.len(), n);
        assert!(first_forward.iter().all(|&entry| entry == 0.0));

        let first_rows = first_map.row_chunk(2..5).expect("row_chunk map");
        assert_eq!(first_rows.dim(), (3, p));
        assert!(first_rows.iter().all(|&entry| entry == 0.0));

        // Second-order map of the derivative paired with itself.
        let second_map = resolve_custom_family_x_psi_psi_map(
            &zero_deriv,
            &zero_deriv,
            0,
            n,
            p,
            0..n,
            "zero",
            &policy,
        )
        .expect("resolve x_psi_psi map");
        let second_forward = second_map
            .forward_mul(coef_probe.view())
            .expect("forward_mul second");
        assert_eq!(second_forward.len(), n);
        assert!(second_forward.iter().all(|&entry| entry == 0.0));
    }

    /// For a row-wise Kronecker ψ operator built over an embedded dense base,
    /// every row-chunk accessor restricted to the window `1..5` must equal the
    /// same rows sliced from the corresponding full result (the dense
    /// materialization for the first derivative, the full-range row chunk for
    /// the second-order diag and cross terms).
    #[test]
    fn rowwise_kronecker_psi_row_chunks_are_window_consistent() {
        let psi_first = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let psi_second_diag = array![[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]];
        let cross_terms = vec![(1, array![[-1.0, 0.25], [-1.5, 0.5], [-2.0, 0.75]])];
        let embedded = build_embedded_dense_psi_operator(
            &psi_first,
            &psi_second_diag,
            Some(&cross_terms),
            0..2,
            2,
            0,
        )
        .expect("embedded dense base");

        let time_factors = vec![
            Arc::new(array![[1.0, 0.0], [0.5, 1.0], [1.5, -0.5]]),
            Arc::new(array![[0.25, 2.0], [-1.0, 0.75], [0.0, 1.25]]),
        ];
        let kron_op = build_rowwise_kronecker_psi_operator(embedded, time_factors)
            .expect("rowwise kronecker psi operator");
        let dense_ref = kron_op
            .as_materializable()
            .expect("rowwise operator dense reference");
        let window = 1..5;

        // First derivative: chunk vs. the dense materialization.
        let dense_first = dense_ref.materialize_first(0).expect("dense first");
        let windowed_first = kron_op
            .row_chunk_first(0, window.clone())
            .expect("chunk first");
        let expected_first = dense_first.slice(ndarray::s![window.clone(), ..]).to_owned();
        assert_eq!(windowed_first, expected_first);

        // Second-order diagonal: chunk vs. the full-range row chunk.
        let full_diag = kron_op
            .row_chunk_second_diag(0, 0..kron_op.n_data())
            .expect("full row-chunk diag");
        let windowed_diag = kron_op
            .row_chunk_second_diag(0, window.clone())
            .expect("chunk diag");
        let expected_diag = full_diag.slice(ndarray::s![window.clone(), ..]).to_owned();
        assert_eq!(windowed_diag, expected_diag);

        // Second-order cross term: same comparison.
        let full_cross = kron_op
            .row_chunk_second_cross(0, 1, 0..kron_op.n_data())
            .expect("full row-chunk cross");
        let windowed_cross = kron_op
            .row_chunk_second_cross(0, 1, window.clone())
            .expect("chunk cross");
        let expected_cross = full_cross.slice(ndarray::s![window, ..]).to_owned();
        assert_eq!(windowed_cross, expected_cross);
    }

    /// Pins the accept/reject flag, the ratio `rho`, the updated radius and
    /// the decision label of `update_joint_trust_region_radius` for four
    /// representative inputs.
    #[test]
    fn joint_trust_region_radius_update_accept_reject_logic() {
        let near = |value: f64, target: f64| (value - target).abs() < 1.0e-12;

        // Actual reduction equals the predicted one: accepted, radius 1 -> 2.
        let grown = update_joint_trust_region_radius(1.0, 1.0, 2.0, 2.0, 1.0);
        assert!(grown.accepted);
        assert!(near(grown.rho, 1.0));
        assert!(near(grown.radius, 2.0));
        assert_eq!(grown.decision.label(), "grow_at_boundary");

        // Negative actual reduction: rejected, radius drops to 0.25.
        let shrunk = update_joint_trust_region_radius(1.0, 0.5, -0.1, 2.0, 1.0);
        assert!(!shrunk.accepted);
        assert!(shrunk.rho < 0.0);
        assert!(near(shrunk.radius, 0.25));
        assert_eq!(shrunk.decision.label(), "shrink_reject");

        // Same rejection with second argument 1e-3: the new radius (5e-4)
        // falls below that value.
        let shrunk_inside = update_joint_trust_region_radius(1.0, 1.0e-3, -0.1, 2.0, 1.0);
        assert!(!shrunk_inside.accepted);
        assert!(
            shrunk_inside.radius < 1.0e-3,
            "a rejected in-radius step must be outside the next trust region"
        );
        assert!(near(shrunk_inside.radius, 5.0e-4));
        assert_eq!(shrunk_inside.decision.label(), "shrink_reject");

        // Small positive ratio (0.1): accepted, but the radius still shrinks.
        let marginal = update_joint_trust_region_radius(1.0, 0.5, 0.1, 1.0, 1.0);
        assert!(marginal.accepted);
        assert!(near(marginal.rho, 0.1));
        assert!(near(marginal.radius, 0.25));
        assert_eq!(marginal.decision.label(), "shrink_marginal_accept");
    }

    /// A negative actual reduction whose magnitude is below the
    /// `objective_scale * 1e-14` noise floor, paired with a sub-floor
    /// predicted reduction, must be accepted with `rho = 1`.
    #[test]
    fn joint_trust_region_noise_floor_accepts_round_off_negative_actual() {
        // Near-converged iterate at large objective scale. Both the predicted
        // decrease and the realized change sit below the noise floor, where
        // round-off can flip the sign of `actual`; the principled response is
        // to accept (rho ≈ 1) instead of failing on the sign of noise. This
        // mirrors the noise-floor branch in `src/solver/pirls.rs`.
        let scale = 1.66e5;
        let floor = scale * 1e-14;
        let sub_floor_predicted = floor * 0.1;
        let sign_flipped_actual = -floor * 0.5;

        let outcome = update_joint_trust_region_radius(
            1.0,
            0.05,
            sign_flipped_actual,
            sub_floor_predicted,
            scale,
        );
        assert!(
            outcome.accepted,
            "sub-noise-floor sign flip must not reject as failure"
        );
        assert!((outcome.rho - 1.0).abs() < 1.0e-12);
    }

    /// An objective increase of 1.0, far above the noise floor, must be
    /// rejected with `rho = -inf` even though the predicted reduction is
    /// below the floor.
    #[test]
    fn joint_trust_region_noise_floor_rejects_genuine_increase() {
        // A sub-floor predicted reduction must not excuse a real objective
        // increase: this is model failure, not round-off.
        let scale = 1.66e5;
        let floor = scale * 1e-14;
        let sub_floor_predicted = floor * 0.1;
        let real_increase = -1.0;

        let outcome =
            update_joint_trust_region_radius(1.0, 0.5, real_increase, sub_floor_predicted, scale);
        assert!(
            !outcome.accepted,
            "objective increase beyond noise must reject"
        );
        assert!(outcome.rho == f64::NEG_INFINITY);
    }

    /// At an objective of about 1.2e5, an increase of 2.183e-10 must fall
    /// within `joint_objective_roundoff_slack` of the old objective.
    #[test]
    fn joint_objective_roundoff_slack_accepts_large_scale_wobble() {
        let before = 1.218530e5;
        let after = before + 2.183e-10;
        let slack = joint_objective_roundoff_slack(before, after);
        assert!(
            after <= before + slack,
            "sub-nanounit objective wobble at large scale should not burn all trust attempts"
        );
    }

    /// `joint_objective_floor_reached` must fire only for a noise-level
    /// *negative* actual reduction paired with a negligible predicted
    /// reduction. Real increases, non-negligible predicted progress, and
    /// positive noise-level reductions must all leave it false.
    #[test]
    fn joint_objective_floor_only_accepts_sub_tolerance_model_steps() {
        let old_objective = 1.218942e5_f64;
        let objective_tol = 1e-6 * (1.0 + old_objective.abs());
        // The old objective and the tolerance are shared by every case.
        let floor_reached = |trial_objective, actual_reduction, predicted_reduction| {
            joint_objective_floor_reached(
                old_objective,
                trial_objective,
                actual_reduction,
                predicted_reduction,
                objective_tol,
            )
        };

        let wobble_actual = -3.783e-10;
        let negligible_predicted = 9.481e-15;
        let wobble_trial = old_objective - wobble_actual;
        assert!(
            floor_reached(wobble_trial, wobble_actual, negligible_predicted),
            "the repeated large-scale roundoff wobble should terminate immediately"
        );

        assert!(
            !floor_reached(old_objective + 2.0, -2.0, negligible_predicted),
            "real objective increases must still be rejected"
        );
        assert!(
            !floor_reached(wobble_trial, wobble_actual, 10.0 * objective_tol),
            "non-negligible predicted progress must not be hidden by the floor exit"
        );

        // The guard is asymmetric: a positive-but-noise-level
        // `actual_reduction` must NOT trigger the floor. At rank-deficient
        // optima the outer-gradient FD identity
        // (`outer_lamlgradient_matches_finite_differencewhen_joint_exact_path_is_active`,
        // inner_tol=1e-12) depends on the trust-region loop running the same
        // number of attempts at neighbouring λ probes. Accepting positive-noise
        // reductions would exit a cycle earlier on whichever probe round-off
        // happened to land positive, decorrelating the null-space drift.
        let positive_noise_actual = 3.783e-10_f64;
        let positive_noise_trial = old_objective - positive_noise_actual;
        assert!(
            !floor_reached(
                positive_noise_trial,
                positive_noise_actual,
                negligible_predicted
            ),
            "positive-noise reductions must NOT trigger the floor; symmetric exit breaks rank-deficient FD identity"
        );
    }

    /// Replays a logged inner-solve stall in which the objective and step are
    /// flat but the KKT residual is far above tolerance, and checks that
    /// `joint_inner_kkt_converged` rejects it, including at 1.5x tolerance.
    #[test]
    fn joint_inner_convergence_rejects_objective_flat_non_kkt_stall() {
        // Direct reproduction of the bad 0.1.79 log shape:
        //
        //   obj=4.472714e5 Δobj=5.381e-2 |δ|∞=2.794e-2
        //   residual=5.980e1 tol=4.473e-1
        //
        // At this scale both the objective and the step are flat, yet the KKT
        // residual is 134x tolerance. Treating that as an inner optimum
        // invalidates the envelope-theorem outer gradient, which is what
        // surfaced as outer BFGS objective stalls with |g|≈1e14-1e16.
        let logged_objective = 4.472714e5_f64;
        let logged_objective_change = 5.381e-2_f64;
        let logged_step_inf = 2.794e-2_f64;
        let logged_residual = 5.980e1_f64;
        let inner_tol = 1.0e-6_f64;
        let step_tol = 1.242e-3_f64;

        let residual_tol = inner_tol * (1.0 + logged_objective);
        let objective_tol = residual_tol;

        // The historical predicate looked only at objective and step flatness.
        let step_is_flat = logged_step_inf <= objective_tol.sqrt().max(step_tol);
        let historical_accept = logged_objective_change <= objective_tol && step_is_flat;
        assert!(
            historical_accept,
            "the historical objective-flat/step-flat predicate would have accepted this stalled inner solve"
        );

        assert!(
            !joint_inner_kkt_converged(logged_residual, residual_tol),
            "inner convergence must require KKT residual <= tolerance"
        );
        assert!(
            !joint_inner_kkt_converged(1.5 * residual_tol, residual_tol),
            "near-miss residual slack would still invalidate the outer envelope gradient"
        );
    }

    /// Builds a diagonal three-block quadratic model (time, marginal, log
    /// blocks of width 12/11/10) with one very stiff time coefficient and one
    /// nearly flat log coefficient. Truncating the Newton step to a single
    /// global L2 radius of 20 leaves the linearized KKT residual essentially
    /// unreduced, whereas per-block metric radii let the time and marginal
    /// blocks take their full steps and bring it below 1e-6.
    #[test]
    fn joint_trust_region_block_metric_does_not_starve_unrelated_blocks() {
        const TIME_W: usize = 12;
        const MARG_W: usize = 11;
        const LOG_W: usize = 10;
        const P: usize = TIME_W + MARG_W + LOG_W;
        const LOG_START: usize = TIME_W + MARG_W;

        // Diagonal Hessian and gradient, filled one coordinate at a time.
        let mut h = Array2::<f64>::zeros((P, P));
        let mut g = Array1::<f64>::zeros(P);
        let mut set_coord = |idx: usize, curvature: f64, gradient: f64| {
            h[[idx, idx]] = curvature;
            g[idx] = gradient;
        };
        set_coord(0, 2.24e8, -5.6e8);
        for i in 1..TIME_W {
            set_coord(i, 1.0 + 0.3 * i as f64, -0.3 - 0.07 * i as f64);
        }
        for j in 0..MARG_W {
            set_coord(TIME_W + j, 1.2 + 0.2 * j as f64, -0.9);
        }
        set_coord(LOG_START, 1.0e-5, -2.173);
        for k in 1..LOG_W {
            set_coord(LOG_START + k, 1.5 + 0.1 * k as f64, -0.4);
        }

        // Exact Newton step of the diagonal model.
        let newton: Array1<f64> = (0..P).map(|i| -g[i] / h[[i, i]]).collect();

        // Relative inf-norm of the linearized residual g + H·step.
        let gradient_inf = g.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
        let linearized_rel = |step: &Array1<f64>| -> f64 {
            let residual_inf = (&g + &h.dot(step))
                .iter()
                .map(|v| v.abs())
                .fold(0.0_f64, f64::max);
            residual_inf / (1.0 + gradient_inf)
        };

        // Baseline: scale the whole concatenated step back to L2 norm 20.
        let raw_norm = newton.iter().map(|v| v * v).sum::<f64>().sqrt();
        let raw_global = if raw_norm.is_finite() && raw_norm > 20.0 {
            newton.mapv(|v| v * (20.0 / raw_norm))
        } else {
            newton.clone()
        };
        let raw_linearized = linearized_rel(&raw_global);
        assert!(
            raw_linearized > 0.99,
            "raw concatenated L2 truncation should reproduce the starvation mechanism"
        );

        // Block-metric truncation: the first two blocks keep radii equal to
        // their own full-step norms, only the log block is capped at 20.
        let ranges = vec![(0, TIME_W), (TIME_W, LOG_START), (LOG_START, P)];
        let metric_diag = h.diag().to_owned();
        let full_block_norms =
            joint_trust_region_block_metric_norms(&newton, &ranges, &metric_diag);
        let block_radii = vec![full_block_norms[0], full_block_norms[1], 20.0];
        let mut block_metric = newton.clone();
        truncate_joint_step_to_block_metric_radii(
            &mut block_metric,
            &ranges,
            &metric_diag,
            &block_radii,
        );
        let block_linearized = linearized_rel(&block_metric);
        assert!(
            block_linearized < 1.0e-6,
            "block-local curvature metric must let the time block neutralize its KKT defect; got {block_linearized:.3e}"
        );
    }

    #[test]
    fn shrink_active_joint_block_trust_radii_strictly_decreases_max_radius() {
        // Regression for the joint-Newton fully-rejected stall: with a boundary
        // block already pinned at the 1e-12 radius floor and an interior block
        // holding the maximum radius, the shrink used to hand back the very
        // same `max(block_radii)` on every call. The trust region therefore
        // never contracted, the dogleg reproduced an identical joint δ, and the
        // inner solver spent `inner_loop_hard_ceiling` cycles until the 8-cycle
        // stall guard stepped in. Every call must now strictly reduce the joint
        // trust radius until the floor is reached.
        //
        // Fixture: block #0 is interior (step far inside its radius); block #1
        // sits on the floor with its step on the boundary. Pre-fix, only block
        // #1 took part, was re-clamped to 1e-12, and the returned maximum
        // stayed at 1.0 — byte-identical to the previous call.
        let step_norms = vec![1.0e-3, 1.0e-12];
        let mut radii = vec![1.0, 1.0e-12];
        let old_max = radii.iter().copied().fold(0.0_f64, f64::max);

        let new_max = shrink_active_joint_block_trust_radii(&mut radii, &step_norms, 0.25);

        assert!(
            new_max < old_max,
            "joint trust radius must strictly decrease when a step is rejected (was {old_max:.3e}, now {new_max:.3e})"
        );
        // The interior block has to end up below its current step norm, or the
        // next dogleg step in that block would not be forced strictly smaller.
        let (interior_radius, interior_step) = (radii[0], step_norms[0]);
        assert!(
            interior_radius < interior_step,
            "interior block radius must drop below its step norm to force a strictly smaller next step (radius {:.3e}, step {:.3e})",
            interior_radius,
            interior_step
        );
    }

    #[test]
    fn shrink_active_joint_block_trust_radii_pulls_radius_below_step_norm() {
        // On rejection the accept-path update (`update_joint_trust_region_radius`)
        // drops the radius under `0.5 * step_norm`, which makes the next step
        // provably smaller. The reject-path block shrink has to match that:
        // an interior block with `step_norm << factor * radius` would otherwise
        // re-take the identical Newton step on the following dogleg attempt,
        // leaving the trust-region globalization degenerate.
        let step_norms = vec![1.0e-3];
        let mut radii = vec![1.0];

        let new_max = shrink_active_joint_block_trust_radii(&mut radii, &step_norms, 0.25);

        let ceiling = 0.5 * step_norms[0];
        assert!(
            new_max <= ceiling,
            "shrunken radius must be ≤ 0.5 · step_norm to force a strictly smaller next step (was {new_max:.3e}, step {:.3e})",
            step_norms[0]
        );
    }

    /// A block step that is large in raw coefficient units only along a
    /// near-flat direction must pass through `truncate_block_step_to_metric_radius`
    /// unchanged.
    ///
    /// The fixture is a single 3-coefficient block with no penalties, a diagonal
    /// Hessian `diag(1, 1, 1e-10)`, and a proposed step `[2, -1, 2e5]`. The test
    /// first shows that scaling by the raw inf-norm to radius 20 would crush the
    /// ordinary coordinates, then asserts that the metric truncation reports a
    /// norm below the radius and returns the step untouched.
    #[test]
    fn blockwise_trust_region_uses_penalized_metric_not_raw_coefficient_size() {
        let spec = ParameterBlockSpec {
            name: "single_block".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::<f64>::zeros((1, 3)),
            )),
            offset: Array1::zeros(1),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let h: Array2<f64> = array![[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0e-10]];
        // `h` is not needed after this point, so it is moved rather than cloned.
        let work = BlockWorkingSet::ExactNewton {
            gradient: array![0.0, 0.0, 0.0],
            hessian: SymmetricMatrix::Dense(h),
        };
        let s_lambda = Array2::<f64>::zeros((3, 3));
        let raw_delta: Array1<f64> = array![2.0, -1.0, 2.0e5];
        let raw_inf = raw_delta.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
        let radius = 20.0_f64;

        // Historical behaviour: scale the whole step so its largest raw
        // coefficient equals the radius. The flat coordinate dominates, so the
        // well-conditioned coordinates shrink to ~1e-4 of their Newton value.
        let raw_inf_scaled = &raw_delta * (radius / raw_inf);
        assert!(
            raw_inf_scaled[0].abs() < 1.0e-3,
            "the old raw coefficient cap would starve ordinary coordinates inside the block"
        );

        let (metric_delta, metric_norm) = truncate_block_step_to_metric_radius(
            &spec,
            &work,
            &s_lambda,
            raw_delta,
            radius,
            0.0,
            RidgePolicy::explicit_stabilization_pospart(),
        )
        .expect("block metric truncation should succeed");
        assert!(
            metric_norm < radius,
            "the near-null coordinate is large in beta-space but small in the block's penalized-Hessian metric"
        );
        // Inside the radius, so every coordinate must come back unmodified.
        assert!(
            (metric_delta[0] - 2.0).abs() < 1.0e-12
                && (metric_delta[1] + 1.0).abs() < 1.0e-12
                && (metric_delta[2] - 2.0e5).abs() < 1.0e-6,
            "blockwise trust regions must size steps in objective curvature units, not raw coefficient units"
        );
    }

    /// With an indefinite block Hessian, `truncate_block_step_to_metric_radius`
    /// must still report a metric norm below the radius and return the step
    /// unchanged, rather than measuring the step by raw coefficient length.
    ///
    /// The fixture is a single unpenalized 3-coefficient block with Hessian
    /// `diag(1, 1, -1e-8)` and step `[2, -1, 2e5]`, so the plain quadratic form
    /// `δᵀHδ` is negative. A guard assertion confirms this before the
    /// truncation is exercised.
    #[test]
    fn blockwise_trust_region_never_reverts_to_raw_beta_norm_on_indefinite_curvature() {
        let spec = ParameterBlockSpec {
            name: "single_block".to_string(),
            design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
                Array2::<f64>::zeros((1, 3)),
            )),
            offset: Array1::zeros(1),
            penalties: vec![],
            nullspace_dims: vec![],
            initial_log_lambdas: Array1::zeros(0),
            initial_beta: None,
            gauge_priority: 100,
            jacobian_callback: None,
            stacked_design: None,
            stacked_offset: None,
        };
        let h: Array2<f64> = array![[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0e-8]];
        let raw_delta: Array1<f64> = array![2.0, -1.0, 2.0e5];
        let radius = 20.0_f64;

        // Guard: δᵀHδ computed from the fixture itself (not a hand-copied H·δ),
        // so the check cannot drift out of sync with `h` or `raw_delta`.
        let old_quadratic = raw_delta.dot(&h.dot(&raw_delta));
        assert!(
            old_quadratic < 0.0,
            "fixture must hit the historical non-SPD branch"
        );

        let work = BlockWorkingSet::ExactNewton {
            gradient: array![0.0, 0.0, 0.0],
            hessian: SymmetricMatrix::Dense(h),
        };
        let s_lambda = Array2::<f64>::zeros((3, 3));

        let (metric_delta, metric_norm) = truncate_block_step_to_metric_radius(
            &spec,
            &work,
            &s_lambda,
            raw_delta,
            radius,
            0.0,
            RidgePolicy::explicit_stabilization_pospart(),
        )
        .expect("block metric truncation should succeed");
        assert!(
            metric_norm < radius,
            "indefinite curvature must still use the positive penalized diagonal metric, not raw beta length"
        );
        // Inside the radius, so every coordinate must come back unmodified.
        assert!(
            (metric_delta[0] - 2.0).abs() < 1.0e-12
                && (metric_delta[1] + 1.0).abs() < 1.0e-12
                && (metric_delta[2] - 2.0e5).abs() < 1.0e-6,
            "non-SPD local curvature must not resurrect coefficient-space trust-region scaling"
        );
    }

    #[test]
    fn joint_trust_region_rosenbrock_like_quadratic_is_armijo_safe() {
        // Quadratic model of Rosenbrock at the valley floor in (x, y):
        //   f ≈ 0.5 · [dx, dy]ᵀ H [dx, dy],  H = [[802, -400], [-400, 200]].
        // The (2,2) entry carries a small ridge so H is SPD, and the gradient
        // is chosen so the full Newton step (1, 1) overshoots the radius —
        // truncation therefore happens before the objective is evaluated.
        const RADIUS: f64 = 0.25;
        let l2_norm = |v: &Array1<f64>| v.iter().map(|x| x * x).sum::<f64>().sqrt();

        let h: Array2<f64> = array![[802.0, -400.0], [-400.0, 200.1]];
        let unconstrained: Array1<f64> = array![1.0, 1.0];
        let gradient = -h.dot(&unconstrained);
        let rhs = -&gradient;

        // Scale the Newton step back onto the trust-region boundary.
        let unconstrained_norm = l2_norm(&unconstrained);
        assert!(unconstrained_norm > RADIUS);
        let step = unconstrained.mapv(|v| v * (RADIUS / unconstrained_norm));
        let step_norm = l2_norm(&step);
        assert!(step_norm <= RADIUS + 1.0e-12);

        // For an exactly quadratic objective the model's predicted reduction
        // has to coincide with the realised one.
        let h_step = h.dot(&step);
        let predicted = joint_quadratic_predicted_reduction(&rhs, &h_step, &step);
        let old_objective = 0.0;
        let trial_objective = gradient.dot(&step) + 0.5 * step.dot(&h_step);
        let actual = old_objective - trial_objective;
        assert!(predicted > 0.0);
        assert!((predicted - actual).abs() < 1.0e-10);

        let update =
            update_joint_trust_region_radius(RADIUS, step_norm, actual, predicted, old_objective);
        assert!(update.accepted);
        assert!(trial_objective < old_objective);
    }

    // Inline RED REPRO moved to tests/joint_newton_isotropic_tr_starvation.rs
    // so it survives in-progress refactors of the surrounding test
    // support module (this `mod tests { }` currently does not compile due
    // to `crate::test_support::*` / `test_outerobjective_andgradient` WIP).

    /// Three-block synthetic fixture whose joint Hessian is rank-deficient
    /// only inside the last block: `H = I(3) ⊕ I(3) ⊕ e0 e0ᵀ`, with every
    /// `s_lambda` zero so no penalty fills in the missing curvature.
    /// The gradient puts its mass on the last block's two null coordinates,
    /// so that block dominates the stationarity residual. The resulting
    /// report is expected to (a) diagnose `RankDeficientHPen`, (b) record a
    /// positive nullity, and (c) identify block 2 — by index and by name —
    /// as the block carrying the residual.
    #[test]
    fn kkt_refusal_report_classifies_rank_deficient_hpen_third_block() {
        let names = ["block_a", "block_b", "block_c_rank_deficient"];
        let block_widths = [3usize, 3, 3];
        let block_count = block_widths.len();
        let total_p: usize = block_widths.iter().sum();

        let mut specs: Vec<ParameterBlockSpec> = Vec::with_capacity(block_count);
        let mut states: Vec<ParameterBlockState> = Vec::with_capacity(block_count);
        let mut s_lambdas: Vec<Array2<f64>> = Vec::with_capacity(block_count);
        let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_count);
        let mut cursor = 0usize;
        for (&name, &width) in names.iter().zip(block_widths.iter()) {
            ranges.push((cursor, cursor + width));
            cursor += width;
            specs.push(ParameterBlockSpec {
                name: name.to_string(),
                design: DesignMatrix::from(Array2::<f64>::zeros((1, width))),
                offset: Array1::zeros(1),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            });
            states.push(ParameterBlockState {
                beta: Array1::zeros(width),
                eta: Array1::zeros(1),
            });
            s_lambdas.push(Array2::<f64>::zeros((width, width)));
        }

        // H = I(3) ⊕ I(3) ⊕ e0 e0ᵀ: unit diagonal on rows 0..=6, while rows 7
        // and 8 stay zero (last block has rank 1, nullity 2).
        let mut h = Array2::<f64>::zeros((total_p, total_p));
        for i in 0..7 {
            h[[i, i]] = 1.0;
        }
        let source = JointHessianSource::Dense(h);

        // Gradient lives almost entirely on the null rows 7 and 8. Because
        // β = 0 and all s_lambdas vanish, the stationarity residual is just
        // −gradient, so block 2 holds the dominant residual mass.
        let mut joint_grad = Array1::<f64>::zeros(total_p);
        joint_grad[0] = 1.0e-6;
        joint_grad[7] = 5.0;
        joint_grad[8] = 3.0;

        let cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; block_count];
        let block_constraints: Vec<Option<LinearInequalityConstraints>> = vec![None; block_count];

        let math = JointNewtonMathDiagnostic {
            old_kkt_inf: 5.0,
            linearized_next_kkt_inf: 4.9,
            predicted_reduction: 1.0e-4,
            actual_reduction: 1.0e-4,
            trust_ratio: 1.0,
            step_inf: 1.0e-9,
            proposal_inf: 1.0e-3,
        };

        let residual_tol = 1.0e-6;
        let projected_residual_inf = 5.0;

        let report = compute_kkt_refusal_report(
            42,
            &states,
            &specs,
            &s_lambdas,
            &ranges,
            Some(&joint_grad),
            &cached_active_sets,
            &block_constraints,
            Some(&source),
            total_p,
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            1.0e-9,
            1.0e-3,
            1.0,
            residual_tol,
            1.0e-6,
            1.0e-6,
            1.0e-8,
            projected_residual_inf,
            Some(&math),
        );

        // (a) diagnosis and (b) nullity.
        assert_eq!(
            report.diagnosis,
            KktRefusalDiagnosis::RankDeficientHPen,
            "block-2 rank-1 H_pen with zero s_lambdas must classify as RankDeficientHPen, got {:?}",
            report.diagnosis,
        );
        assert!(
            report.hpen_nullity_at_rank_tol > 0,
            "rank-1 block embedded in 9x9 block-diagonal H must register nullity > 0, got {}",
            report.hpen_nullity_at_rank_tol,
        );

        // (c) carrying block, by index and by name.
        assert_eq!(
            report.block_carrying_residual,
            Some(2),
            "block 2 must carry the largest |∇L − Sβ|∞ component; got {:?}, residuals={:?}",
            report.block_carrying_residual,
            report.block_residual_inf,
        );
        assert_eq!(report.block_names.len(), block_count);
        assert_eq!(
            report.block_names[2], "block_c_rank_deficient",
            "carrying-block name should be the third block",
        );

        // Rendered outputs must expose the label, the block name, and the
        // rank-deficiency guidance text.
        let structured = report.format_structured_log(residual_tol);
        assert!(
            structured.contains("rank_deficient_H_pen"),
            "structured log must surface the diagnosis label",
        );
        let bubbled_with_name = report.format_bubbled_error();
        assert!(
            bubbled_with_name.contains("block_c_rank_deficient"),
            "bubbled error must name the carrying block by spec.name",
        );
        let bubbled_with_guidance = report.format_bubbled_error();
        assert!(
            bubbled_with_guidance.contains("structural or numerical null direction"),
            "rank-deficient refusals should no longer emit the old polynomial-only guidance",
        );
    }

    /// Formatter/parser agreement check: each listed variant's `as_str()`
    /// label is placed in the `diagnosis: <label>` slot of a bubbled-error
    /// style message and must be recovered by `parse_from_error`.
    /// seed-accounting's `InnerStatus` classifier extracts diagnoses from
    /// bubbled error strings through that parser, so a label that differs
    /// between formatter and parser makes the classifier silently report
    /// "unknown" and reduces the early-exit canary to a generic
    /// non-converged result.
    #[test]
    fn kkt_refusal_diagnosis_string_round_trip_through_bubbled_error_parser() {
        let variants_under_test = [
            KktRefusalDiagnosis::RankDeficientHPen,
            KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH,
            KktRefusalDiagnosis::ActiveSetIncomplete,
            KktRefusalDiagnosis::AliasingDetectedAtFit,
        ];

        for diagnosis in variants_under_test {
            let label = diagnosis.as_str();
            // Same trailing layout `format_bubbled_error` produces: the label
            // is the last thing in the message, right after `; diagnosis: `.
            let synthetic_error = format!(
                "coupled exact-joint inner solve exited the joint Newton path before convergence \
                 — cycle=7 cert REFUSED: residual=1.0e-2 > tol=1.0e-6; \
                 diagnosis: {label}"
            );

            let parsed = KktRefusalDiagnosis::parse_from_error(&synthetic_error);
            let expected = Some(diagnosis);
            assert_eq!(
                parsed,
                expected,
                "label '{label}' must round-trip through parse_from_error; got {:?}",
                parsed,
            );
        }
    }

    /// Pins the key phrases of the per-diagnosis guidance text so the
    /// phantom-multiplier, active-set, and aliasing diagnoses stay
    /// distinguishable from one another and from a polynomial-nullspace
    /// failure. Each check reports the missing phrase together with the
    /// guidance actually produced, so a wording change is easy to pinpoint.
    #[test]
    fn kkt_refusal_guidance_distinguishes_marginal_slope_coupling_from_polynomial_nullspace() {
        let phantom = KktRefusalDiagnosis::PhantomMultiplierWithWellConditionedH.guidance();
        for needle in [
            "marginal/logslope coupling",
            "rather than a",
            "Matérn/Duchon polynomial-nullspace failure",
        ] {
            assert!(
                phantom.contains(needle),
                "phantom-multiplier guidance must contain {needle:?}; got: {phantom}",
            );
        }

        let active = KktRefusalDiagnosis::ActiveSetIncomplete.guidance();
        for needle in [
            "active-set certification failure",
            "not a polynomial-nullspace diagnosis",
        ] {
            assert!(
                active.contains(needle),
                "active-set guidance must contain {needle:?}; got: {active}",
            );
        }

        let alias = KktRefusalDiagnosis::AliasingDetectedAtFit.guidance();
        let needle = "drop or reparameterize";
        assert!(
            alias.contains(needle),
            "aliasing guidance must contain {needle:?}; got: {alias}",
        );
    }

    /// Regression canary: a synthetic 3-block fixture chosen to mimic the
    /// large-scale rank-deficient-H_pen failure mode — block-diagonal H with
    /// a fully degenerate third block and zero s_lambdas — must classify
    /// as `RankDeficientHPen` with nullity matching the structural rank
    /// deficiency. When `nullspace-lead`'s smooth-construction
    /// reparameterization lands and absorbs polynomial null spaces into
    /// the parametric block, the SAME fixture (rewritten with a
    /// full-rank reparameterized basis) should fit cleanly with no
    /// refusal. That follow-up half lives in the sibling test
    /// `rank_deficient_hpen_canary_disappears_after_nullspace_absorption`
    /// below, which runs as an ordinary `#[test]` (it is not `#[ignore]`d);
    /// the diagnosis half here asserts the failure-mode classification
    /// the rework targets.
    #[test]
    fn rank_deficient_hpen_canary_fires_on_large_scale_shaped_failure() {
        let block_widths = [4usize, 4, 4];
        let total_p: usize = block_widths.iter().sum();
        let block_count = block_widths.len();

        let mut specs: Vec<ParameterBlockSpec> = Vec::with_capacity(block_count);
        let mut states: Vec<ParameterBlockState> = Vec::with_capacity(block_count);
        let mut s_lambdas: Vec<Array2<f64>> = Vec::with_capacity(block_count);
        let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_count);
        let names = ["location_block", "scale_block", "marginal_slope_block"];
        let mut offset = 0usize;
        for (b, &width) in block_widths.iter().enumerate() {
            let start = offset;
            let end = start + width;
            offset = end;
            ranges.push((start, end));
            specs.push(ParameterBlockSpec {
                name: names[b].to_string(),
                design: DesignMatrix::from(Array2::<f64>::zeros((1, width))),
                offset: Array1::zeros(1),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            });
            states.push(ParameterBlockState {
                beta: Array1::zeros(width),
                eta: Array1::zeros(1),
            });
            s_lambdas.push(Array2::<f64>::zeros((width, width)));
        }

        // H = I(4) ⊕ I(4) ⊕ 0 — the third block is the marginal-slope
        // pathology: zero Hessian curvature on a 4-D null space the
        // penalty does not constrain (s_lambdas are zero everywhere).
        let mut h = Array2::<f64>::zeros((total_p, total_p));
        for i in 0..4 {
            h[[i, i]] = 1.0;
            h[[4 + i, 4 + i]] = 1.0;
        }
        // Marginal-slope block left as the zero matrix → nullity = 4.

        let source = JointHessianSource::Dense(h);

        // Gradient mass concentrated on the marginal-slope block. With
        // β=0 and S=0, the stationarity residual on that block equals
        // −gradient there, so the carrying block is unambiguous.
        let mut joint_grad = Array1::<f64>::zeros(total_p);
        joint_grad[8] = 4.2;
        joint_grad[9] = 1.7;
        joint_grad[10] = -2.5;
        joint_grad[11] = 0.9;

        let cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; block_count];
        let block_constraints: Vec<Option<LinearInequalityConstraints>> = vec![None; block_count];
        let math = JointNewtonMathDiagnostic {
            old_kkt_inf: 4.2,
            linearized_next_kkt_inf: 4.2,
            predicted_reduction: 0.0,
            actual_reduction: 0.0,
            trust_ratio: 0.0,
            step_inf: 0.0,
            proposal_inf: 1.0e-3,
        };

        let report = compute_kkt_refusal_report(
            123,
            &states,
            &specs,
            &s_lambdas,
            &ranges,
            Some(&joint_grad),
            &cached_active_sets,
            &block_constraints,
            Some(&source),
            total_p,
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            0.0,
            1.0e-3,
            1.0,
            1.0e-6,
            1.0e-6,
            1.0e-6,
            0.0,
            4.2,
            Some(&math),
        );

        assert_eq!(
            report.diagnosis,
            KktRefusalDiagnosis::RankDeficientHPen,
            "large-scale-shaped marginal-slope failure must classify as RankDeficientHPen \
             (this is the canary nullspace-lead's smooth-construction rework targets)",
        );
        assert!(
            report.hpen_nullity_at_rank_tol >= 4,
            "fully degenerate marginal-slope block (4 zero eigenvalues) must contribute \
             nullity >= 4; got {}",
            report.hpen_nullity_at_rank_tol,
        );
        assert_eq!(
            report.block_carrying_residual,
            Some(2),
            "marginal_slope_block (idx 2) must carry the residual; got {:?}, residuals={:?}",
            report.block_carrying_residual,
            report.block_residual_inf,
        );
        // The bubbled error string is what downstream classifiers parse, so it
        // must round-trip to the same diagnosis.
        let bubbled = report.format_bubbled_error();
        assert_eq!(
            KktRefusalDiagnosis::parse_from_error(&bubbled),
            Some(KktRefusalDiagnosis::RankDeficientHPen),
            "canary's bubbled-error string must parse back via the classifier's parser",
        );
        assert!(
            bubbled.contains("marginal-slope fits can also expose callback-owned weak directions"),
            "BMS-shaped refusal should mention the callback-owned weak-direction mechanism"
        );
    }

    /// Second half of the rank-deficiency canary, modelling the state after
    /// `nullspace-lead`'s smooth reparameterization has absorbed polynomial
    /// null spaces into the parametric block. The same three-block layout is
    /// evaluated with a full-rank (identity) joint Hessian and a zero
    /// gradient; the report must then show zero nullity and must not carry
    /// the `RankDeficientHPen` diagnosis.
    #[test]
    fn rank_deficient_hpen_canary_disappears_after_nullspace_absorption() {
        let names = ["location_block", "scale_block", "marginal_slope_block"];
        let block_widths = [4usize, 4, 4];
        let block_count = block_widths.len();
        let total_p: usize = block_widths.iter().sum();

        let mut specs: Vec<ParameterBlockSpec> = Vec::with_capacity(block_count);
        let mut states: Vec<ParameterBlockState> = Vec::with_capacity(block_count);
        let mut s_lambdas: Vec<Array2<f64>> = Vec::with_capacity(block_count);
        let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(block_count);
        let mut cursor = 0usize;
        for (&name, &width) in names.iter().zip(block_widths.iter()) {
            ranges.push((cursor, cursor + width));
            cursor += width;
            specs.push(ParameterBlockSpec {
                name: name.to_string(),
                design: DesignMatrix::from(Array2::<f64>::zeros((1, width))),
                offset: Array1::zeros(1),
                penalties: vec![],
                nullspace_dims: vec![],
                initial_log_lambdas: Array1::zeros(0),
                initial_beta: None,
                gauge_priority: 100,
                jacobian_callback: None,
                stacked_design: None,
                stacked_offset: None,
            });
            states.push(ParameterBlockState {
                beta: Array1::zeros(width),
                eta: Array1::zeros(1),
            });
            s_lambdas.push(Array2::<f64>::zeros((width, width)));
        }

        // Post-absorption shape: H is the identity over all three blocks, so
        // every coordinate is identified by the likelihood Hessian and no
        // null direction is left inside the smooth.
        let source = JointHessianSource::Dense(Array2::<f64>::eye(total_p));
        let joint_grad = Array1::<f64>::zeros(total_p);
        let cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; block_count];
        let block_constraints: Vec<Option<LinearInequalityConstraints>> = vec![None; block_count];
        let math = JointNewtonMathDiagnostic {
            old_kkt_inf: 0.0,
            linearized_next_kkt_inf: 0.0,
            predicted_reduction: 0.0,
            actual_reduction: 0.0,
            trust_ratio: 1.0,
            step_inf: 0.0,
            proposal_inf: 0.0,
        };

        let report = compute_kkt_refusal_report(
            0,
            &states,
            &specs,
            &s_lambdas,
            &ranges,
            Some(&joint_grad),
            &cached_active_sets,
            &block_constraints,
            Some(&source),
            total_p,
            0.0,
            RidgePolicy::explicit_stabilization_full(),
            0.0,
            0.0,
            1.0,
            1.0e-6,
            1.0e-6,
            1.0e-6,
            0.0,
            0.0,
            Some(&math),
        );

        assert_eq!(
            report.hpen_nullity_at_rank_tol, 0,
            "post-absorption: full-rank H_pen must register nullity 0",
        );
        assert_ne!(
            report.diagnosis,
            KktRefusalDiagnosis::RankDeficientHPen,
            "post-absorption: the rank-deficiency diagnosis must no longer fire",
        );
    }

    /// Checks the structural effective-df computation against the exact
    /// trace identity
    ///
    /// ```text
    /// Σ_j γ_j/(γ_j + λ) = tr{ G (G + λ S)⁻¹ }
    /// ```
    ///
    /// for a Gram/penalty pair that does NOT commute. Here `S = diag(1, 4)`
    /// and `G = [[1, 0.8], [0.8, 1]]`: the generalized eigenvalues
    /// eig(D^{-1/2} Uᵀ G U D^{-1/2}) are ≈ [0.0767072, 1.1732928], while the
    /// Rayleigh quotients (diagonal of B only, the historical implementation)
    /// are [1, 0.25]. Only the eigenvalues satisfy the identity; at λ = 1
    /// they give ≈0.6111 where the Rayleigh quotients give the wrong 0.7000.
    #[test]
    fn structural_edf_matches_trace_identity_noncommuting_pair() {
        // Penalty S and the target Gram matrix G = XᵀX.
        let s: Array2<f64> = array![[1.0, 0.0], [0.0, 4.0]];
        let g: Array2<f64> = array![[1.0, 0.8], [0.8, 1.0]];

        // X is the symmetric square root of G, which makes XᵀX = G exact:
        //   G = 1.8·v1v1ᵀ + 0.2·v2v2ᵀ, v1=[1,1]/√2, v2=[1,-1]/√2,
        // hence G^{1/2} has (√1.8 + √0.2)/2 on the diagonal and
        // (√1.8 − √0.2)/2 off it.
        let on_diag = 0.5 * (1.8_f64.sqrt() + 0.2_f64.sqrt());
        let off_diag = 0.5 * (1.8_f64.sqrt() - 0.2_f64.sqrt());
        let design = DesignMatrix::from(array![[on_diag, off_diag], [off_diag, on_diag]]);
        let penalty = PenaltyMatrix::Dense(s.clone());

        let gammas = design_penalty_range_gammas(&design, &penalty)
            .expect("2x2 full-rank p×p pair must yield generalized eigenvalues");
        assert_eq!(gammas.len(), 2, "range(S) is full rank ⇒ two γ_j");

        // Helper-independent reference: tr(G M⁻¹) with M = G + λ S inverted in
        // closed form, M⁻¹ = (1/det) [[m11, -m01], [-m10, m00]], so
        // tr(G M⁻¹) = (1/det) · [ G00·m11 - G01·m10 - G10·m01 + G11·m00 ].
        let trace_g_minv = |lambda: f64| -> f64 {
            let m = |i: usize, j: usize| g[[i, j]] + lambda * s[[i, j]];
            let det = m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0);
            let numerator = g[[0, 0]] * m(1, 1) - g[[0, 1]] * m(1, 0) - g[[1, 0]] * m(0, 1)
                + g[[1, 1]] * m(0, 0);
            numerator / det
        };

        for lambda in [1.0_f64, 0.3] {
            let edf = unit_weight_term_edf(&gammas, lambda.ln());
            let trace = trace_g_minv(lambda);
            let gap = (edf - trace).abs();
            assert!(
                gap < 1e-9,
                "structural edf {edf} must equal tr(G(G+λS)⁻¹) {trace} at λ={lambda}",
            );
        }

        // Regression guard against the diagonal-only computation: Rayleigh
        // quotients [1, 0.25] would yield 0.7 at λ=1 (ρ = 0), whereas the
        // trace identity gives ≈0.6111.
        let edf_at_one = unit_weight_term_edf(&gammas, 0.0_f64);
        let deviation = (edf_at_one - 0.611111_f64).abs();
        assert!(
            deviation < 1e-5,
            "edf at λ=1 must be ≈0.6111 (true), not 0.7000 (Rayleigh-quotient bug): got {edf_at_one}",
        );
    }
}