gam 0.3.117 - Docs.rs

//! The outer (ρ) objective: the `inner_blockwise_fit` driver, the joint
//! derivative providers (borrowed / owned / Jeffreys-aware), the ext-coord bundle
//! and scaled hyper-operators, inner-assembly construction, the unified joint
//! cost/gradient/EFS evaluators, and the outer-objective entry points
//! (`outerobjectivegradienthessian_internal`, `outerobjectiveefs`). Also the
//! blockwise-fit assembly-from-parts, warm-start carriers, outer-Hessian operator
//! wrappers, and labeled-lambda layout helpers shared with the outer engine.

use super::*;

pub(crate) fn inner_blockwise_fit<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
    warm_start: Option<&ConstrainedWarmStart>,
) -> Result<BlockwiseInnerResult, String> {
    // Inner-blockwise prelude waypoints. At large-scale n the cold-start
    // path between function entry and the first PIRLS/JN cycle-summary
    // log can run for many minutes (sometimes hours) silently while
    // row-kernel workspace builds run. Emit a `[STAGE] PIRLS/inner`
    // line at each transition so the next failed run pinpoints which
    // named step holds time. Gated on large-scale n so small-fit
    // tests stay quiet.
    let inner_started = std::time::Instant::now();
    let mut states = buildblock_states(family, specs)?;
    refresh_all_block_etas(family, specs, &mut states)?;
    let total_joint_p = specs.iter().map(|spec| spec.design.ncols()).sum::<usize>();
    let total_joint_n = joint_observation_count(&states);
    const INNER_PRELUDE_LOG_MIN_N: usize = 100_000;
    let prelude_log = total_joint_n >= INNER_PRELUDE_LOG_MIN_N;
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=buildblock_states+refresh_etas elapsed={:.3}s n={} p={} blocks={}",
            inner_started.elapsed().as_secs_f64(),
            total_joint_n,
            total_joint_p,
            specs.len(),
        );
    }
    let matrix_free_joint_requested = use_joint_matrix_free_path(total_joint_p, total_joint_n)
        || family.prefers_matrix_free_inner_joint(specs, &states);
    let has_workspace_source = family.inner_coefficient_hessian_hvp_available(specs);
    // Probe the *spec-aware* joint Hessian: it is the canonical source of the
    // coupled joint curvature. A family may override only
    // `exact_newton_joint_hessian_with_specs` (the variant that has access to
    // the realized block designs needed to assemble the cross-block
    // `X_aᵀ diag(w_ab) X_b` blocks — e.g. the Dirichlet common-parameterization
    // family, whose `evaluate` emits diagonal working sets so the spec-less
    // default block assembler returns `None`). Routing the inner joint-Newton
    // availability gate through the spec-less `exact_newton_joint_hessian`
    // would then mis-classify such a family as "no joint Hessian" and drop it
    // onto pure block-diagonal backfitting, which fails to reach KKT on small,
    // concentrated coupled likelihoods. The `_with_specs` path subsumes the
    // spec-less one for every family (single-block / uncoupled delegate
    // identically), so it is the correct probe here.
    let has_joint_exacthessian = if has_workspace_source {
        true
    } else {
        family
            .exact_newton_joint_hessian_with_specs(&states, specs)?
            .is_some()
    };
    let coupled_exact_joint_required = specs.len() >= 2
        && !family.likelihood_blocks_uncoupled()
        && (family.has_explicit_joint_hessian() || has_workspace_source);
    // When the family declares its likelihood blocks UNCOUPLED
    // (`∂²L/∂β_a∂β_b = 0` for every a ≠ b) the joint penalized objective is
    // fully separable across blocks: the joint Hessian is exactly
    // block-diagonal and each block carries only its own penalty. On a
    // separable objective block-coordinate descent solves each block's
    // (possibly inequality-constrained) subproblem to its own exact optimum —
    // it IS the joint solve, and each block gets its OWN trust radius, its OWN
    // active-set QP, and its OWN KKT certificate.
    //
    // Forcing the coupled joint-Newton onto such a problem instead couples two
    // independent blocks under ONE shared trust radius and ONE concatenated
    // KKT residual. That is actively harmful when the blocks differ sharply in
    // conditioning — the competing-risks twin time-basis fit (#1025) is the
    // canonical case: two cause-specific baselines share the same I-spline
    // evaluated at the same event times, but one cause sits near its
    // monotonicity-constraint boundary with an O(1e5) hazard-derivative
    // gradient while the other is interior. The shared globalization cannot
    // satisfy both blocks' KKT conditions at once; the joint residual stalls
    // far above tolerance, the inner solve burns its whole cycle budget on
    // every outer ρ-eval, and the fit only survives by falling through to the
    // block-coordinate path anyway (which then converges in a handful of
    // cycles). Route uncoupled multi-block specs straight to that exact
    // separable path. `coupled_exact_joint_required` is already gated the same
    // way (uncoupled families are designed to fall through to blockwise), so
    // this only stops the engine from attempting — and grinding on — a joint
    // solve it was never required to run.
    //
    // Single-block families and genuinely coupled multi-block families are
    // unaffected: the former never had cross-block coupling to begin with, the
    // latter still take the joint path (their objective is NOT separable, so
    // block-coordinate descent would drop the cross-block ∂²L/∂β_a∂β_b
    // curvature).
    let blocks_separable = specs.len() >= 2 && family.likelihood_blocks_uncoupled();
    let use_joint_newton =
        has_joint_exacthessian && (specs.len() >= 2 || has_workspace_source) && !blocks_separable;
    let joint_workspace_requested = use_joint_newton && has_workspace_source;
    let inner_tol = options.inner_tol;
    let inner_max_cycles_base = options.inner_max_cycles;
    // Per-outer-call inner-cycle cap. The earlier "adaptive inner cycle
    // cap" doubled this mid-loop on plateaus, but that turned out to be
    // the wrong response to stalled descent (descent ratios pinned at
    // ~0.999 paired with a sub-tolerance objective change is the
    // no-descent signal, not a "give Newton more cycles" signal). The
    // plateau-flat-objective convergence certificate in the inner-cycle
    // body now handles that case directly, so the cap stays fixed at the
    // baseline for the lifetime of this outer call.
    let inner_max_cycles = capped_inner_max_cycles(options, inner_max_cycles_base);
    // Each block's assembled penalty matrix depends only on that block's
    // penalties and smoothing parameters. Build these setup matrices in
    // parallel, but keep the coordinate-descent and line-search loops below
    // strictly serial because each accepted block update changes the state seen
    // by later blocks.
    use rayon::iter::{IntoParallelIterator, ParallelIterator};
    let s_lambdas_launch_started = std::time::Instant::now();
    let s_lambdas_par_iter = (0..specs.len()).into_par_iter().map(|b| {
        let spec = &specs[b];
        let Some(block_log_lambda) = block_log_lambdas.get(b) else {
            return Err(CustomFamilyError::UnsupportedConfiguration {
                reason: format!("missing log-smoothing parameter vector for block {b}"),
            }
            .into());
        };
        if block_log_lambda.len() != spec.penalties.len() {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "block {b} log-smoothing parameter length {} does not match penalties {}",
                    block_log_lambda.len(),
                    spec.penalties.len()
                ),
            }
            .into());
        }

        let p = spec.design.ncols();
        let lambdas = block_log_lambda.mapv(f64::exp);
        let mut s_lambda = Array2::<f64>::zeros((p, p));
        for (k, s) in spec.penalties.iter().enumerate() {
            s.add_scaled_to(lambdas[k], &mut s_lambda);
        }
        Ok(s_lambda)
    });
    let s_lambdas_collect_started = std::time::Instant::now();
    let s_lambdas_launch_elapsed = s_lambdas_launch_started.elapsed();
    let s_lambdas = s_lambdas_par_iter.collect::<Result<Vec<_>, String>>()?;
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=s_lambdas par_iter launch={:.3}s collect={:.3}s blocks={} (since inner-start={:.3}s)",
            s_lambdas_launch_elapsed.as_secs_f64(),
            s_lambdas_collect_started.elapsed().as_secs_f64(),
            specs.len(),
            inner_started.elapsed().as_secs_f64(),
        );
    }
    let ridge = effective_solverridge(options.ridge_floor);
    let joint_bundle: Option<&crate::families::joint_penalty::JointPenaltyBundle> =
        options.joint_penalties.as_deref();
    if let Some(bundle) = joint_bundle {
        for (i, spec) in bundle.specs.iter().enumerate() {
            if spec.dim() != total_joint_p {
                return Err(format!(
                    "joint penalty {i}: dim {} != total compiled p {}",
                    spec.dim(),
                    total_joint_p,
                ));
            }
        }
        if bundle.specs.len() != bundle.log_lambdas.len() {
            return Err(format!(
                "joint penalty bundle: {} specs vs {} log_lambdas",
                bundle.specs.len(),
                bundle.log_lambdas.len(),
            ));
        }
    }
    let mut cached_active_sets: Vec<Option<Vec<usize>>> = vec![None; specs.len()];
    if let Some(seed) = warm_start
        && seed.block_beta.len() == states.len()
        && seed.active_sets.len() == states.len()
    {
        if warm_start_matches_block_log_lambdas(seed, block_log_lambdas)
            && let Some(cached) = seed.cached_inner.as_ref()
            && cached.converged
            && seed
                .block_beta
                .iter()
                .zip(&states)
                .all(|(beta_seed, state)| beta_seed.len() == state.beta.len())
        {
            for (state, beta_seed) in states.iter_mut().zip(&seed.block_beta) {
                state.beta.assign(beta_seed);
            }
            cached_active_sets = seed.active_sets.clone();
            refresh_all_block_etas(family, specs, &mut states)?;
            log::info!(
                "[PIRLS/joint-Newton warm-start] reused cached same-rho inner mode | cycles={} logdet_h={:.6e} logdet_s={:.6e}",
                cached.cycles,
                cached.block_logdet_h,
                cached.block_logdet_s,
            );
            return Ok(BlockwiseInnerResult {
                block_states: states,
                active_sets: normalize_active_sets(cached_active_sets),
                log_likelihood: cached.log_likelihood,
                penalty_value: cached.penalty_value,
                cycles: cached.cycles,
                converged: cached.converged,
                block_logdet_h: cached.block_logdet_h,
                block_logdet_s: cached.block_logdet_s,
                s_lambdas,
                joint_workspace: cached.joint_workspace.clone(),
                kkt_residual: cached.kkt_residual.clone(),
                active_constraints: cached.active_constraints.clone(),
            });
        }
        // Cold-start path: copy prior β where dimensions match
        // (best-effort; mismatched blocks keep the freshly-built
        // initial state).
        for (b, beta_seed) in seed.block_beta.iter().enumerate() {
            if beta_seed.len() == states[b].beta.len() {
                let beta_projected =
                    family.post_update_block_beta(&states, b, &specs[b], beta_seed.clone())?;
                states[b].beta.assign(&beta_projected);
            }
        }
        cached_active_sets = seed.active_sets.clone();
        refresh_all_block_etas(family, specs, &mut states)?;
    }
    let load_joint_started = std::time::Instant::now();
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=load_joint_gradient_evaluation begin use_joint_newton={} joint_workspace_requested={} (since inner-start={:.3}s)",
            use_joint_newton,
            joint_workspace_requested,
            inner_started.elapsed().as_secs_f64(),
        );
    }
    let (
        mut current_log_likelihood,
        mut cached_eval,
        mut cached_joint_gradient,
        mut cached_joint_workspace,
    ) = if use_joint_newton {
        let (log_likelihood, gradient, eval, workspace) = load_joint_gradient_evaluation(
            family,
            specs,
            options,
            &states,
            joint_workspace_requested,
            None,
        )?;
        (log_likelihood, eval, gradient, workspace)
    } else {
        let eval = family.evaluate(&states)?;
        let log_likelihood = eval.log_likelihood;
        (log_likelihood, Some(eval), None, None)
    };
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=load_joint_gradient_evaluation end elapsed={:.3}s log_likelihood={:.6e} has_gradient={} has_workspace={}",
            load_joint_started.elapsed().as_secs_f64(),
            current_log_likelihood,
            cached_joint_gradient.is_some(),
            cached_joint_workspace.is_some(),
        );
    }
    // Validate exact-Newton block Hessians at the family-evaluation
    // boundary. A non-finite entry is a contract violation against the
    // family's analytic second derivative; refuse to iterate before
    // any factorization rather than letting it slip through to a
    // downstream logdet check that may be gated off by the outer
    // optimizer's flags.
    let validate_started = std::time::Instant::now();
    if let Some(eval) = cached_eval.as_ref() {
        validate_block_hessians_finite(eval)?;
    }
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=validate_block_hessians_finite elapsed={:.3}s checked={}",
            validate_started.elapsed().as_secs_f64(),
            cached_eval.is_some(),
        );
    }
    let penalty_started = std::time::Instant::now();
    let mut current_penalty = total_quadratic_penalty(
        &states,
        &s_lambdas,
        ridge,
        options.ridge_policy,
        joint_bundle,
        Some(specs),
    );
    if prelude_log {
        log::info!(
            "[STAGE] PIRLS/inner step=total_quadratic_penalty elapsed={:.3}s penalty={:.6e} (prelude_total={:.3}s)",
            penalty_started.elapsed().as_secs_f64(),
            current_penalty,
            inner_started.elapsed().as_secs_f64(),
        );
    }
    let mut lastobjective = -current_log_likelihood + current_penalty;
    let mut converged = false;
    let mut cycles_done = 0usize;
    // Pre-allocate per-block eta backup buffers to avoid O(n) allocation
    // per block per cycle in the backtracking line search.
    let mut eta_backups: Vec<Array1<f64>> =
        states.iter().map(|s| Array1::zeros(s.eta.len())).collect();

    // ── Joint Newton fast path ──
    //
    // When the family provides an exact joint Hessian (GAMLSS location-scale),
    // solve the full (p_mu + p_ls) × (p_mu + p_ls) system in one Newton step
    // per cycle instead of iterating between blocks. This converges quadratically
    // (5-10 steps) instead of linearly (20-100+ blockwise cycles).
    //
    // Generic block-diagonal surrogate families may still fall back to
    // blockwise iteration if the joint surrogate is unavailable. Families that
    // advertise a real coupled joint Hessian must not: the blockwise loop only
    // sees principal blocks, so it drops the cross-block curvature that makes
    // the joint problem well conditioned near saturated optima.

    // `last_residual_tol` mirrors the per-cycle KKT tolerance computed inside
    // the joint-Newton loop (`inner_tol · (1 + max(‖∇L‖∞, ‖Sβ‖∞))`). It must
    // live at function scope so both the post-converged exit block inside
    // `if use_joint_newton` AND the post-block-fit IFT residual builder
    // outside that branch can thread the same tolerance into the
    // `ProjectedKktResidual::with_metadata(...)` builder. Seed at `inner_tol`
    // so a path that skips the loop entirely (no joint-Newton, or zero
    // cycles) still records a finite, non-NaN tolerance on the residual
    // carrier rather than NaN.
    let mut last_residual_tol: f64 = inner_tol;

    if use_joint_newton {
        // Build block ranges for the joint system.
        let ranges: Vec<(usize, usize)> = {
            let mut offset = 0;
            specs
                .iter()
                .map(|s| {
                    let start = offset;
                    offset += s.design.ncols();
                    (start, offset)
                })
                .collect()
        };
        let total_p: usize = ranges.last().map_or(0, |r| r.1);

        // Universal full-span Jeffreys/Firth robustness. Build `Z_J` once and
        // use the same term in the coupled Newton step, objective value, and
        // stationarity checks so a near-separating coefficient is bounded by
        // the likelihood's own Fisher geometry instead of an ad-hoc ridge.
        // `None` (empty coefficient system) leaves every step and objective at
        // the un-augmented inner Newton.
        //
        // Continuous-response families (the canonical example: transformation-
        // normal h(Y|x) ~ N(0,1)) opt out via
        // `joint_jeffreys_term_required() = false`. They have no separation
        // regime, the Fisher information is `O(n)` on every identified
        // direction by construction, and each Jeffreys evaluation costs
        // `p` directional-derivative calls into the family's exact joint
        // Hessian — at large scale (CTN duchon16d, p=144, n=20000) that
        // is the dominant per-cycle cost (~200 s/cycle on three calls per
        // cycle), exhausting the inner budget before the algorithm converges
        // while contributing essentially zero to the gradient/curvature.
        let joint_jeffreys_subspace = if family.joint_jeffreys_term_required() {
            build_joint_jeffreys_subspace(specs, &ranges)?
        } else {
            None
        };
        // FIRTH MERIT BOOKKEEPING (gam#826/#872 — per-cycle Φ fold, not a carried
        // value). `current_penalty` / `lastobjective` hold ONLY the quadratic
        // penalty `½βᵀSβ` (NO Φ). The Firth value `−Φ` is folded into the
        // accept/reject comparison FRESH at each β under the same
        // `jeffreys_skippable_this_cycle` gate the step and KKT residual use, so
        // `old_objective` (old β) and `trialobjective` (trial β) are always on the
        // same objective `−ℓ + ½βᵀSβ − Φ` regardless of whether a cycle skips the
        // term. Carrying Φ in `current_penalty` (the previous design) desynced
        // old-vs-trial by ±Φ whenever the per-cycle skippable decision flipped —
        // and the cycle-0 baseline folded Φ UNCONDITIONALLY while the trial folded
        // it gated, so a skippable cycle 0 saw a spurious `Δobj = ±Φ`, rejected
        // every backtrack, and refused as a `phantom_multiplier` at a zero step
        // (the binomial location-scale coupled non-convergence). SIGN: Firth ADDS
        // ½log|I| to the log-likelihood ⇒ the NLL objective SUBTRACTS Φ, matching
        // the Newton step rhs / KKT residual which ADD `∇Φ` to `∇L − Sβ`.

        let joint_mode_diagonal_ridge =
            if ridge > 0.0 && options.ridge_policy.include_quadratic_penalty {
                ridge
            } else {
                0.0
            };

        // Exact joint Newton steps are guarded by two independent mechanisms:
        // family-owned feasibility (`max_feasible_step_size`) and the adaptive
        // trust region below. There is intentionally no family hook for a
        // hard per-attempt coefficient-space clamp; keeping the policy local
        // avoids stale no-op configuration and makes the trust-region behavior
        // explicit at the only place it is used.

        // Cross-cycle convergence carry-over: set at the end of every
        // accepted cycle so the next cycle can distinguish a true KKT
        // optimum on a rank-deficient null mode (objective stuck
        // because every direction is along the null space) from
        // genuine non-convergence. The residual signal does not need
        // a carry-over — `residual <= residual_tol` is the canonical
        // KKT certificate and the end-of-cycle test consumes it
        // directly when it fires.

        // Predicted-reduction tracker for the principled trust-region
        // stopping criterion (Conn-Gould-Toint, *Trust-Region Methods*,
        // Theorem 6.4.6). The Newton model at the accepted step has a
        // predicted decrease `m(0) − m(δ) = −g·δ − 0.5·δ·H·δ`. For an
        // unclipped Newton step (H·δ = −g) this is `0.5·g·H⁻¹·g`, the
        // Newton decrement squared / 2. When the model itself predicts
        // a decrease smaller than the objective tolerance, no descent
        // direction the Hessian can resolve will lower the objective
        // by more than `objective_tol`, and continuing is wall-clock
        // waste regardless of whether the raw gradient residual or
        // step-norm gates have closed.
        //
        // Cross-cycle convergence carry-over: set at the end of every
        // accepted cycle so the next cycle's line-search-failure path
        // can distinguish a true KKT optimum on a rank-deficient
        // Hessian (no meaningful trial step, even though step_inf is
        // O(1) along the null mode) from genuine non-convergence.
        let mut last_cycle_residual_below_tol = false;
        let mut last_cycle_obj_change_below_tol = false;

        let mut joint_trust_radius = 1.0_f64;
        let mut joint_block_trust_radii = vec![1.0_f64; ranges.len()];
        let mut last_accepted_hit_joint_trust_boundary = false;
        // Hard upper bound for the for-loop's range. The cap is fixed at
        // `inner_max_cycles` for the lifetime of this outer call (the
        // earlier mid-loop cap extension was removed in favor of the
        // plateau-flat-objective convergence certificate), but the
        // sentinel pattern is retained: the `.max(200)` floor is a
        // harmless safety pad, and the explicit `cycle >= inner_max_cycles`
        // break keeps the existing `continue` statements in the body
        // working (they advance `cycle` via the iterator), unlike a
        // `while` + manual-counter rewrite.
        let inner_loop_hard_ceiling = inner_max_cycles.max(200);
        // Verbose cadence for the inner joint-Newton log block. Boring cycles
        // (first-attempt accepts with no convergence event) emit ONE compact
        // one-liner instead of the 4-line pre-cycle/TR/cycle-summary/convergence
        // block. Verbose cycles (first, last, every JOINT_LOG_VERBOSE_PERIOD-th,
        // all rejections, convergence events) keep the full detail, so a
        // 200-cycle inner solve emits a handful of detailed waypoints plus
        // 1 compact line per remaining cycle, down from ~800 lines.
        // NOTE(review): this comment previously said "every 20th" / "~10
        // detailed waypoints" (~210 lines), which matches a period of 20, not
        // the 50 declared below — confirm which value is intended.
        const JOINT_LOG_VERBOSE_PERIOD: usize = 50;
        // Residual-stall detector for joint Newton. Distinct from the
        // blockwise loglik-frozen divergence detector lower in the file:
        // that one requires the log-likelihood to be unchanged for K
        // cycles AND the per-block Newton step pinned at the cap.
        //
        // Large-scale survival marginal-slope hits a different pattern —
        // the joint objective decreases monotonically by O(1) per cycle
        // (so loglik is NOT frozen), the TR repeatedly clamps proposals
        // with |prop|∞ >> trust_radius, and the post-step KKT residual
        // oscillates in a band orders of magnitude above residual_tol
        // without trending down. Burning the rest of the cycle budget on
        // this pattern reaches inner_max_cycles "non-converged", which
        // then drops the outer optimizer into the first-order bridge
        // fallback with a stale-mode gradient whose norm (‖g‖ ≈ 10⁷)
        // kills the BFGS line search at iter 0.
        //
        // Track the best residual seen and the number of cycles since
        // any meaningful improvement (≥10% drop). Once we've burned at
        // least RESIDUAL_STALL_MIN_CYCLES with no improvement AND the
        // TR has been clamping aggressively, exit `converged=false` so
        // the outer optimizer sees a non-converged signal while we still
        // have a finite, in-range β to return (instead of running to the
        // hard ceiling and then handing BFGS a junk gradient).
        const RESIDUAL_STALL_NO_IMPROVE_CYCLES: usize = 30;
        const RESIDUAL_STALL_MIN_CYCLES: usize = 40;
        const RESIDUAL_STALL_IMPROVEMENT_FACTOR: f64 = 0.9;
        const RESIDUAL_STALL_BLOCK_GRADIENT_FACTOR: f64 = 50.0;
        let mut best_residual_seen: f64 = f64::INFINITY;
        // Smallest *certified* stationarity residual the solve actually computed,
        // tracked independently of `best_residual_seen` (whose updates are bound
        // to the residual-stall counters at the post-step site below and so are
        // skipped by every head-of-cycle / pre-line-search certificate exit). The
        // terminal verdict reports THIS so a legitimate early-certificate exit
        // (e.g. the cycle-0 pre-line-search KKT exit on intercept-only / already-
        // stationary data) reports the finite residual it certified on instead of
        // the sentinel `inf` — converged=true must never be paired with a non-
        // finite residual in the log (#1040 inner-report truthfulness).
        let mut min_certified_residual: f64 = f64::INFINITY;
        let mut cycles_since_residual_improved: usize = 0;
        // Number of consecutive non-improving cycles after which the
        // conditioning-based self-vanishing Levenberg–Marquardt damping is
        // ARMED inside the spectral-range Newton solve, for EVERY family
        // (#826/#808). The undamped range-restricted Newton step oscillates on a
        // full-rank-but-ill-conditioned penalized Hessian at the oversmoothed-ρ
        // operating point: the tiny-but-above-cutoff curvature of the lightly
        // identified mean/threshold/wiggle block takes an enormous `component/λ`
        // proposal that the trust region clips every cycle, so the residual on
        // that block freezes while its β stays ≈0 (the exact #826 signature).
        // The conditioning-gated `μ = c·‖∇L − Sβ‖∞` caps that component into a
        // bounded descent step. It is SELF-VANISHING (μ → 0 as the residual → 0)
        // so the converged β and the KKT certificate are byte-identical to the
        // undamped solve — zero REML/LAML bias. Arming it on OBSERVED non-
        // progress rather than a static per-family flag keeps the AFT /
        // constant-scale endgame (which converges quadratically and never
        // stalls) byte-identical: a quadratically-converging solve reaches
        // tolerance in a handful of cycles and never trips this threshold, so μ
        // is never engaged there. Only a genuinely oscillating ill-conditioned
        // solve crosses it, which is exactly when the damping is sound. Set a
        // few cycles below the stall-exit window so the damping gets a chance to
        // rescue the solve well before the early-exit / budget tripwire fires.
        // (The conditioning-gated self-vanishing μ that this threshold used to
        // arm now lives ONLY in the test-retained
        // `solve_joint_newton_step_on_spectral_range`; the production joint
        // step takes the exact trust-region multiplier λ instead — gam#979.)
        // Recent KKT-residual values (oldest→newest) used to detect STEADY
        // geometric descent at the certificate-refusal gate. A still-converging
        // Newton direction (residual dropping by a steady factor < 1 each cycle)
        // must not be misclassified as a multiplier/null plateau and exited
        // early (gam#787 duchon centers≥20: the logslope block converges
        // geometrically — residual ~0.33×/cycle — but `linearized_rel ≥ 0.5`
        // routed it into the plateau-refusal break a few cycles short of tol).
        const RESIDUAL_DESCENT_WINDOW: usize = 3;
        let mut residual_descent_history: std::collections::VecDeque<f64> =
            std::collections::VecDeque::with_capacity(RESIDUAL_DESCENT_WINDOW);
        let mut tr_clamped_during_stall: bool = false;
        // Fully-rejected stall guard. The residual-stall guard below
        // (post-grad-reload) only fires on cycles that produced an accepted
        // step, because every termination check it gates lives after the
        // `if !accepted { continue; }` exit at the bottom of the trust-region
        // attempt loop. When every cycle in a row is fully rejected — all
        // JOINT_TRUST_MAX_ATTEMPTS trial steps fail the line-search check —
        // none of those guards ever see the iterate, the cycle loop spins
        // up to `inner_loop_hard_ceiling` cycles, and the inner solver burns
        // ~120 s of wall-clock per outer ρ-evaluation that the outer
        // optimizer will reject anyway. The signature is exact and local:
        // (i) every trust attempt this cycle was rejected by SOME path —
        // model, likelihood, OR objective (the three counters partition the
        // JOINT_TRUST_MAX_ATTEMPTS attempts), so `model_rejects +
        // likelihood_rejects + objective_rejects == JOINT_TRUST_MAX_ATTEMPTS`,
        // AND (ii) the joint trust radius has NOT shrunk relative to the
        // previous fully-rejected cycle. Condition (i) was originally
        // objective-only (`objective_rejects == MAX`, others 0), which never
        // fired on the biobank gauge-flat marginal/logslope fit: there the
        // objective is flat to f64 precision along the residual direction and
        // the BMS line search rejects every trial on the LIKELIHOOD early-exit
        // path, so the guard's increment was unreachable and the loop spun to
        // the cap. A full likelihood-path rejection at a collapsed radius is
        // the same no-descent stall, so any-path full rejection counts.
        // Condition (ii) is what proves no progress is possible: β is
        // reverted to its pre-cycle value on every fully-rejected cycle, so
        // with an identical Newton system AND an identical trust radius the
        // next cycle's trust-region search is byte-deterministically the
        // same as this one's. The radius can stall above the 1e-12 floor
        // when `shrink_active_joint_block_trust_radii` only shrinks blocks
        // that hit their per-block boundary — an interior block keeps its
        // radius forever, so `max(block_radii)` is held by that block while
        // the boundary block's radius collapses to 1e-12 without changing
        // the max. After `FULLY_REJECTED_STALL_MAX_CYCLES` consecutive cycles
        // with both conditions, judge convergence on the identified (range)
        // subspace: a stall at a collapsed radius proves the descent direction
        // is gauge-flat, so if the range-projected KKT residual is at tolerance
        // the fit is at a numerically-stationary penalized optimum and is
        // returned converged; only when the identified-subspace residual is
        // ALSO above tol is this a genuine non-convergence the outer optimizer
        // should reject — exit non-converged so it rejects this ρ cleanly
        // instead of waiting for the cycle cap.
        const FULLY_REJECTED_STALL_MAX_CYCLES: usize = 8;
        let mut prev_rejected_trust_radius: Option<f64> = None;
        let mut consecutive_held_rejected_cycles: usize = 0;
        let mut last_joint_math: Option<JointNewtonMathDiagnostic> = None;
        // Cross-cycle cache of the joint Jeffreys/Firth triple `(β_key, ∇Φ, H_Φ)`
        // (gam#729/#826/#808). Computing `(∇Φ, H_Φ)` costs `p` family
        // directional-derivative calls plus the `½ S Sᵀ` GEMM; for a K-block
        // coupled family that is the dominant per-inner-cycle cost. The post-step
        // KKT residual recomputes the triple at the just-accepted β; the NEXT
        // cycle's head needs the SAME triple at that SAME β. Carry it forward
        // keyed on the flattened β so the head reuses the post-step result instead
        // of recomputing — collapsing two O(p)-directional-derivative evaluations
        // per accepted cycle to one. The key is an exact-equality check on the
        // flattened β (β is byte-identical between an accepted post-step residual
        // and the next head), so the reused term is the exact term at the current
        // iterate — no staleness, no tolerance fudge.
        let mut jeffreys_triple_cache: Option<(Array1<f64>, Array1<f64>, Array2<f64>)> = None;
        // Stash for the structured cert-REFUSED report computed inside the
        // cycle loop, so the post-loop bubbled error (`coupled exact-joint
        // inner solve exited the joint Newton path …`) can emit the same
        // per-block + spectrum breakdown without re-materializing H_pen.
        let mut last_kkt_refusal_report: Option<KktRefusalReport> = None;
        let mut prev_kkt_norm: Option<f64> = None;
        // Convergence-endgame flag for the Jeffreys second-order completion
        // (gam#979): set once the post-step KKT residual enters
        // `JEFFREYS_COMPLETION_RESIDUAL_BAND × residual_tol`, consumed by the
        // next cycle's dense-spectral step assembly.
        let mut jeffreys_completion_endgame = false;
        // Plateau streak on |Δobj| ≤ objective_tol. The scale-aware
        // flatness predicate stays local to this loop; the streak/window
        // discipline (grow on flat, reset on recovery) is the shared
        // loop_guard::FlatStreak so it cannot drift from the other
        // stagnation detectors in the tree (#968).
        let mut obj_flat_streak = crate::solver::loop_guard::FlatStreak::new(
            crate::solver::loop_guard::PLATEAU_DEFAULT_WINDOW,
        );
        // Total descent budget across the joint-Newton loop, used by
        // the end-of-loop summary to report `descent_total`.
        let initial_joint_objective: f64 = lastobjective;
        // Per-cycle |Δobjective| history for the geometric-tail trigger of
        // the constrained-stationary certificate below. When the cycles
        // settle into a linear-rate plateau (|Δobj_next| / |Δobj_prev|
        // approaching 1 monotonically over the window), the total
        // *remaining* objective descent is rigorously bounded above by the
        // geometric series sum |Δobj_now| / (1 − max_ratio). When that
        // bound is below `objective_tol` the cert can fire many cycles
        // earlier than waiting for any single |Δobj| to individually
        // cross obj_tol — the bound is mathematically the same precision
        // contract, applied to the asymptotic tail rather than one step.
        const GEOMETRIC_TAIL_WINDOW: usize = 5;
        let mut geometric_tail_history: std::collections::VecDeque<f64> =
            std::collections::VecDeque::with_capacity(GEOMETRIC_TAIL_WINDOW);

        // The exact joint-Hessian route solves the penalized Newton system
        // directly. Extra damping must be wired through an accepted/rejected
        // step policy before it belongs here; keep the matvec faithful to the
        // objective until then.
        for cycle in 0..inner_loop_hard_ceiling {
            if cycle >= inner_max_cycles {
                break;
            }
            let verbose_cycle = cycle == 0
                || cycle + 1 == inner_max_cycles
                || (cycle + 1) % JOINT_LOG_VERBOSE_PERIOD == 0;
            // Pre-cycle header line removed: the post-cycle one-liner below
            // carries cycle/objective/Δobj/step/residual/time and on verbose
            // cadence the expanded convergence line additionally carries
            // -loglik and penalty. Suppressing this avoids emitting a second
            // info-level line per cycle just to repeat numbers we already
            // log at end of cycle.
            // Per-cycle phase-timing accumulators. Surface where the inner
            // joint-Newton spends time so an 18-min silent cycle 0 (the
            // bernoulli marginal-slope FLEX large-scale failure mode) becomes a
            // logged timeline at the end of the cycle. Phases:
            //   * hessian: joint Hessian source build (matrix-free workspace
            //     OR dense fallback assembly)
            //   * pcg:     matrix-free QP solve via solve_spd_pcg_with_info_into
            //              (already logs its own diagnostics; we accumulate
            //              here for the end-of-cycle summary)
            //   * line_search: backtracking step-size search (up to 8 attempts)
            //   * grad_reload: post-accept joint gradient + workspace refresh
            let cycle_started = std::time::Instant::now();
            // Top-of-cycle row-measure capture. The trust-region ratio
            // ρ = [F(β) − F(β + δ)] / [−g·δ − ½·δᵀHδ] is only meaningful when
            // every input (Hessian, gradient, objective at β, trial objective
            // at β + δ) is evaluated against the same row measure. We freeze
            // the measure here and re-read it at each of the four sites later
            // in the cycle, then hard-fail (Err) just before ρ if any of them
            // diverged. Cf. `src/solver/row_measure.rs`.
            let tr_row_measure_top =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            let hessian_started = std::time::Instant::now();
            let hessian_scope_guard = crate::process_monitor::track_scope(format!(
                "joint Newton hessian_qp cycle={cycle} n={total_joint_n} p={total_p}"
            ));
            log::info!(
                "[joint-newton-tr] phase=hessian_qp cycle={} r={:.3e}",
                cycle,
                joint_trust_radius,
            );
            let cycle_log = prelude_log;
            let constraints_started = std::time::Instant::now();
            let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
            let joint_constraints =
                assemble_joint_linear_constraints(&block_constraints, &ranges, total_p)?;
            if cycle_log && cycle == 0 {
                log::info!(
                    "[STAGE] PIRLS/inner step=cycle0 block+joint constraints elapsed={:.3}s n={} p={}",
                    constraints_started.elapsed().as_secs_f64(),
                    total_joint_n,
                    total_p,
                );
            }
            let workspace_build_started = std::time::Instant::now();
            // Get joint Hessian and block gradients from the current evaluation.
            let hessian_workspace_for_cycle: Option<Arc<dyn ExactNewtonJointHessianWorkspace>> =
                None;
            let joint_hessian_source = if joint_workspace_requested {
                let cached_hit = cached_joint_workspace.is_some();
                let workspace = match cached_joint_workspace.take() {
                    Some(workspace) => Some(workspace),
                    None => family.exact_newton_joint_hessian_workspace_with_options(
                        &states, specs, options,
                    )?,
                };
                if cycle_log && cycle == 0 {
                    log::info!(
                        "[STAGE] PIRLS/inner step=cycle0 hessian-workspace cached_hit={} elapsed={:.3}s n={} p={}",
                        cached_hit,
                        workspace_build_started.elapsed().as_secs_f64(),
                        total_joint_n,
                        total_p,
                    );
                }
                workspace
                    .as_ref()
                    .map(|workspace| {
                        exact_newton_joint_hessian_source_from_workspace(
                            workspace,
                            total_p,
                            MaterializationIntent::InnerSolve,
                            "joint Newton inner exact-newton operator mismatch",
                        )
                    })
                    .transpose()?
                    .flatten()
            } else {
                None
            };
            // Row measure observed by the Hessian build above.
            let tr_row_measure_hessian =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            let joint_hessian_source = match joint_hessian_source {
                Some(source) => source,
                None => {
                    // Spec-aware joint Hessian: canonical coupled-curvature
                    // source (see the availability gate above). Families that
                    // only override `_with_specs` (Dirichlet common-parameter)
                    // would otherwise hand back `None` from the spec-less
                    // default and silently drop off the joint-Newton path.
                    let h_joint_opt =
                        family.exact_newton_joint_hessian_with_specs(&states, specs)?;
                    let Some(h_joint) = h_joint_opt else {
                        break; // Fall back to blockwise if joint Hessian unavailable
                    };
                    match symmetrized_square_matrix(
                        h_joint,
                        total_p,
                        "joint Newton inner exact-newton Hessian shape mismatch",
                    ) {
                        Ok(matrix) => JointHessianSource::Dense(matrix),
                        Err(_) => break,
                    }
                }
            };
            let hessian_source_elapsed = workspace_build_started.elapsed();
            if hessian_source_elapsed.as_secs_f64() >= 1.0 || (cycle_log && cycle == 0) {
                let source_kind = if matches!(&joint_hessian_source, JointHessianSource::Dense(_)) {
                    "dense"
                } else {
                    "operator"
                };
                log::info!(
                    "[STAGE] PIRLS/inner step=cycle{} hessian-source joint_workspace_requested={} source={} elapsed={:.3}s n={} p={}",
                    cycle,
                    joint_workspace_requested,
                    source_kind,
                    hessian_source_elapsed.as_secs_f64(),
                    total_joint_n,
                    total_p,
                );
            }

            // Concatenate block gradients and betas.
            let Some(grad_joint) = cached_joint_gradient.clone() else {
                break;
            };
            // Row measure observed by the gradient at β. `cached_joint_gradient`
            // was loaded earlier under `options`; if the auto-subsample
            // installer or any sibling path swapped the mask between then and
            // now, the id captured here will diverge from the rest and the
            // pre-ρ check below will Err. Cf. `src/solver/row_measure.rs`.
            let tr_row_measure_gradient =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            if grad_joint.len() != total_p {
                break;
            }
            let mut beta_joint = Array1::<f64>::zeros(total_p);
            for b in 0..specs.len() {
                let (start, end) = ranges[b];
                beta_joint
                    .slice_mut(ndarray::s![start..end])
                    .assign(&states[b].beta);
            }

            // Non-finite-curvature guard (gam#1088). A `NaN`/`Inf` in the
            // family curvature `H` makes the penalized Hessian `H_pen = H +
            // S(λ)` — and therefore its spectrum — degenerate, so the KKT
            // certificate is structurally unreachable: the spectral step
            // solve produces garbage, the projected residual neither converges
            // nor trends down, and the residual-based divergence/stall guards
            // below (gated on a *finite* residual that a corrupted-but-not-yet-
            // propagated curvature can still leave finite) do not catch it.
            // Left unguarded the loop then burns the full
            // `inner_loop_hard_ceiling` (1200 cycles) on every outer ρ-eval / seed — the
            // multi-hour link-wiggle & location-scale benchmark timeouts. The
            // penalty is finite by construction, so this is a curvature defect:
            // the trial is degenerate. Exit immediately as non-converged with
            // the current finite β so the outer optimizer rejects this ρ-eval
            // cleanly (mirrors the residual divergence guard below), rather
            // than grinding to the ceiling and reporting a `NaN` H_pen
            // spectrum at the refusal point.
            if !joint_hessian_source_curvature_is_finite(&joint_hessian_source) {
                cycles_done = cycle + 1;
                log::warn!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | non-finite-curvature guard (gam#1088): the joint Hessian source carries a non-finite entry, so the penalized Hessian H_pen = H + S(λ) and its spectrum (λ_max/λ_min/cond) are degenerate and the KKT certificate can never be issued; returning unconverged with finite β so the outer optimizer rejects this ρ evaluation instead of grinding to inner_max_cycles={}.",
                    cycle,
                    inner_max_cycles,
                );
                converged = false;
                break;
            }

            let trace_diagonal_ridge = joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE;
            let joint_hessian_is_dense =
                matches!(&joint_hessian_source, JointHessianSource::Dense(_));
            let joint_solver_diagonal_ridge = stabilized_joint_solver_diagonal_ridge(
                family,
                &joint_hessian_source,
                &ranges,
                &s_lambdas,
                trace_diagonal_ridge,
                options.ridge_floor,
                joint_bundle,
            );
            // CHEAP CONDITIONING PRE-CHECK (always-on robustness, zero-cost on
            // easy/large fits). Before paying for the dense joint-Hessian
            // materialization + `O(p³)` reduced eigendecomposition inside the
            // Jeffreys term, ask whether the term is PROVABLY skippable from a few
            // matrix-free Hessian-vector products against the source we just built.
            // When `true`, the exact conditioning gate is certain to return the
            // zero term, so every Jeffreys call this cycle short-circuits to the
            // exact-zero contribution WITHOUT forming anything dense — byte-
            // identical to the gated-off path, and preserving the matrix-free path
            // on wide well-conditioned fits. Only runs the estimate when a Jeffreys
            // subspace exists and `total_p` is wide enough that the dense eigh is
            // the cost we want to avoid (the helper itself gates on the size
            // threshold and conservatively returns `false` if unsure). Computed
            // once per inner cycle and reused across the cycle's head-KKT, step,
            // and trial-value calls; the conditioning changes slowly across cycles
            // so re-estimating per cycle (one `O(p·k)` burst) is already cheap
            // against the work it guards.
            let jeffreys_skippable_this_cycle: bool = if options.seed_screening {
                // Seed screening only ranks seeds: skip the O(p · per-axis-Hdot)
                // full Jeffreys gradient/curvature loop. The value-only Jeffreys
                // term (folded into the objective baseline / trial penalties via
                // `custom_family_joint_jeffreys_value`, gated independently on
                // `joint_jeffreys_subspace.is_some()`) still bounds the screening
                // score on separating directions; only the per-axis step curvature
                // — the wrong cost class for ranking on a K-block coupled family —
                // is dropped here (gam#729/#808).
                true
            } else if joint_jeffreys_subspace.is_some() {
                // EXPECTED-INFORMATION GUARD (gam#1020): the skippable
                // certificate probes the OBSERVED Hessian source; it only
                // transfers to the Jeffreys gate when the family's Jeffreys
                // information IS the observed Hessian. Expected-information
                // families (probit-class) bypass the pre-check — observed
                // information grows on saturated rows exactly where the
                // expected information collapses and the gate must arm.
                family.joint_jeffreys_information_matches_observed_hessian()
                    && jeffreys_term_skippable_for_source(&joint_hessian_source, total_p)
                        .unwrap_or(false)
            } else {
                false
            };
            let joint_trust_metric_diag = match &joint_hessian_source {
                JointHessianSource::Dense(h_joint) => joint_penalty_preconditioner_diag(
                    &h_joint.diag().to_owned(),
                    &ranges,
                    &s_lambdas,
                    joint_solver_diagonal_ridge,
                    joint_bundle,
                ),
                JointHessianSource::Operator { diagonal, .. } => joint_penalty_preconditioner_diag(
                    diagonal,
                    &ranges,
                    &s_lambdas,
                    joint_solver_diagonal_ridge,
                    joint_bundle,
                ),
            };
            // HEAD-β JEFFREYS CACHE (gam#729/#808). The full Jeffreys/Firth triple
            // `(Φ, ∇Φ, H_Φ)` costs `p` family directional-derivative calls (the
            // `for k in 0..p` loop in `joint_jeffreys_term`); for a K-block coupled
            // family (Dirichlet/multinomial) that is the dominant per-cycle cost.
            // The head-of-cycle KKT residual, the constrained-QP step, and the
            // spectral/dense Newton step are ALL built at the SAME cycle-start β
            // (`&states`, before any step is accepted), so they need the SAME
            // triple. Compute it ONCE here and reuse, instead of three independent
            // O(p)-directional-derivative evaluations per cycle. The post-step
            // residual below is at the accepted β, so it correctly recomputes.
            // `None` when the term is condition-gated/skippable (∇Φ=0, H_Φ=0).
            let head_beta_key: Array1<f64> = flatten_state_betas(&states, specs);
            let head_jeffreys_term: Option<(Array1<f64>, Array2<f64>)> =
                if jeffreys_skippable_this_cycle {
                    None
                } else if let Some((_, grad_phi, hphi)) = jeffreys_triple_cache
                    .as_ref()
                    .filter(|(key, _, _)| *key == head_beta_key)
                {
                    // Cross-cycle cache hit: the previous cycle's post-step KKT
                    // residual already computed the exact triple at this β. Reuse.
                    Some((grad_phi.clone(), hphi.clone()))
                } else if let Some(z_joint) = joint_jeffreys_subspace.as_ref() {
                    let term = match custom_family_joint_jeffreys_term(
                        family, &states, specs, &ranges, z_joint,
                    )? {
                        Some((_phi, grad_phi, hphi))
                            if grad_phi.len() == grad_joint.len()
                                && hphi.nrows() == total_p
                                && hphi.ncols() == total_p =>
                        {
                            Some((grad_phi, hphi))
                        }
                        _ => None,
                    };
                    if let Some((grad_phi, hphi)) = term.as_ref() {
                        jeffreys_triple_cache =
                            Some((head_beta_key.clone(), grad_phi.clone(), hphi.clone()));
                    }
                    term
                } else {
                    None
                };
            // Fold the Firth/Jeffreys score `∇Φ` into the head-of-cycle KKT
            // residual when the term is armed, for the same reason as the
            // post-step residual below: the inner objective is `−ℓ + ½βᵀSβ − Φ`,
            // so the certifiable stationarity is `∇L − Sβ + ∇Φ = 0`. Without
            // this the head-of-cycle KKT exit (`current_stationarity_residual ≤
            // residual_tol`) can never fire on the near-separating span, even
            // when the iterate is the Firth optimum. No-op when the Jeffreys
            // term is unavailable or condition-gated to zero.
            let head_kkt_gradient: Option<Array1<f64>> = head_jeffreys_term
                .as_ref()
                .map(|(grad_phi, _hphi)| &grad_joint + grad_phi);
            let current_kkt_norm = exact_newton_joint_stationarity_inf_norm_from_gradient(
                head_kkt_gradient.as_ref().unwrap_or(&grad_joint),
                &states,
                specs,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                &block_constraints,
                Some(cached_active_sets.as_slice()),
            )?;
            if current_kkt_norm.is_finite() {
                min_certified_residual = min_certified_residual.min(current_kkt_norm);
            }
            let pcg_rel_tol = joint_pcg_eisenstat_walker_forcing(prev_kkt_norm, current_kkt_norm);

            let solve_joint_constraints_dense = joint_constraints.is_some()
                || !matrix_free_joint_requested
                || joint_hessian_is_dense;
            if cycle == 0 {
                log::info!(
                    "[JN-BRANCH-DIAG #1040] cycle=0 joint_constraints_is_some={} matrix_free_joint_requested={} joint_hessian_is_dense={} solve_joint_constraints_dense={} -> branch={} total_p={} levenberg_on_ill_cond={}",
                    joint_constraints.is_some(),
                    matrix_free_joint_requested,
                    joint_hessian_is_dense,
                    solve_joint_constraints_dense,
                    if solve_joint_constraints_dense && joint_constraints.is_some() {
                        "CONSTRAINED_QP"
                    } else if matrix_free_joint_requested && !joint_hessian_is_dense {
                        "MATRIX_FREE_PCG"
                    } else {
                        "DENSE_SPECTRAL"
                    },
                    total_p,
                    family.levenberg_on_ill_conditioning(),
                );
            }
            // Exact trust-region subproblem factorization (gam#979). Populated on
            // the unconstrained dense-spectral path with the metric-whitened
            // eigendecomposition of the penalized Hessian, so the trust loop below
            // re-solves the *exact* Moré–Sorensen subproblem at each trust radius
            // from one factorization — replacing the dogleg/Cauchy/box-truncation
            // globalization with the single object they all approximate. `None` on
            // the constrained-QP and matrix-free PCG paths, which keep their
            // existing globalization untouched.
            let mut joint_spectrum: Option<whitened_spectrum::WhitenedHessianSpectrum> = None;
            let (candidate_beta, joint_active_set, joint_step_spectral_nullity) =
                if solve_joint_constraints_dense
                    && let Some(constraints) = joint_constraints.as_ref()
                {
                    let mut lhs = match materialize_joint_hessian_source(
                        &joint_hessian_source,
                        total_p,
                        "joint Newton inner constrained Hessian materialization",
                    ) {
                        Ok(matrix) => matrix,
                        Err(_) => break,
                    };
                    add_joint_penalty_to_matrix(
                        &mut lhs,
                        &ranges,
                        &s_lambdas,
                        trace_diagonal_ridge,
                        joint_bundle,
                    );
                    if joint_solver_diagonal_ridge != trace_diagonal_ridge {
                        for d in 0..lhs.nrows() {
                            lhs[[d, d]] += joint_solver_diagonal_ridge - trace_diagonal_ridge;
                        }
                    }
                    check_linear_feasibility(&beta_joint, constraints, 1e-8).map_err(|e| {
                        format!("joint Newton constrained solve [cycle={cycle}]: {e}")
                    })?;
                    let warm_joint_active =
                        flatten_joint_active_set(&cached_active_sets, &block_constraints);
                    let lower_bounds = match extract_simple_lower_bounds(constraints, total_p) {
                        Ok(bounds) => bounds,
                        Err(_) => break,
                    };
                    // Newton IRLS step in absolute-β space:
                    //
                    //   β_new = H_pen⁻¹ (H_L β + ∇ℓ)
                    //
                    // where H_pen = H_L + S, derived from Newton's update
                    //   β_new = β + H_pen⁻¹(∇ℓ − Sβ)
                    //         = H_pen⁻¹(H_pen β + ∇ℓ − Sβ)
                    //         = H_pen⁻¹(H_L β + ∇ℓ).
                    //
                    // The QP `min 0.5 β' H_pen β − rhs_beta' β` has unconstrained
                    // optimum β = H_pen⁻¹ rhs_beta, so rhs_beta = H_pen β + (∇ℓ − Sβ)
                    // gives the correct Newton update. Passing raw grad_joint (=∇ℓ)
                    // would collapse to β = H_pen⁻¹ ∇ℓ, which at the true optimum
                    // (∇ℓ = Sβ̂) gives H_pen⁻¹ Sβ̂ ≠ β̂ — wrong fixed point.
                    let penalty_beta_joint = apply_joint_block_penalty(
                        &ranges,
                        &s_lambdas,
                        &beta_joint,
                        joint_mode_diagonal_ridge,
                        joint_bundle,
                    );
                    let mut rhs_step = &grad_joint - &penalty_beta_joint;
                    // Reuse the head-β Jeffreys triple (consistently attenuated in
                    // `head_jeffreys_term` — both ∇Φ and H_Φ scaled by one scalar,
                    // gam#826/#872/#715). Skipped when the cheap pre-check certifies
                    // well-conditioning: ∇Φ = 0 and H_Φ = 0 there, so neither
                    // rhs_step nor lhs change.
                    // PSD PROJECTION (gam#979). The exact divided-difference H_Φ is
                    // indefinite exactly where Φ is (mixed-sign reduced spectrum at
                    // off-mode trial points). The unconstrained dense-spectral path
                    // consumes it exactly — the Moré–Sorensen subproblem handles
                    // indefiniteness rigorously — but THIS active-set QP requires a
                    // convex model (an indefinite QP cycles its active set and the
                    // inner grinds the budget). Use the PSD part of H_Φ here: honest
                    // magnitudes (unlike the old `K²` vec-Gram phantom), guaranteed
                    // solvable QP, and the exact ∇Φ in the rhs keeps the fixed point
                    // unchanged — only the convergence rate on indefinite stretches
                    // degrades to the damped-Newton rate the constrained path always
                    // had.
                    if let Some((grad_phi, hphi)) = head_jeffreys_term.as_ref()
                        && grad_phi.len() == rhs_step.len()
                    {
                        rhs_step += grad_phi;
                        lhs += &symmetric_psd_projection(hphi);
                    }
                    // Self-vanishing Levenberg–Marquardt damping for the
                    // CONSTRAINED active-set QP, mirroring the spectral-range
                    // branch below (μ = JOINT_SPECTRAL_LEVENBERG_FACTOR·‖rhs‖∞).
                    //
                    // When the joint design carries inequality constraints
                    // (the monotone I-spline time-warp of a survival
                    // location-scale / AFT fit) the spectral range step that
                    // drops ker(H_pen) is NOT taken — this dense active-set QP
                    // runs instead. On a constant-scale AFT the 12-col monotone
                    // time-warp's non-affine deviation is statistically
                    // UNIDENTIFIED, so H_pen is rank-deficient along that gauge
                    // direction. An undamped QP then has a continuum of optima
                    // differing only by the free gauge component, and the
                    // active set slides along the monotone constraint face
                    // taking an O(1) proposal step in that direction every
                    // cycle. The proposal `step_inf` never exhausts, so the
                    // identified-subspace KKT certificate (gated on
                    // `step_inf ≤ step_tol`) never fires and the inner
                    // joint-Newton grinds the full `inner_max_cycles` on EVERY
                    // outer ρ-eval — the survival-LS AFT "hang" (#736/#735/#721).
                    //
                    // Adding μ·I to the QP Hessian gives ker(H_pen) a tiny
                    // positive curvature, so the constrained minimizer is unique
                    // and its gauge component is driven toward zero; the proposal
                    // step then exhausts at the identified-subspace optimum and
                    // the certificate fires in a handful of cycles. Because
                    // μ ∝ ‖∇L − Sβ‖∞ → 0 at the KKT fixed point, the converged β
                    // and the well-identified flexible-scale fast path (where the
                    // time-warp IS identified and H_pen is non-singular) are
                    // unchanged — a genuinely flexible survival-LS fit still
                    // performs its full search.
                    //
                    // CRITICAL: the floor is only correct on a genuinely
                    // rank-deficient `H_pen`. Gate it strictly on
                    // `nullity > 0`. On a FULLY IDENTIFIED constrained fit
                    // (e.g. the post-reduction constant-scale loglogistic AFT,
                    // #736/#735/#721/#733/#734 — a 3-parameter model with
                    // block_widths = [1,1,1] and an empty `ker(H_pen)`) the QP
                    // minimizer is already unique, so the floor adds nothing it
                    // is needed for but everything it costs: with residual r and
                    // factor 1e-3 the floor is μ≈1e-3·r, and on an unpenalized
                    // location intercept whose likelihood curvature H is small
                    // at n=23 the damped Newton component shrinks the residual
                    // only by the GEOMETRIC ratio H/(H+μ) per cycle instead of
                    // quadratically. With μ≈1e-6 and a small H that ratio is far
                    // from 1, so the threshold-block stationarity residual
                    // plateaus at ~1e-3–1e-4 and the inner solve burns its whole
                    // cycle budget without ever reaching `residual_tol`. The
                    // self-vanishing μ→0 is too slow because it vanishes only as
                    // fast as the residual it is throttling. Disabling the floor
                    // when `nullity == 0` makes the constrained QP solve the
                    // EXACT undamped Newton/KKT system, recovering quadratic
                    // convergence to `residual_tol` in a handful of cycles. The
                    // rank-deficient case (`nullity > 0`, the pre-reduction
                    // unidentified time-warp gauge) keeps the floor and its hang
                    // fix unchanged. `None` (eigensolve failed / zero Hessian)
                    // falls back to the damped path conservatively.
                    //
                    // gam#1040 (ill-conditioning extension of the nullity gate above):
                    // the survival marginal-slope joint shares one
                    // matern PC basis between the marginal and the log-slope
                    // surface, so `H_pen` is FULL RANK (`nullity == 0`) yet
                    // severely ill-conditioned (cond ≈ 5.8e6). With the floor
                    // gated on `nullity > 0` alone the undamped active-set QP has
                    // a constrained minimiser that is unique only up to round-off
                    // along the near-null mode: the active set slides an O(1)
                    // proposal step every cycle, `step_inf` never exhausts, the
                    // constrained-fixed-point / KKT certificate never fires, and
                    // the inner joint-Newton grinds the full cycle budget on EVERY
                    // outer ρ-eval (the hours-long survival-MS hang). The family
                    // opts into damping this case via
                    // `levenberg_on_ill_conditioning()`; the self-vanishing μ
                    // (∝ projected residual → 0 at the KKT fixed point) gives the
                    // near-null mode a tiny positive curvature so the minimiser is
                    // unique and `step_inf` exhausts, WITHOUT moving the converged
                    // β. Apply it only when the matrix is genuinely ill-conditioned
                    // (`cond > LEVENBERG_ILL_CONDITIONING_THRESHOLD`); a
                    // well-conditioned full-rank constrained fit (the tiny
                    // unpenalised loglogistic AFT, #736/#735/#721, where the floor
                    // would cap the convergence rate at the geometric H/(H+μ) ratio)
                    // keeps the EXACT undamped Newton/KKT solve and its quadratic
                    // convergence. `None` (eigensolve failed / zero Hessian) falls
                    // back to the damped path conservatively.
                    let (hpen_nullity, hpen_condition) =
                        match symmetric_penalized_hessian_nullity_and_condition(&lhs) {
                            Some((n, c)) => (Some(n), c),
                            None => (None, f64::INFINITY),
                        };
                    let nullity_floor = hpen_nullity.map(|n| n > 0).unwrap_or(true);
                    let ill_conditioned_floor = family.levenberg_on_ill_conditioning()
                        && hpen_nullity == Some(0)
                        && hpen_condition > LEVENBERG_ILL_CONDITIONING_THRESHOLD;
                    let apply_constrained_floor = nullity_floor || ill_conditioned_floor;
                    // Self-vanishing scale = the PROJECTED stationarity residual
                    // (`current_kkt_norm`), NOT the raw ‖∇ℓ − Sβ + ∇Φ‖∞. At a
                    // CONSTRAINED optimum the raw RHS converges to the active-set
                    // multiplier mass ‖Aᵀλ‖∞ — an O(1) quantity that never
                    // vanishes — so a floor scaled by it never lifts, throttling
                    // every weakly-curved identified direction to a geometric
                    // H/(H+μ) contraction and exhausting the inner budget with the
                    // projected residual stalled just above tolerance (#1025: the
                    // competing-risks twin time-basis fit, per_block_resid stuck at
                    // 1.457 for the full budget). The projected residual is the
                    // honest distance-from-KKT measure: it equals the raw RHS on
                    // unconstrained fits (no behavior change there) and → 0 at a
                    // constrained optimum, so the floor vanishes exactly where the
                    // comment above promises it does.
                    let rhs_inf = rhs_step.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
                    let floor_scale = if current_kkt_norm.is_finite() {
                        current_kkt_norm.min(rhs_inf)
                    } else {
                        rhs_inf
                    };
                    let constrained_levenberg_mu = JOINT_SPECTRAL_LEVENBERG_FACTOR * floor_scale;
                    if apply_constrained_floor
                        && constrained_levenberg_mu > 0.0
                        && constrained_levenberg_mu.is_finite()
                    {
                        for d in 0..lhs.nrows() {
                            lhs[[d, d]] += constrained_levenberg_mu;
                        }
                    }
                    // MODIFIED-NEWTON CONVEXIFICATION (gam#1040 / gam#979). The
                    // exact survival marginal-slope joint NLL Hessian is INDEFINITE
                    // on the flat baseline-hazard λ valley (the linear baseline +
                    // the z·exp(logslope) cross-coupling carry genuine negative
                    // curvature away from the optimum). The active-set QP below
                    // minimizes `½βᵀHβ − rhs_betaᵀβ`; with an indefinite `H` that
                    // model has a direction that LOWERS the local quadratic
                    // objective while moving AWAY from the KKT point. The
                    // trust-region wrapper gates acceptance on the objective-
                    // reduction ratio ρ — NOT on the stationarity residual — so it
                    // accepts every such step at ρ≈1 and GROWS its radius while the
                    // stationarity residual DIVERGES (the measured 3.5e4 → 9.5e6
                    // blow-up on the time block). The unconstrained dense-spectral
                    // path never exhibits this because `WhitenedHessianSpectrum`
                    // already reflects negative-curvature modes to `|γ|`; the
                    // constrained branch must do the same to its dense `lhs`.
                    // Reflecting (not clamping-to-zero) keeps the curvature
                    // magnitude so the QP stays bounded and the step length matches
                    // the dense path; at a genuine constrained optimum the reduced
                    // Hessian is PSD so this is a no-op and the converged β is
                    // unchanged.
                    //
                    // NEWTON-DECREMENT CERTIFICATE ON THE CONSTRAINED PATH
                    // (gam#1040 / gam#1088). The dense-spectral branch populates
                    // `joint_spectrum` (in its dense spectral fallback, further down
                    // in this cycle body) so the convergence loop's
                    // Newton-decrement exit can terminate the geometric/linear tail
                    // when the achievable model descent `½ Σ c_k²/|γ_k|` drops below
                    // `objective_tol`. The constrained branch never set it, so a
                    // weakly-identified survival-MS fit (the n≈2e5 logslope block,
                    // step clamped by the trust region, residual creeping ~7%/cycle)
                    // had no early-exit and ground the whole budget. Build the same
                    // D-whitened spectrum from the penalized `lhs` (decrement reflects
                    // negative modes via `.abs()` internally, so the pre-reflection
                    // `lhs` is the right input) and the augmented stationarity RHS, so
                    // the decrement read is consistent with the dense path. Diagnostic
                    // only for the convergence test — it does NOT change the QP step.
                    if let Ok(spectrum) = whitened_spectrum::WhitenedHessianSpectrum::decompose(
                        &lhs,
                        &rhs_step,
                        &joint_trust_metric_diag,
                        KKT_REFUSAL_RANK_TOL,
                    ) {
                        joint_spectrum = Some(spectrum);
                    }
                    let lhs_reflected = symmetric_negative_curvature_reflected(&lhs);
                    if cycle <= 2 {
                        let min_eval_raw = symmetric_min_eigenvalue_signed(&lhs);
                        let min_eval_refl = symmetric_min_eigenvalue_signed(&lhs_reflected);
                        log::info!(
                            "[JN-REFLECT-DIAG #1040] cycle={cycle} CONSTRAINED_QP lambda_min_signed_raw={min_eval_raw:.3e} lambda_min_signed_reflected={min_eval_refl:.3e} (reflection {})",
                            if min_eval_refl > min_eval_raw + min_eval_raw.abs() * 1e-9 {
                                "CHANGED the spectrum"
                            } else {
                                "NO-OP (already PSD)"
                            },
                        );
                    }
                    let lhs = lhs_reflected;
                    let rhs_beta = &lhs.dot(&beta_joint) + &rhs_step;
                    let solve_result = if let Some(bounds) = lower_bounds.as_ref() {
                        solve_quadratic_with_simple_lower_bounds(
                            &lhs,
                            &rhs_beta,
                            &beta_joint,
                            bounds,
                            warm_joint_active.as_deref(),
                        )
                    } else {
                        solve_quadratic_with_linear_constraints(
                            &lhs,
                            &rhs_beta,
                            &beta_joint,
                            constraints,
                            warm_joint_active.as_deref(),
                        )
                        .map_err(|e| e.to_string())
                    };
                    match solve_result {
                        Ok((beta_new, active_set)) => (beta_new, Some(active_set), 0usize),
                        Err(_) => break,
                    }
                } else {
                    // Newton right-hand side: rhs = ∇ℓ − S_λβ, i.e. the negated
                    // stationarity residual r = S_λβ − ∇ℓ of the penalized NLL.
                    let penalty_beta = apply_joint_block_penalty(
                        &ranges,
                        &s_lambdas,
                        &beta_joint,
                        joint_mode_diagonal_ridge,
                        joint_bundle,
                    );
                    let mut rhs = &grad_joint - &penalty_beta;
                    // Universal robustness: fold the family-general
                    // Jeffreys/Firth curvature `H_Φ` and score `∇Φ` into BOTH the
                    // matrix-free PCG step AND the dense spectral fallback below,
                    // scoped to the full-span basis `Z_J`. Computed ONCE here
                    // so the matvec closure and the RHS share the SAME term and the
                    // fallback does not recompute it. The inner objective is
                    // `−ℓ + ½βᵀSβ − Φ`, so the Newton system the step must solve is
                    //   (H + S_λ + H_Φ) δ = (∇ℓ − S_λβ) + ∇Φ.
                    // Previously the PCG matvec applied only `H + S_λ` and its RHS
                    // omitted `∇Φ`, so on the matrix-free path (large p / large n)
                    // Firth was a SILENT NO-OP: the proper-prior never reached the
                    // step that actually moves β, leaving separation/under-
                    // identification uncured exactly where the dense route is not
                    // taken. The dense route (small p, e.g. BMS p≈51) was already
                    // correct. `H_Φ` is the full-span Gauss-Newton surrogate
                    // `½ J H_id⁻¹ Jᵀ` (Z_J = identity ⇒ p×p, not low-rank), but the
                    // conditioning gate in `joint_jeffreys_term` returns the zero
                    // term on every well-conditioned fit, so this only arms on the
                    // near-separating span
                    // — and `hphi` is materialized once per cycle regardless, so the
                    // matvec adds only one O(p²) HVP, preserving the matrix-free
                    // path's asymptotics where Firth is negligible (term = `None`).
                    // Cheap pre-check certified well-conditioned ⇒ the exact term
                    // is the zero contribution (∇Φ = 0, H_Φ = 0). Short-circuit to
                    // `None` WITHOUT materializing the dense joint Hessian or running
                    // the O(p³) reduced eigendecomposition — this is the matrix-free
                    // PCG hot path, where forming a dense p×p H_Φ every cycle was the
                    // regression. Byte-identical to the gated-off dense path: `rhs`
                    // is left as `∇ℓ − S_λβ` and no H_Φ is folded into the matvec.
                    // Reuse the head-β Jeffreys triple (computed once this cycle);
                    // this Newton step is built at the same cycle-start β.
                    let inner_jeffreys_term: Option<(Array1<f64>, Array2<f64>)> =
                        match head_jeffreys_term.as_ref() {
                            Some((grad_phi, hphi)) if grad_phi.len() == rhs.len() => {
                                rhs += grad_phi;
                                Some((grad_phi.clone(), hphi.clone()))
                            }
                            _ => None,
                        };
                    // PSD PROJECTION for the SPD-PCG matvec (gam#979): the exact
                    // divided-difference H_Φ can be indefinite at off-mode trial
                    // points, which breaks the SPD-CG contract. The matvec uses its
                    // PSD part; the dense spectral fallback below keeps the EXACT
                    // (possibly indefinite) H_Φ — the Moré–Sorensen subproblem
                    // handles it rigorously.
                    let inner_jeffreys_hphi: Option<Arc<Array2<f64>>> = inner_jeffreys_term
                        .as_ref()
                        .map(|(_grad_phi, hphi)| Arc::new(symmetric_psd_projection(hphi)));
                    let pcg_started = std::time::Instant::now();
                    let pcg_requested = matrix_free_joint_requested && !joint_hessian_is_dense;
                    let mut spectral_nullity_for_step = 0usize;
                    let mut delta = if pcg_requested {
                        let preconditioner_diag = match &joint_hessian_source {
                            JointHessianSource::Dense(h_joint) => {
                                joint_penalty_preconditioner_diag(
                                    &h_joint.diag().to_owned(),
                                    &ranges,
                                    &s_lambdas,
                                    joint_solver_diagonal_ridge,
                                    joint_bundle,
                                )
                            }
                            JointHessianSource::Operator { diagonal, .. } => {
                                joint_penalty_preconditioner_diag(
                                    diagonal,
                                    &ranges,
                                    &s_lambdas,
                                    joint_solver_diagonal_ridge,
                                    joint_bundle,
                                )
                            }
                        };
                        // Pre-allocate the penalty workspace ONCE outside the
                        // PCG closure so each CG iter (called hundreds-to-
                        // thousands of times per outer iter at large scale)
                        // reuses the buffer instead of allocating per call.
                        // RefCell because solve_spd_pcg* expects `Fn` (immutable
                        // borrow of captures) and we need interior mutability
                        // to write into the workspace.
                        let penalty_workspace = RefCell::new(Array1::<f64>::zeros(total_p));
                        // Capture the Jeffreys/Firth curvature for the matvec. When
                        // armed (and nonzero past the conditioning gate) the PCG
                        // operator becomes `H + S_λ + H_Φ`, matching the augmented
                        // RHS `(∇ℓ − S_λβ) + ∇Φ` set above and the dense spectral
                        // fallback. `None` keeps the unaugmented matvec.
                        let pcg_hphi_dense = inner_jeffreys_hphi.clone();
                        let pcg_hphi_op = inner_jeffreys_hphi.clone();
                        match &joint_hessian_source {
                            JointHessianSource::Dense(h_joint) => {
                                crate::linalg::utils::solve_spd_pcg_with_info_into(
                                    |v, out| {
                                        // h_joint * v -> out (faer-backed, no alloc)
                                        crate::faer_ndarray::fast_av_view_into(
                                            h_joint,
                                            v,
                                            out.view_mut(),
                                        );
                                        let mut pen = penalty_workspace.borrow_mut();
                                        apply_joint_block_penalty_into(
                                            &ranges,
                                            &s_lambdas,
                                            v,
                                            joint_solver_diagonal_ridge,
                                            &mut pen,
                                            joint_bundle,
                                        );
                                        *out += &*pen;
                                        if let Some(hphi) = pcg_hphi_dense.as_ref() {
                                            *out += &hphi.dot(v);
                                        }
                                    },
                                    &rhs,
                                    &preconditioner_diag,
                                    pcg_rel_tol,
                                    JOINT_PCG_MAX_ITER_MULTIPLIER * total_p.max(1),
                                )
                                .map(|(solution, info)| {
                                    log_joint_pcg_diagnostics(
                                        cycle,
                                        total_p,
                                        total_joint_n,
                                        &preconditioner_diag,
                                        &info,
                                    );
                                    solution
                                })
                            }
                            JointHessianSource::Operator { apply_into, .. } => {
                                let apply_h_into = Arc::clone(apply_into);
                                crate::linalg::utils::solve_spd_pcg_with_info_into(
                                    |v, out| {
                                        if let Err(error) = apply_h_into(v, out) {
                                            log::warn!(
                                                "joint Newton inner operator matvec failed: {error}"
                                            );
                                            out.fill(0.0);
                                        }
                                        let mut pen = penalty_workspace.borrow_mut();
                                        apply_joint_block_penalty_into(
                                            &ranges,
                                            &s_lambdas,
                                            v,
                                            joint_solver_diagonal_ridge,
                                            &mut pen,
                                            joint_bundle,
                                        );
                                        *out += &*pen;
                                        if let Some(hphi) = pcg_hphi_op.as_ref() {
                                            *out += &hphi.dot(v);
                                        }
                                    },
                                    &rhs,
                                    &preconditioner_diag,
                                    pcg_rel_tol,
                                    JOINT_PCG_MAX_ITER_MULTIPLIER * total_p.max(1),
                                )
                                .map(|(solution, info)| {
                                    log_joint_pcg_diagnostics(
                                        cycle,
                                        total_p,
                                        total_joint_n,
                                        &preconditioner_diag,
                                        &info,
                                    );
                                    solution
                                })
                            }
                        }
                    } else {
                        None
                    };
                    if pcg_requested {
                        log::info!(
                            "[PIRLS/joint-PCG] cycle {:>3} | n={} p={} solved={} elapsed={:.3}s",
                            cycle,
                            total_joint_n,
                            total_p,
                            delta.is_some(),
                            pcg_started.elapsed().as_secs_f64()
                        );
                    }
                    if delta.is_none() {
                        if pcg_requested {
                            break;
                        }
                        let mut lhs_true = match materialize_joint_hessian_source(
                            &joint_hessian_source,
                            total_p,
                            "joint Newton inner dense fallback Hessian materialization",
                        ) {
                            Ok(matrix) => matrix,
                            Err(_) => break,
                        };
                        // Snapshot the Jeffreys information matrix only when a
                        // family supplies the contracted completion. The generic
                        // pairwise fallback costs p(p+1)/2 full second-directional
                        // Hessian passes; at biobank scale (BMS p=35, n≈196k) it
                        // turns a near-converged polishing cycle into ~50s of row
                        // work. Without a contracted hook the divided-difference
                        // H_phi model remains first-order correct and the KKT
                        // certificate owns convergence.
                        let jeffreys_completion_requested =
                            family.joint_jeffreys_information_contracted_trace_hessian_available();
                        let h_info_for_completion = (jeffreys_completion_endgame
                            && inner_jeffreys_term.is_some()
                            && jeffreys_completion_requested)
                            .then(|| family.joint_jeffreys_information_with_specs(&states, specs))
                            .transpose()?
                            .flatten();
                        add_joint_penalty_to_matrix(
                            &mut lhs_true,
                            &ranges,
                            &s_lambdas,
                            joint_mode_diagonal_ridge,
                            joint_bundle,
                        );
                        // Universal robustness: add the
                        // family-general Jeffreys curvature `H_Phi` to the
                        // penalized Hessian. This is the Tier-B coupled-Newton form
                        // of Firth: the reduced Fisher information `Z_J^T H Z_J`
                        // supplies the missing O(n) curvature that bounds a
                        // near-separating coefficient to O(1). When the Jeffreys
                        // term is unavailable, the step stays unaugmented.
                        //
                        // `∇Φ` is NOT re-added here: `rhs` (and thus `spectral_rhs`)
                        // already carries `+∇Φ` from the single shared computation
                        // above, and we REUSE that same `H_Φ` here rather than
                        // recomputing the (O(p) directional-derivative) term — the
                        // dense fallback and the matrix-free PCG step now solve the
                        // SAME Jeffreys-augmented Newton system.
                        let spectral_rhs = rhs.clone();
                        if let Some((_grad_phi, hphi)) = inner_jeffreys_term.as_ref() {
                            lhs_true += hphi;
                            // ENDGAME EXACTNESS (gam#979). The divided-difference
                            // H_Φ omits the second-directional-Hessian remainder
                            // `½ tr(K · D_ab)`; near a Firth-active mode that
                            // remainder is comparable to the kept curvature, so
                            // Newton converges only linearly (a residual sawtooth
                            // plateauing just above the certificate tolerance —
                            // enough mode noise to swamp outer finite differences
                            // and feed the IFT near-flat-kernel amplification).
                            // Once the residual enters the convergence band, add
                            // the exact completion so the model is the true
                            // Hessian of the Φ-augmented objective and the endgame
                            // is quadratic. A family contracted trace hook can
                            // supply it at any width; the pairwise `p(p+1)/2`
                            // fallback remains limited to moderate p. `None`
                            // degrades safely to the divided-difference model.
                            if let (Some(h_info), Some(z_joint)) = (
                                h_info_for_completion.as_ref(),
                                joint_jeffreys_subspace.as_ref(),
                            ) && let Some(completion) =
                                custom_family_joint_jeffreys_second_order_completion(
                                    family, &states, specs, h_info, z_joint, false,
                                )?
                            {
                                lhs_true += &completion;
                            }
                        }
                        // Single metric-whitened eigendecomposition drives BOTH the
                        // seed step and every trust-region re-solve this cycle
                        // (gam#979). The prior code ran a SECOND O(p³)
                        // eigendecomposition of the raw Hessian here purely to form
                        // the seed step — doubling the dominant per-cycle cost on the
                        // ~5 s/cycle ill-conditioned survival marginal-slope inner.
                        // The exact trust-region multiplier λ (chosen so ‖δ‖_D = r)
                        // subsumes the old self-vanishing Levenberg-μ seed: `decompose`
                        // whitens by the trust metric so the penalty (λ~e²⁴) and the
                        // likelihood scales are throttled uniformly — the scale
                        // invariance the multiplicative μ approximated. `lhs_true`
                        // already carries the penalty and the Firth/Jeffreys curvature
                        // H_Φ and `spectral_rhs` the augmented stationarity RHS, so the
                        // subproblem model matches the predicted-reduction model and the
                        // accept/reject gain ratio exactly.
                        let spectrum = whitened_spectrum::WhitenedHessianSpectrum::decompose(
                            &lhs_true,
                            &spectral_rhs,
                            &joint_trust_metric_diag,
                            KKT_REFUSAL_RANK_TOL,
                        )?;
                        // Seed = the unconstrained (Moore–Penrose, range-restricted)
                        // exact step, so cycle 0 can take the full Newton step on a
                        // well-conditioned model (the cycle-0 radius bump below relies
                        // on this); the trust loop re-solves at finite radius for every
                        // subsequent attempt. An indefinite model reflects negative
                        // curvature to |λ|, exactly as the prior spectral solve did.
                        let spectral_step = spectrum.trust_region_step(f64::INFINITY);
                        spectral_nullity_for_step = spectral_step.nullity;
                        if spectral_step.reflected_negative_modes > 0 {
                            log::info!(
                                "[PIRLS/joint-Newton] cycle {cycle:>3} | indefinite inner \
                                 Hessian: reflected {}/{} negative-curvature modes to |λ| \
                                 (λ_min={:.3e}); proceeding with modified-Newton descent step \
                                 under trust-region globalization",
                                spectral_step.reflected_negative_modes,
                                total_p,
                                spectral_step.most_negative_eigenvalue,
                            );
                        }
                        if spectral_step.nullity > 0 {
                            log::debug!(
                                "[PIRLS/joint-Newton] spectral reduced solve: nullity@{:.0e}={}/{} \
                             |P0 rhs|∞={:.3e} |P+ rhs|∞={:.3e} λ_min+={:.3e} λ_max={:.3e}",
                                spectral_step.rank_tol,
                                spectral_step.nullity,
                                total_p,
                                spectral_step.null_rhs_inf,
                                spectral_step.range_rhs_inf,
                                spectral_step.lambda_min_positive,
                                spectral_step.lambda_max_abs,
                            );
                        }
                        delta = Some(spectral_step.delta);
                        // The same factorization powers every trust-radius re-solve
                        // in the loop below (gam#979) — no second eigendecomposition.
                        joint_spectrum = Some(spectrum);
                    }

                    let Some(delta) = delta else {
                        break; // Fall back to blockwise
                    };
                    if !delta.iter().all(|v| v.is_finite()) {
                        break; // Fall back to blockwise
                    }
                    (beta_joint.clone() + &delta, None, spectral_nullity_for_step)
                };
            // Hessian-source build (and any QP solve immediately above) are
            // done by the time we reach `delta`. Capture the wall-clock
            // before the line-search phase so the end-of-cycle summary can
            // attribute time correctly between the Hessian/QP and the
            // backtracking step search.
            let hessian_and_qp_elapsed = hessian_started.elapsed();
            drop(hessian_scope_guard);
            let line_search_started = std::time::Instant::now();
            log::info!(
                "[joint-newton-tr] phase=line_search cycle={} r={:.3e} hessian_qp_elapsed={:.3}s",
                cycle,
                joint_trust_radius,
                hessian_and_qp_elapsed.as_secs_f64(),
            );
            let delta = &candidate_beta - &beta_joint;

            // Trust-region globalization for the joint Newton proposal.  The
            // previous implementation used up to eight backtracking likelihood
            // evaluations (each can build the exact joint workspace at large
            // scale).  Here the step is truncated before evaluation and the
            // single trial objective is accepted only when the actual decrease
            // is positive relative to the local quadratic model.
            let step_inf = delta.iter().copied().map(f64::abs).fold(0.0_f64, f64::max);

            let old_beta: Vec<Array1<f64>> = states.iter().map(|s| s.beta.clone()).collect();
            // Firth value Φ at the OLD (start-of-cycle) β, folded under the SAME
            // skippable gate the trial uses below — so `actual_reduction =
            // old_objective − trialobjective` compares two points on one objective
            // `−ℓ + ½βᵀSβ − Φ` (gam#826/#872). `lastobjective` is the pure
            // quadratic-penalized objective; subtract the gated old-β Φ here.
            let old_phi = if !jeffreys_skippable_this_cycle {
                joint_jeffreys_subspace
                    .as_ref()
                    .map(|z_joint| {
                        custom_family_joint_jeffreys_value(family, &states, specs, &ranges, z_joint)
                    })
                    .unwrap_or(0.0)
            } else {
                0.0
            };
            let old_objective = lastobjective - old_phi;
            // Row measure observed by the objective at β. `lastobjective` was
            // set on the previous cycle (or at function entry) under `options`;
            // see top-of-cycle capture for rationale.
            let tr_row_measure_old_objective =
                crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
            let mut accepted = false;
            let mut accepted_joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>> =
                None;
            let mut line_search_attempts = 0usize;

            // Pure Newton must take a full step on the first cycle of an
            // exact quadratic problem (i.e. converge in one cycle when the
            // model is exact). The trust-region globalization above must not
            // truncate the very first proposal merely because the hard-coded
            // initial radius (1.0) is smaller than the natural Newton-step
            // 2-norm. Bumping the radius up to the post-barrier Newton-step
            // norm on cycle 0 preserves quadratic convergence on
            // well-conditioned problems while leaving the standard adaptive
            // shrink/expand for subsequent cycles. Family feasibility
            // constraints and the adaptive trust radius remain the safeguards
            // against runaway proposals.
            if cycle == 0 && joint_step_spectral_nullity == 0 {
                let initial_block_norms = joint_trust_region_block_metric_norms(
                    &delta,
                    &ranges,
                    &joint_trust_metric_diag,
                );
                for (radius, norm) in joint_block_trust_radii.iter_mut().zip(initial_block_norms) {
                    if norm.is_finite() && norm > *radius {
                        *radius = norm;
                    }
                }
                joint_trust_radius = joint_block_trust_radii
                    .iter()
                    .copied()
                    .fold(0.0_f64, f64::max);
                if !joint_trust_radius.is_finite() || joint_trust_radius <= 0.0 {
                    joint_trust_radius = 1.0;
                }
            }

            let penalty_beta = apply_joint_block_penalty(
                &ranges,
                &s_lambdas,
                &beta_joint,
                joint_mode_diagonal_ridge,
                joint_bundle,
            );
            // Stationarity RHS for the trust-region quadratic model. When the
            // Jeffreys/Firth term is armed the inner objective is `−ℓ+½βᵀSβ−Φ`, so
            // the model RHS is `∇L − Sβ + ∇Φ` — the SAME augmented RHS the Newton
            // step solves and the H_Φ-augmented `hpen_delta` below pairs with. Using
            // the bare `∇L − Sβ` here desyncs `predicted_reduction` from the
            // augmented step + the Φ-augmented `actual_reduction`, which is what
            // froze the coupled K-block line search (gam#729/#715). No-op when the
            // term is condition-gated/unavailable (∇Φ=0).
            let mut rhs = &grad_joint - &penalty_beta;
            if let Some((grad_phi, _hphi)) = head_jeffreys_term.as_ref()
                && grad_phi.len() == rhs.len()
            {
                rhs += grad_phi;
            }
            let beta_inf = states
                .iter()
                .flat_map(|s| s.beta.iter().copied())
                .map(f64::abs)
                .fold(0.0_f64, f64::max);
            let step_tol = inner_tol * (1.0 + beta_inf);
            let objective_tol = inner_tol * (1.0 + old_objective.abs());
            // Scale the KKT residual tolerance against the natural magnitude
            // of ‖Sβ − ∇L‖∞ (i.e. max(‖∇L‖∞, ‖Sβ‖∞)), not the objective. The
            // gradient and Sβ scale independently of the likelihood — at
            // large scale with |β|∞ ~ 10²–10³ and non-trivial smoothing,
            // ‖Sβ‖∞ can sit orders of magnitude above |obj| and FP noise
            // alone keeps the residual above any obj-scaled tol, so KKT is
            // never certified even when the iterate is the true optimum.
            let grad_inf = grad_joint
                .iter()
                .map(|x: &f64| x.abs())
                .fold(0.0_f64, f64::max);
            let penalty_inf = penalty_beta
                .iter()
                .map(|x: &f64| x.abs())
                .fold(0.0_f64, f64::max);
            let residual_tol = inner_tol * (1.0 + grad_inf.max(penalty_inf));
            last_residual_tol = residual_tol;
            let current_stationarity_residual = current_kkt_norm;
            // KKT certificate: ‖∇L − Sβ‖_∞ ≤ residual_tol together with
            // ‖δ‖_∞ ≤ step_tol is sufficient first-order optimality of the
            // penalized objective; no descent direction exists from the
            // current point. Conditioning that exit on additional evidence
            // of objective progress in the previous cycle would refuse to
            // recognize convergence at a starting point that already sits
            // at the optimum (e.g. balanced data with an intercept-only
            // fit, where ∇ℓ vanishes by symmetry from cycle 0 and the
            // Newton step is identically zero so the trust-region search
            // can never produce a strictly negative actual reduction).
            if current_stationarity_residual <= residual_tol && step_inf <= step_tol {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | pre-line-search converged: proposal_inf={:.3e} (tol={:.3e}) | residual={:.3e} (tol={:.3e})",
                    cycle,
                    step_inf,
                    step_tol,
                    current_stationarity_residual,
                    residual_tol,
                );
                cached_joint_workspace = hessian_workspace_for_cycle;
                cycles_done = cycle;
                converged = true;
                break;
            }

            // Trust-region retries preserve the objective-decrease guarantee
            // when the initial radius is too optimistic. If the Newton proposal
            // is not a descent direction for the penalized quadratic model,
            // switch once to a diagonally preconditioned gradient step and keep
            // the same exact full-objective accept/reject test.
            const JOINT_TRUST_MAX_ATTEMPTS: usize = 24;
            let mut search_delta = delta.clone();
            let search_joint_active_set: Option<Vec<usize>> = joint_active_set.clone();
            let mut tried_preconditioned_descent = false;
            // Dogleg Cauchy leg (gam#826/#808). Compute the unconstrained Cauchy
            // point of the penalized (Firth-augmented) quadratic model ONCE per
            // cycle: the M-metric steepest-descent direction `p_sd = M⁻¹·rhs`
            // and its curvature `p_sd·H·p_sd` (a coupled Hessian-vector product,
            // so it must be hoisted out of the radius-shrink loop). When the
            // Newton step exceeds a block's trust radius the dogleg blends
            // toward this Cauchy leg, guaranteeing at least the Cauchy decrease
            // even when the spectral Newton step is numerically frozen at the
            // oversmoothed seed (the high-curvature log_sigma block's Newton
            // component is `O(g/λ) ≈ 5e-21`). `joint_active_set` is the
            // unconstrained joint Newton path; the constrained-QP path keeps its
            // own globalization, so the dogleg is only built (and used) when no
            // active set is in force.
            let dogleg_cauchy: Option<Array1<f64>> = if search_joint_active_set.is_none() {
                let mut p_sd = Array1::<f64>::zeros(total_p);
                for (i, (r, w)) in rhs.iter().zip(joint_trust_metric_diag.iter()).enumerate() {
                    p_sd[i] = r / positive_joint_diagonal_entry(*w);
                }
                let mut h_psd = Array1::<f64>::zeros(total_p);
                let mut cauchy_penalty_scratch = Array1::<f64>::zeros(total_p);
                match apply_joint_penalized_hessian_into_with_workspace(
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    joint_mode_diagonal_ridge,
                    &p_sd,
                    &mut h_psd,
                    &mut cauchy_penalty_scratch,
                    joint_bundle,
                ) {
                    Ok(()) => {
                        if let Some((_grad_phi, hphi)) = head_jeffreys_term.as_ref() {
                            h_psd += &hphi.dot(&p_sd);
                        }
                        let cauchy = joint_cauchy_step(&rhs, &p_sd, &h_psd);
                        if cauchy.iter().all(|v| v.is_finite()) {
                            Some(cauchy)
                        } else {
                            None
                        }
                    }
                    Err(_) => None,
                }
            } else {
                None
            };
            let mut model_rejects = 0usize;
            let mut likelihood_rejects = 0usize;
            let mut objective_rejects = 0usize;
            let mut first_likelihood_reject: Option<String> = None;
            // Coalesce consecutive trust-region attempts whose accept/reject
            // outcome and numeric signature round to the same values, so a long
            // run of identical retries collapses into a single "attempts a..b
            // (×N)" line at flush time instead of spamming one line per try.
            let mut tr_log_sig: Option<String> = None;
            let mut tr_log_first: usize = 0;
            let mut tr_log_last: usize = 0;
            // Hoist the two full-size scratch buffers used in the predicted-
            // reduction computation outside the trust-region attempt loop.
            // The loop runs up to JOINT_TRUST_MAX_ATTEMPTS times per outer
            // Newton step, so allocating these per-attempt would add O(total_p)
            // heap traffic on every radius shrink/expand iteration.
            let mut hpen_delta = Array1::<f64>::zeros(total_p);
            let mut tr_penalty_scratch = Array1::<f64>::zeros(total_p);
            for trust_attempt in 0..JOINT_TRUST_MAX_ATTEMPTS {
                line_search_attempts = trust_attempt + 1;
                accepted_joint_workspace = None;
                // Dogleg globalization (gam#826/#808): when the unconstrained
                // Newton path is in force and a finite Cauchy leg was built,
                // construct the dogleg blend of the Cauchy and Newton points at
                // the current per-block radii. Otherwise (constrained-QP path,
                // or after the preconditioned-descent fallback replaced
                // `search_delta`) fall back to box-truncating the search step.
                let mut trial_delta;
                let mut block_step_norms = if let Some(spectrum) = joint_spectrum.as_ref() {
                    // Exact Moré–Sorensen trust-region step at the current radius
                    // (gam#979). The step already lies in the `D`-metric ball, so
                    // no dogleg blend or box-truncation is applied: on a shrink the
                    // direction is RE-SOLVED (bending toward the gradient), the
                    // property the dogleg/truncation lacked. Re-solving reuses the
                    // cached factorization at O(p) cost. On the constrained path the
                    // resulting (unconstrained) step is projected back onto the cone
                    // just below (gam#1108), preserving this step's fast convergence
                    // while keeping every accepted iterate feasible.
                    trial_delta = spectrum.trust_region_step(joint_trust_radius).delta;
                    joint_trust_region_block_metric_norms(
                        &trial_delta,
                        &ranges,
                        &joint_trust_metric_diag,
                    )
                } else if let Some(cauchy) = dogleg_cauchy.as_ref()
                    && !tried_preconditioned_descent
                {
                    trial_delta = Array1::<f64>::zeros(total_p);
                    joint_dogleg_step_to_block_metric_radii(
                        &search_delta,
                        cauchy,
                        &ranges,
                        &joint_trust_metric_diag,
                        &joint_block_trust_radii,
                        &mut trial_delta,
                    )
                } else {
                    trial_delta = search_delta.clone();
                    truncate_joint_step_to_block_metric_radii(
                        &mut trial_delta,
                        &ranges,
                        &joint_trust_metric_diag,
                        &joint_block_trust_radii,
                    )
                };
                if apply_joint_feasibility_limit(family, &states, &ranges, &mut trial_delta)
                    .is_err()
                {
                    joint_trust_radius = shrink_active_joint_block_trust_radii(
                        &mut joint_block_trust_radii,
                        &block_step_norms,
                        0.25,
                    );
                    continue;
                }
                // CONSTRAINED-PATH FEASIBILITY PROJECTION (gam#1108). The
                // trust-region trial step (Moré–Sorensen / dogleg / box-trunc) is
                // taken in the UNCONSTRAINED D-metric ball, and
                // `apply_joint_feasibility_limit` is a no-op for families whose
                // `max_feasible_step_size` is `None` (e.g. `LatentSurvivalFamily`),
                // so the step can cross the monotone time-derivative cone `Aβ ≥ b`.
                // The next cycle's `check_linear_feasibility` gate would then reject
                // the accepted iterate — the interval-censored survival warm-start
                // abort. Project the trial iterate back onto the cone with the exact
                // identity-Hessian active-set projection, preserving the trust
                // step's fast convergence while guaranteeing every accepted iterate
                // is feasible. No-op when the joint design is unconstrained or the
                // trial is already feasible; `block_step_norms` is recomputed from
                // the projected step just below so the trust-radius bookkeeping
                // stays consistent.
                if let Some(constraints) = joint_constraints.as_ref() {
                    let trial_beta = &beta_joint + &trial_delta;
                    if check_linear_feasibility(&trial_beta, constraints, 1e-8).is_err()
                        && let Some(projected) =
                            crate::solver::active_set::project_point_strictly_into_feasible_cone(
                                &trial_beta,
                                constraints,
                            )
                    {
                        trial_delta = &projected - &beta_joint;
                    }
                }
                block_step_norms = joint_trust_region_block_metric_norms(
                    &trial_delta,
                    &ranges,
                    &joint_trust_metric_diag,
                );
                let step_norm = block_step_norms.iter().copied().fold(0.0_f64, f64::max);
                let trial_step_inf = trial_delta
                    .iter()
                    .copied()
                    .map(f64::abs)
                    .fold(0.0_f64, f64::max);
                let step_hit_trust_boundary = block_step_norms
                    .iter()
                    .zip(&joint_block_trust_radii)
                    .any(|(step_norm, radius)| {
                        joint_block_step_hit_trust_boundary(*step_norm, *radius)
                    });
                // Predicted reduction must use the TRUE penalized Hessian
                // (the one that appears in `f(β) = -ℓ + ½βᵀSβ + ½·joint_mode_diagonal_ridge·‖β‖²`),
                // NOT the SPD-stabilized version. The stabilizing shift
                // in `joint_solver_diagonal_ridge` is purely a solver-side
                // tool to make the Newton system invertible when H_NLL
                // has negative eigenvalues; it is not part of the true
                // objective the trial-likelihood evaluator computes.
                //
                // If we use `joint_solver_diagonal_ridge` here, then for
                // any Newton step lying in null(H_true) (e.g. the
                // marginal-block cancellation direction in the saturated
                // probit regime — see
                // `marginal_block_hessian_cancels_in_saturated_regime`),
                // predicted = ½·rhs·δ while actual = rhs·δ, giving ρ = 2
                // exactly. The trust-region loop then accepts the step
                // (ρ > 0.75 expands the radius), and the same regime
                // repeats every cycle — exactly the large-scale-saturated
                // failure trace. Pinned by
                // `ridge_stabilization_gap_produces_exact_rho_two_in_null_direction`.
                //
                // `hpen_delta` and `tr_penalty_scratch` are hoisted outside
                // this loop; the workspace variant reuses them without
                // allocating per attempt.
                hpen_delta.fill(0.0);
                if apply_joint_penalized_hessian_into_with_workspace(
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    joint_mode_diagonal_ridge,
                    &trial_delta,
                    &mut hpen_delta,
                    &mut tr_penalty_scratch,
                    joint_bundle,
                )
                .is_err()
                {
                    break;
                }
                // JEFFREYS/FIRTH CURVATURE IN THE TRUST-REGION MODEL (gam#729/#715).
                // When the Jeffreys term is armed, the inner objective the merit
                // (`trialobjective = −ℓ + ½βᵀSβ − Φ`) measures and the Newton step
                // (`(H+Sλ+H_Φ)δ = ∇L−Sβ+∇Φ`) target both include the Firth term, so
                // the trust-region quadratic model's curvature MUST include `H_Φδ`
                // too. Omitting it (bare `(H+Sλ)δ`) makes `predicted_reduction`
                // inconsistent with the H_Φ-augmented `rhs` and the Φ-augmented
                // `actual_reduction`: for a coupled K-block family near the Firth
                // optimum (residual floored at ‖∇Φ‖) the resulting trust_ratio is
                // wrong, the line search rejects the genuine descent step (accepts
                // ~0), and β freezes with the residual stalled at a constant ≫ tol
                // — the unbounded-cycle non-convergence the inner solve exhibits on
                // the Dirichlet/multinomial fits. Adding `H_Φδ` makes the model
                // curvature match the augmented system the step solves and the
                // merit the accept test uses, so the step is accepted and the
                // residual descends. No-op when the term is condition-gated (∇Φ=0,
                // H_Φ=0) or unavailable.
                if let Some((_grad_phi, hphi)) = head_jeffreys_term.as_ref() {
                    let hphi_delta = hphi.dot(&trial_delta);
                    hpen_delta += &hphi_delta;
                }
                let predicted_reduction =
                    joint_quadratic_predicted_reduction(&rhs, &hpen_delta, &trial_delta);
                let linearized_next_kkt_inf = hpen_delta
                    .iter()
                    .zip(rhs.iter())
                    .map(|(hpen, rhs)| (hpen - rhs).abs())
                    .fold(0.0_f64, f64::max);
                // Reject only non-descent directions on the quadratic model.
                // A small-but-positive predicted reduction is what Newton
                // *should* produce near the optimum of a large-magnitude
                // objective: ½δᵀHδ scales with curvature×step², so it can be
                // far below the (relative) objective_tol = inner_tol·(1+|obj|)
                // while still being a correct Newton step. Trust-region ρ
                // shrink/expand handles small-but-valid Newton steps; the
                // preconditioned branch below is only for model-invalid
                // directions, and preserves linear constraints when present.
                //
                // NEAR-FLOOR CARVE-OUT (gam#787 binary matern centers=12). When
                // the Newton proposal is already at the step-tolerance floor —
                // `step_inf ≤ 4·step_tol`, the same round-off band the cert path
                // uses — the iterate is doing KKT polishing on a flat objective,
                // not global descent: there `predicted_reduction = rhs·δ − ½δᵀHδ`
                // is two near-equal O(step²) quantities and its SIGN is round-off
                // noise (a true Newton step gives +½δᵀHδ but the damped/range-
                // restricted spectral solve leaves rhs·δ a hair below ½δᵀHδ). The
                // `predicted_reduction ≤ 0` branch then mistook this for a model-
                // invalid direction and substituted `joint_preconditioned_descent_delta`,
                // a step sized for OBJECTIVE descent (diagonal-preconditioned
                // gradient, O(900×) larger than the polishing proposal). That step
                // bought a round-off-level objective gain but catapulted the KKT
                // residual off a near-converged iterate (‖∇L−Sβ‖ 1.7e-4 → 4.7e-1),
                // which then never recovered — every later cycle re-triggered the
                // same substitution (proposal stays pred≤0), pinning the residual
                // far above tol until the cycle budget exhausted → seed rejected →
                // hard raise. At the step floor we instead take the tiny proposal
                // as-is and let the trust-region noise-floor guard accept it at
                // ρ=1 (it neither helps nor hurts the objective beyond round-off),
                // so the inner keeps polishing the KKT residual to tol.
                let proposal_at_step_floor = joint_proposal_at_step_floor(step_inf, step_tol);
                if (!predicted_reduction.is_finite() || predicted_reduction <= 0.0)
                    && !proposal_at_step_floor
                {
                    model_rejects += 1;
                    // CONSTRAINED-PATH GUARD (#1108). The preconditioned-descent
                    // substitution replaces `search_delta` with an UNCONSTRAINED
                    // diagonally-preconditioned gradient step (`δ = M⁻¹·rhs`). That
                    // direction respects neither the active set nor the linear
                    // inequality cone `Aβ ≥ b`, and nothing downstream re-projects
                    // it: a constrained family that maintains feasibility purely
                    // through the QP (e.g. `LatentSurvivalFamily`, whose
                    // `max_feasible_step_size` is `None` and whose
                    // `post_update_block_beta` is the identity) has no barrier clip
                    // in `apply_joint_feasibility_limit` to pull the gradient step
                    // back onto the monotone time-derivative cone. The trial β then
                    // leaves the cone, the objective-descent test ACCEPTS it (the
                    // gradient step does lower the unconstrained merit), and the
                    // NEXT cycle's `check_linear_feasibility` rejects the accepted
                    // iterate as an "infeasible iterate" (raw `Aβ−b` violation
                    // ~5.5e-3) — aborting the whole interval-censored warm start.
                    // The QP's `search_delta` is a feasible-to-feasible chord
                    // (`candidate_beta − beta_joint`, both endpoints in the convex
                    // cone), so box-truncating it to a SMALLER trust radius keeps
                    // every sub-step feasible. On the constrained path we therefore
                    // never swap in the unconstrained descent direction; we only
                    // shrink the radius and re-truncate the constrained chord. The
                    // comment on the preconditioned branch already promised it
                    // "preserves linear constraints when present" — this makes the
                    // implementation honor that contract.
                    let constrained_path_active = search_joint_active_set.is_some();
                    if !tried_preconditioned_descent && !constrained_path_active {
                        match joint_preconditioned_descent_delta(
                            &joint_hessian_source,
                            &ranges,
                            &s_lambdas,
                            joint_solver_diagonal_ridge,
                            &rhs,
                            joint_bundle,
                        ) {
                            Ok(descent_delta) => {
                                search_delta = descent_delta;
                            }
                            Err(_) => {
                                joint_trust_radius = shrink_active_joint_block_trust_radii(
                                    &mut joint_block_trust_radii,
                                    &block_step_norms,
                                    0.25,
                                );
                            }
                        }
                        tried_preconditioned_descent = true;
                    } else {
                        joint_trust_radius = shrink_active_joint_block_trust_radii(
                            &mut joint_block_trust_radii,
                            &block_step_norms,
                            0.25,
                        );
                    }
                    continue;
                }

                for b in 0..specs.len() {
                    let (start, end) = ranges[b];
                    let mut trial_beta = old_beta[b].clone();
                    trial_beta += &trial_delta.slice(ndarray::s![start..end]);
                    let projected =
                        family.post_update_block_beta(&states, b, &specs[b], trial_beta.clone())?;
                    reject_constrained_post_update_repair(
                        b,
                        &specs[b],
                        &trial_beta,
                        &projected,
                        block_constraints[b].as_ref(),
                    )?;
                    states[b].beta.assign(&projected);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                let mut trial_penalty = total_quadratic_penalty(
                    &states,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    joint_bundle,
                    Some(specs),
                );
                // Jeffreys objective contribution at the trial point keeps the
                // accept/reject objective consistent with the Jeffreys-modified
                // Newton step. `states` already holds the trial coefficients
                // (assigned + eta-refreshed above). No-op when the Jeffreys term
                // is unavailable or condition-gated to zero. When the cheap pre-
                // check certified this cycle well-conditioned, the step used H_Φ=0
                // / ∇Φ=0, so the consistent accept/reject objective also uses Φ=0:
                // skipping here keeps value and step on the SAME objective (the
                // value/step consistency the term exists to enforce) and avoids the
                // dense H/eigh at the trial point. The 8× conditioning margin makes
                // a single damped Newton step incapable of crossing the gate.
                // SUBTRACT Φ: the inner NLL objective is `−ℓ + ½βᵀSβ − Φ` (Firth
                // adds ½log|I| to the log-likelihood). Must match the cycle-0
                // baseline, the Newton step, and the KKT residual — INCLUDING the
                // `jeffreys_skippable_this_cycle` gate, so that on a well-conditioned
                // cycle the trial, the step (H_Φ=0/∇Φ=0), and the residual all sit
                // on the SAME Φ=0 objective (gam#729/#715 sign fix; the baseline and
                // post-accept folds carry the matching skippable gate).
                if !jeffreys_skippable_this_cycle
                    && let Some(z_joint) = joint_jeffreys_subspace.as_ref()
                {
                    trial_penalty -= custom_family_joint_jeffreys_value(
                        family, &states, specs, &ranges, z_joint,
                    );
                }
                // Cheap-LL line-search path: rejected backtracking attempts
                // discard the exact-Newton workspace they build, so we evaluate
                // just the scalar full-data log-likelihood for the accept/reject
                // decision and only build the full state once the step is
                // accepted (via the gradient reload below).
                //
                // EARLY-EXIT THRESHOLD MUST BOUND THE NLL, NOT THE FULL OBJECTIVE
                // (was a stall — gam#787/#785, duchon centers≥20). The family's
                // `bernoulli_margslope_line_search_ll_with_early_exit` short-
                // circuits the row sweep when the accumulated `-Σ wᵢ log CDF` (the
                // NLL ALONE — no penalty, no Jeffreys Φ) exceeds the threshold; its
                // monotone-lower-bound proof is valid only for the NLL term. But the
                // accept test is on the FULL augmented objective
                // `F = -ℓ + ½βᵀSβ − Φ_trial`, accepted iff `F ≤ old_objective + slack`,
                // i.e. iff `-ℓ_trial ≤ old_objective + slack − penalty_trial`. Passing
                // the full `old_objective` as the NLL threshold therefore over-rejects
                // by exactly `penalty_trial`: where the trial penalty is NEGATIVE
                // (the Jeffreys term subtracts Φ, and `½βᵀSβ` can be net-negative
                // under the reparam) the NLL threshold sits BELOW the true accept
                // bound, so the early exit kills net-descent steps the trust region
                // would accept — every backtracking attempt false-rejects, the radius
                // collapses, and the inner exits non-converged at cycle ~2 (seed
                // rejected pre-solver → hard raise, β pinned). Subtract the trial
                // penalty so the threshold is the NLL the trial must beat.
                let line_search_options =
                    coefficient_line_search_options(options, old_objective + 1e-10 - trial_penalty);
                let trial_ll =
                    match joint_line_search_log_likelihood(family, &line_search_options, &states) {
                        Ok((value, workspace)) => {
                            accepted_joint_workspace = workspace;
                            value
                        }
                        Err(e) => {
                            likelihood_rejects += 1;
                            if first_likelihood_reject.is_none() {
                                first_likelihood_reject = Some(e);
                            }
                            for (b, old) in old_beta.iter().enumerate() {
                                states[b].beta.assign(old);
                            }
                            refresh_all_block_etas(family, specs, &mut states)?;
                            joint_trust_radius = shrink_active_joint_block_trust_radii(
                                &mut joint_block_trust_radii,
                                &block_step_norms,
                                0.25,
                            );
                            continue;
                        }
                    };
                let trialobjective = -trial_ll + trial_penalty;
                // Row measure observed by the trial objective at β + δ. The
                // line-search helper above runs under `coefficient_line_search_options`,
                // which now preserves `outer_score_subsample` and disables
                // any further auto-install; if either contract is broken the
                // id will diverge from `tr_row_measure_top` and we Err below.
                let tr_row_measure_trial =
                    crate::solver::row_measure::RowMeasure::from_options(options, total_joint_n);
                // Hard invariant: the trust-region ratio numerator (objective
                // at β minus trial at β+δ) and denominator (rhs·δ − ½δᵀH δ)
                // MUST share a row measure with the Hessian/gradient build.
                // Bubble out via `Err` rather than panic; this function
                // already returns `Result<_, String>`.
                let top_id = tr_row_measure_top.id;
                if tr_row_measure_hessian.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         Hessian id 0x{:016x} differs from top-of-cycle id 0x{:016x} \
                         (cycle {}); the joint Hessian was built against a different \
                         row mask than the trust-region globalization captured at the \
                         top of the cycle. ρ would compare ½δᵀHδ on one measure to \
                         F(β)−F(β+δ) on another.",
                        tr_row_measure_hessian.id, top_id, cycle
                    ));
                }
                if tr_row_measure_gradient.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         gradient id 0x{:016x} differs from top-of-cycle id 0x{:016x} \
                         (cycle {}); `cached_joint_gradient` was loaded against a \
                         different row mask than the trust-region globalization \
                         captured at the top of the cycle. rhs·δ in the predicted \
                         reduction would not match the rest of the ρ inputs.",
                        tr_row_measure_gradient.id, top_id, cycle
                    ));
                }
                if tr_row_measure_old_objective.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         objective-at-β id 0x{:016x} differs from top-of-cycle id \
                         0x{:016x} (cycle {}); `lastobjective` was computed against \
                         a different row mask than the trust-region globalization \
                         captured at the top of the cycle.",
                        tr_row_measure_old_objective.id, top_id, cycle
                    ));
                }
                if tr_row_measure_trial.id != top_id {
                    return Err(format!(
                        "trust-region row-measure invariant violated: \
                         trial-objective id 0x{:016x} differs from top-of-cycle id \
                         0x{:016x} (cycle {}, attempt {}); the line-search trial \
                         likelihood evaluated against a different row mask than the \
                         Hessian/gradient/old-objective build. Cf. \
                         `coefficient_line_search_options` and \
                         `install_auto_outer_subsample_options`.",
                        tr_row_measure_trial.id, top_id, cycle, trust_attempt
                    ));
                }
                let actual_reduction = old_objective - trialobjective;
                let trust_update = update_joint_trust_region_radius(
                    joint_trust_radius,
                    step_norm,
                    actual_reduction,
                    predicted_reduction,
                    old_objective,
                );
                let old_radius = joint_trust_radius;
                // Classify the outcome of this attempt so the diagnostic line
                // says *why* the step was taken or rejected rather than just
                // dumping numbers. The four phases partition the post-log
                // branches below; computing them up front lets the log line
                // and the dispatch agree.
                let floor_reached = trust_update.accepted
                    && current_stationarity_residual <= residual_tol
                    && joint_objective_floor_reached(
                        old_objective,
                        trialobjective,
                        actual_reduction,
                        predicted_reduction,
                        objective_tol,
                    );
                let roundoff_slack = joint_objective_roundoff_slack(old_objective, trialobjective);
                let secondary_ok = !floor_reached
                    && trialobjective.is_finite()
                    && trust_update.accepted
                    && trialobjective <= old_objective + roundoff_slack;
                let phase: &'static str = if floor_reached {
                    "converged"
                } else if secondary_ok {
                    "accepted"
                } else if trust_update.accepted {
                    "stall"
                } else {
                    "reject"
                };
                if floor_reached || secondary_ok {
                    for (block_radius, block_step_norm) in joint_block_trust_radii
                        .iter_mut()
                        .zip(block_step_norms.iter())
                    {
                        let block_update = update_joint_trust_region_radius(
                            *block_radius,
                            *block_step_norm,
                            actual_reduction,
                            predicted_reduction,
                            old_objective,
                        );
                        if block_update.radius >= *block_radius
                            || joint_block_step_hit_trust_boundary(*block_step_norm, *block_radius)
                        {
                            *block_radius = block_update.radius;
                        }
                    }
                    joint_trust_radius = joint_block_trust_radii
                        .iter()
                        .copied()
                        .fold(0.0_f64, f64::max);
                } else {
                    joint_trust_radius = shrink_active_joint_block_trust_radii(
                        &mut joint_block_trust_radii,
                        &block_step_norms,
                        0.25,
                    );
                }
                let radius_held =
                    (joint_trust_radius - old_radius).abs() <= 1e-12 * old_radius.abs().max(1.0);
                let joint_math = JointNewtonMathDiagnostic {
                    old_kkt_inf: current_kkt_norm,
                    linearized_next_kkt_inf,
                    predicted_reduction,
                    actual_reduction,
                    trust_ratio: trust_update.rho,
                    step_inf: trial_step_inf,
                    proposal_inf: step_inf,
                };
                let radius_field = if radius_held {
                    format!("r={:.3e} (held)", old_radius)
                } else {
                    format!("r={:.3e}->{:.3e}", old_radius, joint_trust_radius)
                };
                // Surface the TR-policy decision so future failures
                // distinguish "TR is throttling Newton" from "TR is not
                // the bottleneck — Newton itself finds short steps".
                // For the large-scale linear-convergence pattern the policy
                // is consistently `hold_inside` (ρ≈1, |δ| ≪ radius),
                // which proves the TR is not what is keeping the step
                // small — that came up before via "(held)" alone but
                // the explicit decision label makes the inference
                // immediate instead of requiring step/radius arithmetic
                // in the reader's head.
                let tr_attempt_sig = format!(
                    "{:<9}  ρ={:+.3e}  Δobj={:+.3e}  pred={:+.3e}  {}  decision={:<22}  |δ|={:.3e}  |δ|∞={:.3e}  |prop|∞={:.3e}",
                    phase,
                    trust_update.rho,
                    actual_reduction,
                    predicted_reduction,
                    radius_field,
                    trust_update.decision.label(),
                    step_norm,
                    trial_step_inf,
                    step_inf,
                );
                match tr_log_sig.as_deref() {
                    Some(prev) if prev == tr_attempt_sig.as_str() => {
                        tr_log_last = line_search_attempts;
                    }
                    Some(prev) => {
                        if tr_log_first == tr_log_last {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                                cycle,
                                tr_log_first,
                                prev,
                            );
                        } else {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                                cycle,
                                tr_log_first,
                                tr_log_last,
                                tr_log_last - tr_log_first + 1,
                                prev,
                            );
                        }
                        tr_log_sig = Some(tr_attempt_sig);
                        tr_log_first = line_search_attempts;
                        tr_log_last = line_search_attempts;
                    }
                    None => {
                        tr_log_sig = Some(tr_attempt_sig);
                        tr_log_first = line_search_attempts;
                        tr_log_last = line_search_attempts;
                    }
                }
                if floor_reached {
                    if let Some(sig) = tr_log_sig.take() {
                        if tr_log_first == tr_log_last {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                                cycle,
                                tr_log_first,
                                sig,
                            );
                        } else {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                                cycle,
                                tr_log_first,
                                tr_log_last,
                                tr_log_last - tr_log_first + 1,
                                sig,
                            );
                        }
                    }
                    for (b, old) in old_beta.iter().enumerate() {
                        states[b].beta.assign(old);
                    }
                    refresh_all_block_etas(family, specs, &mut states)?;
                    last_joint_math = Some(joint_math);
                    accepted = true;
                    converged = true;
                    break;
                }
                if secondary_ok {
                    if let Some(sig) = tr_log_sig.take() {
                        if tr_log_first == tr_log_last {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                                cycle,
                                tr_log_first,
                                sig,
                            );
                        } else {
                            log::info!(
                                "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                                cycle,
                                tr_log_first,
                                tr_log_last,
                                tr_log_last - tr_log_first + 1,
                                sig,
                            );
                        }
                    }
                    current_penalty = trial_penalty;
                    if let Some(joint_active_set) = search_joint_active_set.as_ref() {
                        cached_active_sets =
                            scatter_joint_active_set(joint_active_set, &block_constraints);
                    }
                    last_joint_math = Some(joint_math);
                    last_accepted_hit_joint_trust_boundary = step_hit_trust_boundary;
                    accepted = true;
                    break;
                }
                for (b, old) in old_beta.iter().enumerate() {
                    states[b].beta.assign(old);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                objective_rejects += 1;
            }
            if let Some(sig) = tr_log_sig.take() {
                if tr_log_first == tr_log_last {
                    log::info!(
                        "[PIRLS/joint-Newton/TR cycle={} attempt={}] {}",
                        cycle,
                        tr_log_first,
                        sig,
                    );
                } else {
                    log::info!(
                        "[PIRLS/joint-Newton/TR cycle={} attempts={}..{} ×{}] {}",
                        cycle,
                        tr_log_first,
                        tr_log_last,
                        tr_log_last - tr_log_first + 1,
                        sig,
                    );
                }
            }
            let line_search_elapsed = line_search_started.elapsed();
            if accepted && converged {
                log::info!(
                    "[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=true hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} reject_model={} reject_likelihood={} reject_objective={} first_likelihood_reject={} grad_reload=0.000s total={:.3}s",
                    cycle,
                    hessian_and_qp_elapsed.as_secs_f64(),
                    line_search_elapsed.as_secs_f64(),
                    line_search_attempts,
                    model_rejects,
                    likelihood_rejects,
                    objective_rejects,
                    first_likelihood_reject.as_deref().unwrap_or("none"),
                    cycle_started.elapsed().as_secs_f64(),
                );
                cached_joint_workspace = hessian_workspace_for_cycle;
                cycles_done = cycle + 1;
                break;
            }
            if !accepted {
                // Retry the joint Newton loop from the same state after a
                // failed trust-region search. Falling through into blockwise
                // would switch a coupled exact-Hessian problem onto a
                // principal-block surrogate, which is the ridge-drift failure
                // mode this path is meant to avoid. The trust-region radius
                // already collapsed via the attempt loop's shrink rules, so
                // the next cycle's Newton proposal will be evaluated under
                // a tighter L2 bound without any parallel adaptation here.
                log::info!(
                    "[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=false hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} reject_model={} reject_likelihood={} reject_objective={} first_likelihood_reject={} grad_reload=0.000s total={:.3}s",
                    cycle,
                    hessian_and_qp_elapsed.as_secs_f64(),
                    line_search_elapsed.as_secs_f64(),
                    line_search_attempts,
                    model_rejects,
                    likelihood_rejects,
                    objective_rejects,
                    first_likelihood_reject.as_deref().unwrap_or("none"),
                    cycle_started.elapsed().as_secs_f64(),
                );
                // Restore original betas
                for (b, old) in old_beta.iter().enumerate() {
                    states[b].beta.assign(old);
                }
                refresh_all_block_etas(family, specs, &mut states)?;
                // If the previous cycle's bookkeeping certified KKT
                // stationarity (residual ≤ tol and objective change ≤
                // tol), the line-search failure here is round-off on a
                // rank-deficient null mode rather than non-convergence:
                // the proposed `H⁻¹ g` step stays O(1) along the null
                // direction at the optimum, every trial moves β along
                // it without changing the objective, and round-off
                // flips the sign of `actual − predicted` so the
                // sufficient-decrease check rejects every trial. The
                // iterate ALREADY satisfies the first-order optimality
                // conditions; we accept that as convergence rather
                // than fail the outer "inner solve did not converge"
                // panic on a fully resolved fit.
                if last_cycle_residual_below_tol && last_cycle_obj_change_below_tol {
                    converged = true;
                    break;
                }
                // Fully-rejected stall guard. See the constant declaration
                // at the top of this function for the full rationale. The
                // condition is: every trust attempt this cycle was rejected by
                // SOME path (model OR likelihood OR objective; the three reject
                // counters partition the JOINT_TRUST_MAX_ATTEMPTS attempts) AND
                // the joint trust radius did not shrink relative to the previous
                // fully-rejected cycle. Both together prove the next cycle's
                // Newton system, trust radius, and trust-region search are
                // bytewise identical to this cycle's — there is no descent
                // direction the local quadratic model can reconcile at this β.
                //
                // The earlier form required objective_rejects ==
                // JOINT_TRUST_MAX_ATTEMPTS && likelihood_rejects == 0, so it
                // NEVER fired on the biobank gauge-flat marginal/logslope fit:
                // there the objective is flat to f64 precision along the
                // residual direction and the BMS line search rejects every
                // trial on the *likelihood* early-exit path
                // (likelihood_rejects == 24), so the stall guard's increment
                // condition was unreachable and the loop spun to its cap. A
                // full rejection by the likelihood path at a collapsed trust
                // radius is the same numerically-flat-no-descent stall as a
                // full objective rejection; counting either lets the guard fire.
                let all_attempts_rejected = model_rejects + likelihood_rejects + objective_rejects
                    == JOINT_TRUST_MAX_ATTEMPTS;
                let radius_held_since_last_reject = match prev_rejected_trust_radius {
                    Some(prev) => {
                        joint_trust_radius.is_finite()
                            && prev.is_finite()
                            && joint_trust_radius >= prev * (1.0 - 1e-12)
                    }
                    None => false,
                };
                if all_attempts_rejected && radius_held_since_last_reject {
                    consecutive_held_rejected_cycles =
                        consecutive_held_rejected_cycles.saturating_add(1);
                } else {
                    consecutive_held_rejected_cycles = 0;
                }
                prev_rejected_trust_radius = Some(joint_trust_radius);
                if consecutive_held_rejected_cycles >= FULLY_REJECTED_STALL_MAX_CYCLES {
                    let last_math_summary = last_joint_math
                        .as_ref()
                        .map(|math| {
                            format!(
                                "last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
                                math.old_kkt_inf,
                                math.linearized_next_kkt_inf,
                                math.actual_reduction,
                                math.predicted_reduction,
                                math.trust_ratio,
                                math.scalar_model_relative_error(),
                                math.step_inf,
                                math.proposal_inf,
                            )
                        })
                        .unwrap_or_else(|| "last_newton_math=<none>".to_string());
                    log::warn!(
                        "[PIRLS/joint-Newton convergence] cycle {:>3} | fully-rejected stall \
                         early-exit: every trust-region attempt rejected (by any of the model / \
                         likelihood / objective paths) for {} consecutive cycles with joint trust \
                         radius held at {:.3e} throughout. Reverted β + held trust radius mean the \
                         next cycle's Newton step is byte-identical to this one's; no descent \
                         direction is reachable from this iterate under the current local model. \
                         {}. Checking identified-subspace stationarity before declaring \
                         non-convergence.",
                        cycle,
                        consecutive_held_rejected_cycles,
                        joint_trust_radius,
                        last_math_summary,
                    );
                    // Judge convergence on the IDENTIFIED (range) subspace
                    // before declaring non-convergence. A fully-rejected stall
                    // at a collapsed trust radius (every trial rejected at
                    // ~noise, ΔNLL ≈ 1 ULP) is the PROOF the descent direction
                    // is gauge-flat: the raw KKT residual (the biobank fit's
                    // 0.5) lives in the unidentified ker(H_pen) direction (the
                    // gauge-flat marginal/logslope coupling, same family as the
                    // c5d327ba4 separation false-positive), which the outer IFT
                    // pseudo-inverse projects out. Reuse the EXACT machinery the
                    // normal converged path uses (gam#979 commit 09b584024):
                    // the active-set-projected stationarity vector
                    // (`exact_newton_joint_projected_stationarity_vector_from_gradient`)
                    // restricted to range(H+Sλ) via
                    // `projected_residual_range_space_inf`. If the identified-
                    // subspace residual is at tolerance the fit IS at a
                    // numerically-stationary penalized optimum and must be
                    // RETURNED converged; only if it is ALSO above tol is this a
                    // genuine non-convergence. `cached_joint_gradient` was loaded
                    // at the cycle-entry β, which is exactly the reverted
                    // `old_beta` here, so the residual is evaluated at the
                    // returned iterate.
                    let stall_converged_on_identified_subspace = match cached_joint_gradient
                        .as_ref()
                    {
                        Some(stall_gradient) => {
                            match exact_newton_joint_projected_stationarity_vector_from_gradient(
                                stall_gradient,
                                &states,
                                specs,
                                &s_lambdas,
                                ridge,
                                options.ridge_policy,
                                &block_constraints,
                                Some(cached_active_sets.as_slice()),
                            ) {
                                Ok(stall_projected_residual_vec) => {
                                    projected_residual_range_space_inf(
                                        &stall_projected_residual_vec,
                                        &joint_hessian_source,
                                        &ranges,
                                        &s_lambdas,
                                        ridge,
                                        options.ridge_policy,
                                        total_p,
                                    )
                                    .filter(|range_residual| range_residual.is_finite())
                                    .filter(|range_residual| *range_residual <= last_residual_tol)
                                }
                                Err(_) => None,
                            }
                        }
                        None => None,
                    };
                    if let Some(stall_range_residual) = stall_converged_on_identified_subspace {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | fully-rejected stall \
                             resolved as identified-subspace KKT convergence (gam#979): every \
                             trust-region attempt rejected for {} cycles at trust radius {:.3e} \
                             (objective flat to f64 precision along the proposal — the proof the \
                             descent direction is gauge-flat), but the range-space \
                             (identified-subspace) residual {:.3e} ≤ tol {:.3e}; the leftover raw \
                             residual lives entirely in the unidentified ker(H_pen) gauge mode the \
                             outer IFT projects out (gam#553). The iterate is at a \
                             numerically-stationary penalized optimum — returning converged.",
                            cycle,
                            consecutive_held_rejected_cycles,
                            joint_trust_radius,
                            stall_range_residual,
                            last_residual_tol,
                        );
                        if stall_range_residual.is_finite() {
                            min_certified_residual =
                                min_certified_residual.min(stall_range_residual);
                        }
                        converged = true;
                        break;
                    }
                    converged = false;
                    break;
                }
                // CONTINUE rather than break (gam#826/#872/#715). The comment
                // above documents the intent — "retry the joint Newton loop from
                // the same state after a failed trust-region search" — but the old
                // code BROKE instead, giving up after a SINGLE cycle of failed line
                // search. On a severely near-separating coupled fit (matern
                // binomial location-scale, quasi-separating multinomial, flexible
                // linkwiggle) the cycle-0 Newton proposal is huge (the separation
                // gradient ÷ the Firth-bounded curvature), the trust region clamps
                // it, and the clamped step does not yet reduce the merit — so the
                // FIRST cycle's backtracking exhausts without acceptance. The
                // attempt loop already shrank `joint_trust_radius` /
                // `joint_block_trust_radii` (carried across cycles), so the NEXT
                // cycle re-proposes under the tighter radius and eventually accepts
                // a productive step — standard trust-region globalization. Breaking
                // at cycle 0 aborted the coupled solve ("exited the joint Newton
                // path before convergence — no math snapshot") before the trust
                // region could adapt. The inner cycle cap and the residual-stall /
                // trust-region-floor guards above still bound the loop, so a
                // genuinely stuck fit exits with a diagnosed non-convergence rather
                // than spinning. Falling through to blockwise (the old `break`)
                // would switch the coupled exact-Hessian problem onto a
                // principal-block surrogate (the ridge-drift mode this path avoids).
                continue;
            }

            let grad_reload_started = std::time::Instant::now();
            log::info!(
                "[joint-newton-tr] phase=gradient_reload cycle={} attempts={} r={:.3e}",
                cycle,
                line_search_attempts,
                joint_trust_radius,
            );
            let (log_likelihood, gradient, eval, workspace) = load_joint_gradient_evaluation(
                family,
                specs,
                options,
                &states,
                joint_workspace_requested,
                accepted_joint_workspace.take(),
            )?;
            let grad_reload_elapsed = grad_reload_started.elapsed();
            // Reset the fully-rejected stall guard's bookkeeping: an accepted
            // cycle moved β and may have grown the trust radius, so the next
            // rejected-cycle comparison must start fresh rather than carry
            // forward a stale radius snapshot from the previous reject streak.
            prev_rejected_trust_radius = None;
            consecutive_held_rejected_cycles = 0;
            // Accepted-cycle timing breakdown is debug-only. The per-cycle
            // info line below already includes total cycle time; emitting a
            // four-phase split on every verbose cycle adds a redundant info
            // line. Rejected cycles still keep the detailed phase log since
            // the reject reason and per-phase split are the diagnostic.
            log::debug!(
                "[PIRLS/joint-Newton/cycle-summary] cycle={} accepted=true hessian_qp={:.3}s line_search={:.3}s line_search_attempts={} grad_reload={:.3}s total={:.3}s",
                cycle,
                hessian_and_qp_elapsed.as_secs_f64(),
                line_search_elapsed.as_secs_f64(),
                line_search_attempts,
                grad_reload_elapsed.as_secs_f64(),
                cycle_started.elapsed().as_secs_f64(),
            );
            current_log_likelihood = log_likelihood;
            cached_joint_gradient = gradient;
            cached_eval = eval;
            cached_joint_workspace = workspace;
            current_penalty = total_quadratic_penalty(
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            // `current_penalty` / `lastobjective` stay the pure quadratic-penalized
            // objective (NO Φ folded in) — the Firth value is applied per cycle at
            // each β (see `old_objective` above and `trialobjective` below). The
            // gated Φ at the accepted β is captured separately so the convergence
            // `objective_change` compares the augmented objective at the new vs old
            // β consistently (gam#826/#872).
            lastobjective = -current_log_likelihood + current_penalty;
            let new_phi = if !jeffreys_skippable_this_cycle {
                joint_jeffreys_subspace
                    .as_ref()
                    .map(|z_joint| {
                        custom_family_joint_jeffreys_value(family, &states, specs, &ranges, z_joint)
                    })
                    .unwrap_or(0.0)
            } else {
                0.0
            };
            let accepted_step_inf = states
                .iter()
                .zip(old_beta.iter())
                .flat_map(|(state, old)| {
                    state
                        .beta
                        .iter()
                        .zip(old.iter())
                        .map(|(new, old)| (new - old).abs())
                })
                .fold(0.0_f64, f64::max);
            cycles_done = cycle + 1;

            // Check convergence via joint stationarity. When the family-general
            // Firth/Jeffreys term is armed, the penalized objective the inner
            // Newton actually optimizes is `−ℓ + ½βᵀSβ − Φ`, so its KKT
            // stationarity is `∇L − Sβ + ∇Φ = 0`. The Newton STEP already folds
            // `∇Φ` into its RHS (`spectral_rhs += grad_phi`), but the bare
            // `exact_newton_joint_stationarity_*` residual omits it — at the
            // Firth fixed point `∇L − Sβ = −∇Φ`, so the certificate floors at
            // `‖∇Φ‖∞` and never certifies, stalling the inner solve on exactly
            // the near-separating span Firth is meant to bound (the residual the
            // outer REML then rejects). Fold `∇Φ` into the gradient used for the
            // KKT residual so the convergence criterion matches the augmented
            // objective the step descends. No-op when the Jeffreys term is
            // unavailable or condition-gated to zero.
            let Some(gradient) = cached_joint_gradient.as_ref() else {
                break;
            };
            let jeffreys_augmented_gradient: Option<Array1<f64>> = if jeffreys_skippable_this_cycle
            {
                // Well-conditioned ⇒ ∇Φ = 0, so the KKT residual is the bare
                // stationarity (and floors at 0, not ‖∇Φ‖) — matching the step,
                // which folded H_Φ=0/∇Φ=0 this cycle. Avoids the dense H/eigh.
                None
            } else if let Some(z_joint) = joint_jeffreys_subspace.as_ref() {
                match custom_family_joint_jeffreys_term(family, &states, specs, &ranges, z_joint)? {
                    Some((_phi, grad_phi, hphi))
                        if grad_phi.len() == gradient.len()
                            && hphi.nrows() == total_p
                            && hphi.ncols() == total_p =>
                    {
                        let augmented = gradient + &grad_phi;
                        // Cache the exact triple at the just-accepted β so the next
                        // cycle's head reuses it instead of recomputing the
                        // O(p)-directional-derivative + GEMM term (gam#729).
                        let post_beta_key = flatten_state_betas(&states, specs);
                        jeffreys_triple_cache = Some((post_beta_key, grad_phi, hphi));
                        Some(augmented)
                    }
                    _ => None,
                }
            } else {
                None
            };
            let residual_gradient = jeffreys_augmented_gradient.as_ref().unwrap_or(gradient);
            let residual = exact_newton_joint_stationarity_inf_norm_from_gradient(
                residual_gradient,
                &states,
                specs,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                &block_constraints,
                Some(cached_active_sets.as_slice()),
            )?;
            prev_kkt_norm = Some(residual);
            // Record this cycle's KKT residual for the steady-geometric-descent
            // test at the certificate-refusal gate below (gam#787 centers≥20).
            if residual.is_finite() {
                min_certified_residual = min_certified_residual.min(residual);
                residual_descent_history.push_back(residual);
                while residual_descent_history.len() > RESIDUAL_DESCENT_WINDOW {
                    residual_descent_history.pop_front();
                }
            }

            // Scale-aware tolerances. The objective check was already
            // relative (`inner_tol * (1 + |obj|)`), but the step and
            // residual checks were absolute against the bare `inner_tol`
            // — at large scale (n ≈ 320k), β iterates can keep moving
            // by ~1e-5 per cycle along the monotonicity-feasible
            // manifold even after the likelihood has gone flat, and the
            // joint gradient ‖·‖_∞ is O(|obj|), not O(1). Running
            // 50-100 cycles past objective convergence is the
            // dominant inner-PIRLS cost at large scale. Switching to
            // relative scaling (`inner_tol * (1 + ‖β‖_∞)` for steps,
            // `inner_tol * (1 + |obj|)` for the objective change; the
            // gradient-residual tolerance is scaled separately below)
            // exits PIRLS as soon as the optimum is statistically
            // resolved, without loosening behavior at small n where
            // ‖β‖_∞ ≈ 1 and |obj| ≈ 1 give tolerances within 2× of
            // the historical absolute 1e-6.
            let beta_inf = states
                .iter()
                .flat_map(|s| s.beta.iter().copied())
                .map(f64::abs)
                .fold(0.0_f64, f64::max);
            let step_tol = inner_tol * (1.0 + beta_inf);
            let objective_tol = inner_tol * (1.0 + lastobjective.abs());
            // KKT residual tolerance must scale with the natural magnitude of
            // ‖Sβ − ∇L‖∞ (i.e. max(‖∇L‖∞, ‖Sβ‖∞)), not the objective. At
            // large scale with |β|∞ in the 10²–10³ range the gradient and
            // penalty norms can sit orders of magnitude above |obj| and FP
            // noise alone keeps the residual above any obj-scaled tol. The
            // pre-line-search check at the head of the cycle already uses
            // `inner_tol * (1 + max(grad_inf, pen_inf))`; using only grad_inf
            // here created an asymmetry where the same convergence criterion
            // would accept at one site and reject at the other, and on
            // marginal-slope models where Sβ is the larger term it shrank
            // the post-accept tolerance below the achievable FP floor.
            let mut block_gradient_norms = Vec::with_capacity(states.len());
            let mut block_penalty_norms = Vec::with_capacity(states.len());
            for (block_idx, (start, end)) in ranges.iter().copied().enumerate() {
                block_gradient_norms.push(
                    gradient
                        .slice(s![start..end])
                        .iter()
                        .map(|x: &f64| x.abs())
                        .fold(0.0_f64, f64::max),
                );
                let mut penalty_block = s_lambdas[block_idx].dot(&states[block_idx].beta);
                if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                    penalty_block += &states[block_idx].beta.mapv(|v| ridge * v);
                }
                block_penalty_norms.push(
                    penalty_block
                        .iter()
                        .map(|x: &f64| x.abs())
                        .fold(0.0_f64, f64::max),
                );
            }
            let grad_inf = block_gradient_norms.iter().copied().fold(0.0_f64, f64::max);
            let pen_inf = block_penalty_norms.iter().copied().fold(0.0_f64, f64::max);
            // Firth/Jeffreys score magnitude. The convergence residual is the
            // AUGMENTED stationarity `∇L − Sβ + ∇Φ`, so `∇Φ` is a first-class term
            // whose own numerical scale sets the achievable KKT floor: `∇Φ` is a
            // trace `½ tr(H_id⁻¹ Z_Jᵀ Ḣ Z_J)` formed from a FLOORED reduced-info
            // pseudo-inverse, so its components carry O(‖∇Φ‖·ε_floor) round-off
            // that the augmented residual cannot polish below. Scaling the KKT
            // tolerance by `max(grad, pen, ‖∇Φ‖)` (not just grad/pen) makes the
            // certificate reachable for coupled K-block Firth fits whose data
            // gradient is small but whose Firth score is O(1): otherwise the
            // augmented residual plateaus a few × above an unattainably tight
            // `inner_tol·(1+grad)` tol and the solve refuses just short of
            // convergence (gam#729/#715 — the residual stalled at ~8.8e-6 against a
            // ~1e-6 tol). No-op when the term is condition-gated (∇Φ=0).
            let firth_score_inf = head_jeffreys_term
                .as_ref()
                .map(|(grad_phi, _hphi)| grad_phi.iter().map(|v| v.abs()).fold(0.0_f64, f64::max))
                .unwrap_or(0.0);
            let residual_tol = inner_tol * (1.0 + grad_inf.max(pen_inf).max(firth_score_inf));
            // Arm the Jeffreys second-order endgame completion (gam#979) once
            // the residual enters the convergence band; latched (never
            // un-armed) so the endgame model cannot oscillate between the
            // divided-difference and exact Hessians across cycles.
            if residual.is_finite() && residual <= JEFFREYS_COMPLETION_RESIDUAL_BAND * residual_tol
            {
                jeffreys_completion_endgame = true;
            }
            let block_stationarity_tolerances = block_gradient_norms
                .iter()
                .zip(&block_penalty_norms)
                .map(|(grad_norm, penalty_norm)| inner_tol * (1.0 + grad_norm.max(*penalty_norm)))
                .collect::<Vec<_>>();
            // Active-set-projected stationarity residual vector (multiplier
            // mass of every pinned bound row already subtracted). Lifted out of
            // the per-block norm reduction so the constrained-stationary
            // certificate below can also test its component in the *range* of
            // the penalized Hessian (gam#553 penalty-null-space acceptance).
            let projected_residual_vec =
                exact_newton_joint_projected_stationarity_vector_from_gradient(
                    gradient,
                    &states,
                    specs,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    &block_constraints,
                    Some(cached_active_sets.as_slice()),
                )?;
            let block_stationarity_norms = {
                let mut offset = 0usize;
                states
                    .iter()
                    .map(|state| {
                        let start = offset;
                        let end = start + state.beta.len();
                        offset = end;
                        projected_residual_vec
                            .slice(ndarray::s![start..end])
                            .iter()
                            .map(|x: &f64| x.abs())
                            .fold(0.0_f64, f64::max)
                    })
                    .collect::<Vec<_>>()
            };
            // Per-block stationarity must be judged on the IDENTIFIED (range-space)
            // residual, not the raw active-set-projected residual (gam#979). On the
            // survival I-spline time block the unpenalized affine baseline direction
            // is a genuine ker(H_pen) gauge mode: the raw per-block residual keeps
            // the full gradient component along it (the measured ~28 plateau at λ≈1e7
            // that the absolute tol can never reach), so the raw gate falsely rejects
            // a solve that IS stationary on every identifiable direction — the
            // residual mass it sees is the free gauge the outer IFT projects out
            // (gam#553). Use the range-projected per-block residual when a penalty
            // null space exists; fall back to the raw per-block residual when it does
            // not (there range == whole space, so they coincide and the strict gate
            // is unchanged for every well-identified family).
            //
            // PERF (gam#1082): the range projection eigendecomposes the FULL P·M
            // joint penalized Hessian — an O((P·M)³) cost. The two certificates
            // that consume the range-projected gate (the residual-stall and
            // relative-plateau exits below) only fire under rare preconditions
            // (`tr_clamped_during_stall` after a long no-improve streak; a latched
            // objective plateau). Computing the eigh EVERY inner cycle therefore
            // added a redundant O(p³) eigendecomposition per cycle to every
            // penalized family carrying a null space (every tp-smooth model) — the
            // multinomial smooth-by-factor wall-clock regression.
            //
            // The eigh is deferred to `range_projected_block_stationarity_small`
            // (called ONLY inside a certificate branch whose cheap precondition has
            // already passed via short-circuit `&&`), so on the convergence-tail
            // it runs at most once per accepted exit rather than once per cycle.
            let range_projected_block_stationarity_small = || -> bool {
                projected_residual_range_space_per_block_inf(
                    &projected_residual_vec,
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    total_p,
                )
                .unwrap_or_else(|| block_stationarity_norms.clone())
                .iter()
                .zip(&block_stationarity_tolerances)
                .all(|(norm, tol)| {
                    norm.is_finite()
                        && tol.is_finite()
                        && *norm <= RESIDUAL_STALL_BLOCK_GRADIENT_FACTOR * *tol
                })
            };
            // gam#1082 perf: a per-cycle #979 divergence-trace logging block
            // lived here and computed — EVERY inner cycle for the first 40
            // cycles, purely to feed two `log::info!` lines — a FULL O((P·M)³)
            // eigendecomposition (`projected_residual_range_space_inf`), a
            // penalty-matrix min-eigenvalue, and per-penalty quadratic forms.
            // On any penalized family with a penalty null space (every
            // `select=TRUE` double-penalty tp-smooth model, including the
            // multinomial smooth-by-factor fit) the eigh's `nullity > 0` branch
            // actually ran, so each outer REML evaluation paid up to 40
            // redundant O(p³) eigendecompositions inside its inner joint-Newton.
            // That diagnostic instrumentation — not the outer iteration count —
            // was the dominant wall-clock cost (the #1082 overrun the outer
            // rel-cost decouple could not touch, because the cost is
            // per-inner-cycle, not per-outer-iteration). The trace has served
            // its #979 purpose and is removed from the production hot path; every
            // convergence-relevant quantity (`residual`, `block_stationarity_norms`,
            // and the lazily-evaluated range-space gate above) is still computed
            // where the gate actually consumes it.
            let near_convergence = residual <= 10.0 * residual_tol;
            // Augmented-objective change: `(quad(new) − Φ_gated(new)) −
            // (quad(old) − Φ_gated(old))`. `lastobjective` is quadratic-only and
            // `old_objective` already carries `−old_phi`, so subtract the accepted
            // β's `new_phi` here to keep both endpoints on the Φ-augmented merit
            // (gam#826/#872). On a skippable cycle both phis are 0 ⇒ identical to
            // the bare quadratic change.
            let signed_obj_change = (lastobjective - new_phi) - old_objective;
            let objective_change = signed_obj_change.abs();

            // Per-cycle observability for the convergence test. Surfaces
            // WHICH criterion is binding (proposed step, accepted step,
            // residual, objective change) at every iteration so CI logs
            // distinguish "Newton hasn't proposed a small step yet"
            // (algorithm still working) from "step is small but residual
            // won't drop below tol" (tolerance scaling problem). Without
            // this, the only visible signal is the objective itself,
            // which is insufficient to choose the right algorithmic
            // remedy.
            //
            // gam#979 discriminator: the PER-BLOCK projected stationarity
            // breakdown. The aggregate `residual` alone cannot distinguish a
            // genuinely-coupled stall from one block dragging the others — for
            // the survival marginal↔logslope grind the question "is the total
            // residual dominated by a single block (the multiplicative
            // z·exp(logslope) coupling channel), or spread evenly (global
            // conditioning)?" is answerable only from the split.
            // `block_stationarity_norms` (logged as `per_block_resid`) is already
            // computed above for the convergence test, so surfacing it per cycle
            // is free; reading it across a 75 s repro under RUST_LOG=info tells
            // whether the slowdown is a single stuck block
            // (curvature/coupling channel) or an evenly slow descent
            // (conditioning) — without it the four #979 candidates are not
            // separable from the timeline.
            let block_resid_sig = block_stationarity_norms
                .iter()
                .map(|n| format!("{n:.3e}"))
                .collect::<Vec<_>>()
                .join(",");
            log::info!(
                "[PIRLS/joint-Newton convergence] cycle {:>3} | step_inf={:.3e} (tol={:.3e}) | accepted_step_inf={:.3e} | residual={:.3e} (tol={:.3e}) | per_block_resid=[{}] | obj_change={:.3e} (tol={:.3e}) | beta_inf={:.3e}",
                cycle,
                step_inf,
                step_tol,
                accepted_step_inf,
                residual,
                residual_tol,
                block_resid_sig,
                objective_change,
                objective_tol,
                beta_inf,
            );

            // gam#1082 perf: a tightly-gated `#1040 inner-conditioning probe`
            // lived here. Once the inner joint-Newton stalled (residual stuck
            // above tol for `RESIDUAL_STALL_NO_IMPROVE_CYCLES` cycles), it
            // eigendecomposed the FULL P·M penalized Hessian (O((P·M)³)) plus an
            // O(p²) Rayleigh-quotient loop EVERY cycle thereafter, purely to feed
            // one `log::info!`. The gate's whole point is "the solve is
            // grinding" — exactly the regime where it then fires on EVERY one of
            // the remaining (up to `inner_max_cycles`) cycles, turning a stall
            // into an O(p³)-per-cycle crawl (a dominant face of the #1082
            // multinomial wall-clock overrun: the cost is per-stalled-cycle, not
            // per-outer-iteration). The diagnostic is removed from the hot path;
            // the inner solve's own stall handling (trust-region clamp,
            // Newton-decrement and range-space convergence certificates) governs
            // termination, and the cheap per-cycle convergence line above already
            // surfaces residual/step/per-block-residual for observability.

            if verbose_cycle || near_convergence {
                log::info!(
                    "[PIRLS/JN] cyc={:>3}/{} obj={:.6e} -loglik={:.6e} pen={:.3e} Δobj={:+.3e} |δ|∞={:.3e} accepted_|δ|∞={:.3e} resid={:.3e} (tol={:.3e}) obj_tol={:.3e} step_tol={:.3e} |β|∞={:.3e} attempts={} t={:.3}s",
                    cycle,
                    inner_max_cycles,
                    lastobjective,
                    -current_log_likelihood,
                    current_penalty,
                    signed_obj_change,
                    step_inf,
                    accepted_step_inf,
                    residual,
                    residual_tol,
                    objective_tol,
                    step_tol,
                    beta_inf,
                    line_search_attempts,
                    cycle_started.elapsed().as_secs_f64(),
                );
            } else {
                log::info!(
                    "[PIRLS/JN] cyc={:>3}/{} obj={:.6e} Δobj={:+.3e} |δ|∞={:.3e} resid={:.3e} attempts={} t={:.3}s",
                    cycle,
                    inner_max_cycles,
                    lastobjective,
                    signed_obj_change,
                    accepted_step_inf,
                    residual,
                    line_search_attempts,
                    cycle_started.elapsed().as_secs_f64(),
                );
            }

            // Divergence guard: a non-finite KKT residual, objective, or
            // log-likelihood means the inner joint Newton has diverged (NaN
            // mass propagating from a near-unidentified penalized block — the
            // binomial location-scale shared-basis log-σ deviation channel is
            // the canonical trigger, gam#554). Every convergence and
            // residual-stall exit below is gated on finite `<=` comparisons,
            // which a NaN residual silently defeats; left unguarded the loop
            // then grinds the full `inner_loop_hard_ceiling` on every outer
            // ρ-eval and every startup seed, which is the multi-hour "hang".
            // Treat it as immediate non-convergence so the outer optimizer
            // rejects this point cleanly instead of burning the budget.
            if !residual.is_finite()
                || !lastobjective.is_finite()
                || !current_log_likelihood.is_finite()
            {
                log::warn!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | divergence guard: non-finite inner state (residual={:.3e}, objective={:.3e}, -loglik={:.3e}); returning unconverged so the outer optimizer rejects this ρ evaluation instead of running to inner_max_cycles.",
                    cycle,
                    residual,
                    lastobjective,
                    -current_log_likelihood,
                );
                converged = false;
                break;
            }

            // KKT convergence: a small post-step residual is the
            // canonical optimality certificate for the penalized
            // objective. ‖∇L(β) − Sβ‖∞ ≤ residual_tol means the
            // iterate is at a KKT point to numerical precision and
            // further iteration cannot reduce it; the step magnitude
            // is irrelevant once the residual signal has fired.
            //
            // Tying convergence to a small step instead would refuse
            // to recognise quadratic-rate single-shot convergence:
            // exact Newton on an exact quadratic produces one full
            // step that lands at the optimum, so ‖delta‖∞ equals the
            // initial distance ‖β* − β₀‖∞ no matter how exact the
            // model is. Pairing a residual check with a step-size
            // requirement structurally rejects this entirely-correct
            // cycle-0 termination, leaving inner_max_cycles=1 callers
            // unable to certify convergence on a problem that was
            // solved exactly in one Newton step.
            if joint_inner_kkt_converged(residual, residual_tol) {
                converged = true;
                break;
            }
            // Identified-subspace (range-space) KKT certificate.
            //
            // The strict certificate above tests the FULL stationarity residual
            // ‖∇L − Sβ‖∞. On a genuinely rank-deficient penalized inner problem
            // — a degenerate small-n transformation-normal CTM/Box-Cox fit whose
            // joint Hessian carries an *unidentified* direction the
            // canonical-gauge pass cannot attribute to a single block (the same
            // structural null root-caused for the joint-Newton panic at
            // `solve_joint_newton_step_on_spectral_range`) — the stationarity
            // gradient keeps a fixed nonzero component inside ker(H_pen). The
            // spectral Newton step drops exactly that component (range-restricted
            // Moore–Penrose step: every null direction hits the `continue` branch
            // in the accumulation loop), so β converges on the identified
            // subspace and the step exhausts, yet the FULL residual never reaches
            // `residual_tol`. The strict test then runs the whole cycle budget
            // "non-converged" on an iterate that is, in fact, the optimum on the
            // only identifiable directions.
            //
            // The principled certificate is stationarity on range(H_pen): the
            // residual restricted to the curved (identified) subspace is at
            // tolerance while the leftover mass is provably confined to
            // ker(H_pen) — an unidentified direction with neither curvature nor
            // constraint. That null component is dropped by the spectral step
            // here and projected out of the KKT residual by the outer IFT
            // pseudo-inverse `U_S·H_proj⁻¹·U_Sᵀ` before the envelope correction
            // (see the gam#553 note and `projected_residual_range_space_inf`), so
            // it cannot bias the outer gradient.
            //
            // The remaining requirement is to prove we are AT the
            // range-restricted optimum rather than mid-descent, so this does not
            // short-circuit a genuinely nonlinear CTM fit that is still moving β.
            // There are two independent, equally-rigorous proofs of that, and
            // EITHER suffices once `range_residual ≤ residual_tol` has fired:
            //   (a) the full Newton step is exhausted (`step_inf ≤ step_tol`):
            //       the well-identified case, where the range-restricted step
            //       collapses to zero and the leftover ker(H_pen) component is
            //       already dropped by the spectral step, so the FULL step is
            //       small too; OR
            //   (b) the objective has stopped changing
            //       (`objective_change ≤ objective_tol`): the joint objective
            //       (−loglik + ½βᵀSβ) is a function of the IDENTIFIED coordinates
            //       ONLY — moving β along an unidentified direction in ker(H_pen)
            //       = ker(H_L) ∩ ker(S) changes neither the likelihood nor the
            //       penalty by construction — so a flat objective proves no
            //       identified-direction descent remains regardless of how large
            //       the FULL step is.
            // Proof (b) is the certificate that the constant-scale AFT (#736) and
            // the degenerate CTM (#733/#734) need: their unidentified cross-block
            // null (the time_transform polynomial/affine deviation aliased into
            // threshold/log_sigma) keeps the Levenberg-damped, trust-region-clamped
            // FULL step perpetually nonzero — `step_inf` never reaches `step_tol`
            // — even though the identified fit is exactly at its optimum (zero
            // range-space residual, frozen objective). Tying the certificate ONLY
            // to the full step (proof (a)) therefore burned the entire 200/84-cycle
            // budget on an iterate that is already optimal on every identifiable
            // direction, and the inner solve was rejected by the FULL-residual KKT
            // check. Adding proof (b) certifies on the identified subspace without
            // loosening anything for a genuinely-identified fit: there
            // `projected_residual_range_space_inf` returns `None` (nullity == 0 ⇒
            // range == whole space), so this branch is dormant and the strict
            // full-residual path above governs unchanged.
            //
            // Newton-decrement convergence certificate (gam#1040 / gam#1088).
            //
            // The strict / identified-subspace / constrained certificates all
            // gate on the penalized stationarity residual ‖∇L − Sβ‖∞ reaching
            // `residual_tol`. On a weakly-identified (near-flat) carrying block
            // — the survival marginal↔logslope alias, the binomial link-wiggle
            // block, the gaussian/binomial location-scale μ block — that residual
            // can stall ORDERS above tol (`g` is O(1e2) along a direction whose
            // penalized curvature `γ` is tiny) while every step the trust region
            // admits is clamped, so neither the residual nor the step-norm gate
            // ever closes and the loop grinds to the cycle ceiling, the outer
            // REML rejects ρ after ρ, and the fit times out (the #1040/#1088
            // benchmark hangs). Yet the ACHIEVABLE objective improvement is
            // `g²/(2γ)` — the Newton decrement — and on such a direction it is
            // far below `objective_tol`: no step the local quadratic model can
            // resolve lowers the penalized objective by more than `objective_tol`.
            // By the Conn–Gould–Toint stopping criterion (*Trust-Region Methods*,
            // Thm 6.4.6) the iterate is then the penalized optimum to within
            // tolerance, on the entire identifiable subspace — the residual's
            // un-resolvable mass lives on near-null directions the outer IFT
            // pseudo-inverse projects out (gam#553). The decrement is read off
            // the SAME D-whitened seed spectrum the step is built from (range
            // modes only; the null space contributes none), so it is exactly the
            // model decrease of the unconstrained modified-Newton step. A genuine
            // defect (real curvature AND large gradient) yields a LARGE decrement,
            // so this never certifies a non-converged iterate.
            //
            // Precondition (gam#1082): the original gate required the LAST cycle's
            // `objective_change ≤ objective_tol` to "confirm we are AT the plateau,
            // not one big step away." That precondition is the multinomial
            // smooth-by-factor blocker: the coupled-softmax select=TRUE gauge mode
            // is a NEAR-null (weak-but-above-`KKT_REFUSAL_RANK_TOL` curvature), so
            // the iterate keeps DRIFTING along it with a small but nonzero
            // `objective_change` every cycle (exactly the gam#979 survival
            // signature) — `objective_change ≤ objective_tol` never holds, the
            // decrement certificate never fires, and the solve crawls to
            // `inner_max_cycles` paying one ~p³ Newton-step eigh per cycle (the
            // eu-stack-profiled #1082 blow-up). But the decrement bound is itself
            // the correct, curvature-aware stopping test: by Conn–Gould–Toint Thm
            // 6.4.6 `decrement ≤ objective_tol` ALONE certifies the iterate is the
            // penalized optimum to tolerance — no model-resolvable step (gauge
            // drift included) lowers the objective by more than tol. So the
            // objective-flat precondition is replaced by the RESIDUAL-STALL window
            // (`cycles_since_residual_improved ≥ DECREMENT_STALL_WINDOW`): the
            // certificate fires once the raw residual has stopped descending and
            // the decrement confirms no resolvable improvement remains. This reuses
            // the EXACT degeneracy classification the Newton step uses (the
            // decrement skips every `|γ_k| ≤ null_cutoff` mode), so it catches the
            // near-null gauge direction the raw-`H_pen` range projection's absolute
            // `1e-10·λ_max` cutoff misses — without ever accepting a genuinely
            // curved (large-decrement) unconverged iterate. A still-progressing
            // solve never reaches the stall window (its residual keeps improving,
            // resetting the counter).
            const DECREMENT_STALL_WINDOW: usize = 3;
            if cycles_since_residual_improved >= DECREMENT_STALL_WINDOW
                && let Some(decrement) = joint_spectrum
                    .as_ref()
                    .map(|spectrum| spectrum.newton_decrement())
                && decrement.is_finite()
                && decrement <= objective_tol
            {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | Newton-decrement certificate (gam#1040/#1088/#1082): \
                     residual={:.3e} (tol={:.3e}) stalled above tol for {} cycles on a weakly-identified block (last \
                     |Δobjective|={:.3e}, drifting along a near-null gauge mode), but the unconstrained modified-Newton \
                     step's predicted objective decrease (Newton decrement ½gᵀH⁻¹g over identified modes, the SAME \
                     |γ_k|≤null_cutoff degeneracy classification the Newton step uses)={:.3e} ≤ objective_tol={:.3e} \
                     — no model-resolvable step lowers the penalized objective by more than tolerance, so the \
                     iterate is the REML optimum on the identifiable subspace (Conn–Gould–Toint Thm 6.4.6); \
                     the un-resolvable residual mass lies on near-null directions the outer IFT projects out.",
                    cycle,
                    residual,
                    residual_tol,
                    cycles_since_residual_improved,
                    objective_change,
                    decrement,
                    objective_tol,
                );
                // Record the residual this exit certified on so the terminal
                // line reports a finite certified residual (#1040 truthfulness):
                // the converged status is earned by the decrement bound, and the
                // finite stationarity residual at this iterate is the honest
                // certificate witness.
                if residual.is_finite() {
                    min_certified_residual = min_certified_residual.min(residual);
                }
                converged = true;
                break;
            }

            // Gauge-drift identified-subspace KKT certificate (gam#979 large-scale
            // survival-MS stall). The certificate just below requires EITHER
            // `step_inf ≤ step_tol` OR `objective_change ≤ objective_tol` as a
            // precondition before it will even look at the range-projected
            // residual. On the survival I-spline time block at large scale that
            // precondition is a Catch-22: the block carries a genuine ker(H_pen)
            // gauge mode (the unpenalized affine baseline — constant + linear time
            // trend — that the joint design does not gauge-fix out), so the
            // constrained QP keeps taking a small but NONZERO step (`step_inf` ~1e-4,
            // never ≤ step_tol) that drifts the iterate ALONG that near-null
            // direction, and because the direction has a tiny-but-nonzero curvature
            // the merit keeps changing by `objective_change` ~0.3 per cycle (never
            // ≤ objective_tol). Neither precondition is ever met, so the existing
            // exit cannot fire even though the iterate IS already stationary on the
            // entire identifiable subspace — the inner solve grinds to its hard
            // cycle ceiling, the outer rejects the ρ-eval, and the fit times out
            // (the measured cycles 8→20 trace: residual ~1.19e3, step_inf ~1e-4,
            // obj_change ~0.34, beta_inf frozen at 2.269).
            //
            // The honest stationarity test for that regime drops the
            // step/objective precondition and instead demands BOTH range-projected
            // measures be at tolerance simultaneously: the GLOBAL identified-subspace
            // residual `range_residual ≤ RANGE_RESIDUAL_EIGH_FLOOR_FACTOR × residual_tol`
            // (see the tolerance note below) AND every BLOCK's range-projected
            // stationarity small. The range projection drops exactly
            // the ker(H_pen) gauge mass (the same mass the outer IFT pseudo-inverse
            // projects out, gam#553), so when both pass the iterate is the REML
            // optimum on the identifiable subspace by definition — the residual,
            // step, and objective drift that remain live purely in the unidentified
            // null and carry no outer-correctness information. The double
            // (global + per-block) range gate cannot be satisfied by a genuinely
            // non-stationary iterate: a real un-converged identifiable direction
            // shows up in BOTH the global range residual and its block's
            // range-projected component, so this never accepts a non-optimum on the
            // identified subspace (relative to the single-gate exit below it adds
            // the per-block range gate and the residual-stall window, and drops only
            // the gauge-defeated step/objective precondition).
            //
            // Cheap precondition gating the O((P·M)³) range-projection eigh
            // (gam#1082 perf discipline): only attempt this certificate once the
            // raw residual has stopped improving for a few consecutive cycles —
            // i.e. the iterate is no longer making descent progress in the
            // identifiable subspace and the remaining motion is the gauge drift
            // this exit exists to certify through. A healthy, fast-converging fit
            // never trips this window (it converges via the strict residual / step
            // certificates first), so it pays zero extra eigendecompositions; only
            // a genuinely stalled solve reaches it, where one eigh per cycle on the
            // short convergence tail is negligible against the alternative of
            // grinding to the hard cycle ceiling.
            //
            // Tolerance (gam#1082): the range-projected residual is read off an
            // O((P·M)³) eigendecomposition of the joint penalized Hessian and
            // reconstructed by summing the residual's coordinates along every
            // range-space eigenvector. On a large joint design (the multinomial
            // smooth-by-factor fit is ~382-dim, K−1 coupled blocks × one global
            // smooth + one smooth per group level, each select=TRUE with a
            // wiggliness AND a null-space-shrinkage penalty) that reconstruction
            // carries O(p·ε·‖r‖) round-off, so the identified-subspace residual
            // FLOORS a few × above the (already tiny, `inner_tol·(1+…)`-scaled)
            // `residual_tol`. Demanding the strict 1× `residual_tol` here is
            // therefore unreachable for a genuinely range-stationary iterate whose
            // remaining mass is pure ker(H_pen) gauge drift — exactly the
            // multinomial regime, where the gauge mode keeps the objective drifting
            // (so the sibling obj-plateau / step-or-obj exits below never fire) and
            // the inner solve grinds to `inner_max_cycles`, each cycle paying one
            // full Newton-step eigh (the #1082 wall-clock blow-up the profile
            // pins on `WhitenedHessianSpectrum::decompose`). The honest, mature
            // identified-subspace tolerance is the SAME `4×residual_tol` the
            // relative-objective-plateau gauge exit below already uses to certify
            // the identical mathematical condition (range-space stationarity); the
            // 1× here was an unjustified asymmetry below the eigh-reconstruction
            // floor. The gauge-drift precondition (raw residual stalled ≥ window
            // AND per-block range-stationarity) is strictly stronger than the
            // plateau exit's, so widening the global range tolerance to match it
            // cannot accept a non-optimum on the identified subspace.
            const GAUGE_DRIFT_STALL_WINDOW: usize = 3;
            const RANGE_RESIDUAL_EIGH_FLOOR_FACTOR: f64 = 4.0;
            if cycles_since_residual_improved >= GAUGE_DRIFT_STALL_WINDOW
                && let Some(range_residual) = projected_residual_range_space_inf(
                    &projected_residual_vec,
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    total_p,
                )
                && range_residual <= RANGE_RESIDUAL_EIGH_FLOOR_FACTOR * residual_tol
                && range_projected_block_stationarity_small()
            {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | gauge-drift identified-subspace KKT certificate (gam#979): total residual={:.3e} > tol={:.3e}, step_inf={:.3e} (step_tol={:.3e}) and |Δobjective|={:.3e} (obj_tol={:.3e}) both still nonzero from drift along an unidentified ker(H_pen) gauge mode (an unpenalized baseline/gauge direction the joint design does not fix out), but the range-space (identified-subspace) residual={:.3e} ≤ {:.3e} (= {}×tol, the eigh-reconstruction floor) AND every block's range-projected stationarity is at tolerance — the iterate is stationary on the entire identifiable subspace; the remaining residual/step/objective drift lives purely in the gauge null the outer IFT projects out (gam#553).",
                    cycle,
                    residual,
                    residual_tol,
                    step_inf,
                    step_tol,
                    objective_change,
                    objective_tol,
                    range_residual,
                    RANGE_RESIDUAL_EIGH_FLOOR_FACTOR * residual_tol,
                    RANGE_RESIDUAL_EIGH_FLOOR_FACTOR,
                );
                if range_residual.is_finite() {
                    min_certified_residual = min_certified_residual.min(range_residual);
                }
                converged = true;
                break;
            }

            // Identified-subspace KKT certificate.
            //
            // Unlike the constrained-stationary path below, this exit fires on a
            // pure identifiability null without requiring the `linearized_rel ≥ 0.5`
            // constraint-multiplier signature, which a structural rank-deficiency
            // need not produce.
            if (step_inf <= step_tol || objective_change <= objective_tol)
                && let Some(range_residual) = projected_residual_range_space_inf(
                    &projected_residual_vec,
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    total_p,
                )
                && range_residual <= residual_tol
            {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | identified-subspace KKT certificate: total residual={:.3e} > tol={:.3e} but its range-space (identified-subspace) component={:.3e} ≤ tol={:.3e}, step_inf={:.3e} (step_tol={:.3e}), |Δobjective|={:.3e} (obj_tol={:.3e}); the leftover residual lies in the unidentified penalized-Hessian null space ker(H_pen) (dropped by the range-restricted spectral step and projected out by the outer IFT pseudo-inverse) — the iterate is stationary on the entire identifiable subspace (proof: {}).",
                    cycle,
                    residual,
                    residual_tol,
                    range_residual,
                    residual_tol,
                    step_inf,
                    step_tol,
                    objective_change,
                    objective_tol,
                    if step_inf <= step_tol {
                        "full Newton step exhausted"
                    } else {
                        "objective frozen on the identified subspace while the unidentified null keeps the full step nonzero"
                    },
                );
                converged = true;
                break;
            }
            // Noise-floor KKT certificate.
            //
            // Reading the joint stationarity residual ‖∇L(β) − Sβ‖_∞ at finite
            // precision picks up rounding mass from the X'WX assembly and the
            // per-block penalty contraction. For well-conditioned problems
            // that floor sits well below `residual_tol`, so the strict path
            // fires and this branch is dormant. For tightly converged inner
            // states where the Newton iterate is already at the analytic
            // optimum but every additional step changes the objective by less
            // than `objective_tol` and the recomputed residual lands just
            // above `residual_tol` due to arithmetic noise, the strict path
            // alone refuses to certify convergence — even though no further
            // useful descent direction exists. Burning hundreds of identical
            // descent cycles past that point neither tightens the inner
            // optimum (the noise floor sets a hard lower bound on ‖rhs‖) nor
            // gives the outer optimizer more hyperparameter information; it
            // just causes the outer wrapper to reject every seed as
            // "inner did not converge" and downstream callers to mark the
            // analytic outer Hessian as unavailable.
            //
            // Combining two independent post-step signals — objective change
            // within scale-aware tolerance AND residual within the same KKT
            // tolerance — supplies the missing certificate without weakening
            // the envelope-theorem requirement. A residual above tolerance
            // can be a free Hessian-null gradient component, not an active
            // multiplier, so it must not be accepted by an objective-flatness
            // rule.
            //
            // Distinct from the strict path because the strict path is silent
            // on objective change;
            // distinct from the trust-region floor certificate at the head
            // of the cycle because that one fires only when the trust radius
            // has collapsed to its 1e-12 floor with all attempts rejected,
            // whereas this branch fires when the trust region is still open
            // but each accepted step is no longer producing detectable
            // objective progress.
            let objective_change = signed_obj_change.abs();
            if objective_change.is_finite() {
                geometric_tail_history.push_back(objective_change);
                while geometric_tail_history.len() > GEOMETRIC_TAIL_WINDOW {
                    geometric_tail_history.pop_front();
                }
            }
            if objective_change <= objective_tol && residual <= residual_tol {
                log::info!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | noise-floor KKT certificate: residual={:.3e} <= tol={:.3e}, |Δobjective|={:.3e} <= obj_tol={:.3e}",
                    cycle,
                    residual,
                    residual_tol,
                    objective_change,
                    objective_tol,
                );
                converged = true;
                break;
            }

            // Constrained-stationary certificate.
            //
            // The inner Newton system is `Hδ = -g`, solved over the
            // active-constraint-aware subspace (the QP step path).  When
            // the *unprojected* gradient `g` carries a large Lagrange-
            // multiplier component pointing into the constraint —
            // i.e. some β coordinates are pinned at the bound or against
            // the family's structural constraint surface — the linear
            // solve correctly DOES NOT try to eliminate that component,
            // because doing so would push β infeasibly.  The signature of
            // this state is precise and entirely local to the most recent
            // accepted step:
            //
            //   • `‖g + Hδ‖∞ / ‖g‖∞ ≥ 0.5` — the linear solve neutralised
            //     ≤ 50 % of g; the remainder is structurally outside the
            //     solver's range, i.e. it's a Lagrange multiplier of the
            //     active constraints, not a defect of the linear solve.
            //   • `|actual − pred| / max(|pred|, …) ≤ 1e-3` — the local
            //     quadratic Newton model agrees with the actual objective
            //     change to roundoff, so the Hessian and gradient are
            //     correct AT this β.  The "stuck" residual is not noise
            //     in the linearisation; it's a real multiplier.
            //   • `|Δobjective| ≤ objective_tol` — the objective has
            //     ceased moving meaningfully.
            //   • `|δ|∞ ≤ step_tol` — the accepted feasible Newton step is
            //     exhausted. Objective flatness alone is not a terminal
            //     signal on large survival fits: a step of O(1e-2..1e-1)
            //     can still continue reducing the KKT residual after the
            //     objective first crosses tolerance.
            //
            // Together these four are the rigorous certificate that
            // Newton has reached a constrained-stationary point: further
            // cycles would reproduce the same plateau (the diagnostic in
            // PIRLS/JN/math shows `‖g+Hδ‖/‖g‖` constant near 1 cycle
            // after cycle, the very signature this certificate names).
            //
            // The 0.5 threshold on `linearized_rel` is conservative —
            // an unconstrained Newton step has `linearized_rel ≈ 1e-12`;
            // a step deliberately constrained to a (k-1)-dim subspace
            // leaves the orthogonal Lagrange direction in the residual
            // and `linearized_rel ≈ |λ|/|g| > 0`, typically 0.9+ in
            // practice when the multiplier dominates.  Anything ≥ 0.5
            // is unambiguously in the constrained-stationary regime;
            // unconstrained Newton with `linearized_rel ≥ 0.5` would
            // have already failed the trust-region's scalar model test
            // and been rejected upstream.
            if let Some(math) = last_joint_math.as_ref() {
                let linearized_rel = math.linearized_rel();
                let scalar_model_relerr = math.scalar_model_relative_error();
                let geometric_tail_bound = if geometric_tail_history.len() == GEOMETRIC_TAIL_WINDOW
                {
                    let values = geometric_tail_history.iter().copied().collect::<Vec<_>>();
                    let mut max_ratio = 0.0_f64;
                    let mut valid = true;
                    for pair in values.windows(2) {
                        let prev = pair[0];
                        let next = pair[1];
                        if prev <= 0.0 || next < 0.0 || !prev.is_finite() || !next.is_finite() {
                            valid = false;
                            break;
                        }
                        let ratio = next / prev;
                        if !ratio.is_finite() || ratio >= 1.0 {
                            valid = false;
                            break;
                        }
                        max_ratio = max_ratio.max(ratio);
                    }
                    if valid {
                        Some(objective_change / (1.0 - max_ratio).max(1.0e-12))
                    } else {
                        None
                    }
                } else {
                    None
                };
                let certificate_decision = constrained_stationary_certificate_decision(
                    math,
                    objective_change,
                    objective_tol,
                    step_tol,
                    geometric_tail_bound,
                    residual,
                    residual_tol,
                );
                if !matches!(
                    certificate_decision,
                    ConstrainedStationaryCertificate::NotCandidate
                ) {
                    // The `linearized_rel >= 0.5` signal is necessary but not
                    // sufficient. It proves either (a) g carries a Lagrange
                    // multiplier of an active constraint that the QP's active
                    // set already represents — in which case the *projected*
                    // residual is at tolerance — or (b) H is rank-deficient
                    // in the direction of g, so Hδ ≈ 0 along the null
                    // direction regardless of whether g is a multiplier or a
                    // real defect. Case (b) is the survival marginal-slope
                    // pathology at large scale: H σ_min ≈ 1e-12 and Newton
                    // genuinely cannot move g, but the residual is NOT a
                    // captured multiplier — it's an unresolved KKT defect in
                    // the H-null subspace.
                    //
                    // The projected residual computed at the top of this
                    // block already subtracts the multiplier mass of every
                    // row in `cached_active_sets`. If that
                    // residual is at tolerance, case (a) holds and the
                    // certificate is honest. If it's still orders of
                    // magnitude above tolerance, case (b) holds: certifying
                    // here would hand the unified evaluator a
                    // `kkt_residual` with norm ≈ ‖g‖ which then gets
                    // amplified by H⁻¹_proj in the cost/gradient IFT
                    // corrections, contaminating the envelope formula and
                    // triggering the "envelope-gradient consistency"
                    // tripwire downstream. Bail with `converged = false` so
                    // the outer optimizer rejects this ρ cleanly, exactly
                    // as it would on any other non-converged inner exit.
                    let cert_residual_factor = 1.0;
                    if matches!(
                        certificate_decision,
                        ConstrainedStationaryCertificate::Accept
                    ) {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | constrained-stationary certificate: \
                             linear-solve neutralised {:.1}% of g (the remaining {:.1}% is a Lagrange multiplier \
                             of the active constraint set, not an unresolved gradient); \
                             scalar Newton model agrees with reality to relerr={:.3e} (Hessian+gradient are correct \
                             at this β); projected residual={:.3e} ≤ {:.1}×tol={:.3e} (multipliers captured by active set); \
                             |Δobjective|={:.3e}, geometric_tail_bound={:.3e}, obj_tol={:.3e}; further cycles cannot reduce the \
                             multiplier mass and would reproduce this plateau indefinitely; \
                             active-set multiplier mass will be projected out of the KKT residual \
                             before the outer IFT correction is assembled",
                            cycle,
                            (1.0 - linearized_rel) * 100.0,
                            linearized_rel * 100.0,
                            scalar_model_relerr,
                            residual,
                            cert_residual_factor,
                            cert_residual_factor * residual_tol,
                            objective_change,
                            geometric_tail_bound.unwrap_or(objective_change),
                            objective_tol,
                        );
                        converged = true;
                        break;
                    }
                    // Penalty-null-space acceptance (gam#553). The phantom-
                    // multiplier refusal fires when the active-set-projected
                    // residual is above tolerance, but that residual can be
                    // confined to `ker(H_pen)` — the polynomial null space of a
                    // penalized smooth (TP / Bernstein trend) that the censored
                    // location-scale / custom-family data does not pin down in
                    // the time_transform / log_sigma channel. Along that
                    // direction there is neither curvature nor a constraint, so
                    // it is a genuinely free gauge direction and the iterate is
                    // stationary on the entire identifiable (range) subspace.
                    // The downstream outer IFT trace removes exactly this
                    // null-space component via the projected pseudo-inverse, so
                    // only a *range-space* residual biases the envelope gradient
                    // (the precise concern of the "do NOT soft-accept" note
                    // below). Accept iff the range-space residual is at
                    // tolerance — preserving outer-gradient correctness while no
                    // longer aborting a well-posed fit on a data-unconstrained
                    // null direction.
                    if let Some(range_residual) = projected_residual_range_space_inf(
                        &projected_residual_vec,
                        &joint_hessian_source,
                        &ranges,
                        &s_lambdas,
                        ridge,
                        options.ridge_policy,
                        total_p,
                    ) && range_residual <= cert_residual_factor * residual_tol
                    {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | penalty-null-space certificate (gam#553): \
                             total projected residual={:.3e} > tol={:.3e} but its range-space (curved-subspace) \
                             component={:.3e} ≤ {:.1}×tol={:.3e}; the remaining residual lies in the data-unconstrained \
                             penalty null space ker(H_pen) (a free polynomial-trend gauge direction, not a defect) and is \
                             projected out of the KKT residual by the outer IFT pseudo-inverse before the envelope \
                             correction; |Δobjective|={:.3e}, obj_tol={:.3e}",
                            cycle,
                            residual,
                            cert_residual_factor * residual_tol,
                            range_residual,
                            cert_residual_factor,
                            cert_residual_factor * residual_tol,
                            objective_change,
                            objective_tol,
                        );
                        converged = true;
                        break;
                    }
                    // Constrained exact-fixed-point acceptance (gam#797).
                    //
                    // We reach here only with the iterate ALREADY proven stationary
                    // (objective + step exhausted, `linearized_rel >= 0.5` so the
                    // residual is multiplier/null mass, `scalar_relerr <= 1e-3` so
                    // the quadratic model is exact), the strict/range-space/noise
                    // certificates having declined. For a CONSTRAINED block the
                    // remaining residual can be a genuine active-constraint Lagrange
                    // multiplier that the active-set QP under-identified (it reports
                    // only rows it drove tight during a non-degenerate step, so a
                    // monotone derivative-guard row tight at the optimum but never
                    // explicitly stepped is missing), leaving the cone projection
                    // unable to decompose `r = A_activeᵀ λ` and the residual stuck
                    // far above tol on an iterate that is EXACTLY the constrained
                    // optimum (the `active_set_incomplete` refusal; gam#797 survival
                    // marginal/logslope/time blocks).
                    //
                    // When (a) the joint Newton has reached a numerical FIXED POINT
                    // — the accepted step and objective change are both at the
                    // machine-epsilon floor relative to the iterate, so no further
                    // progress is mathematically possible — (b) the local quadratic
                    // model is exact (`scalar_relerr` tiny), and (c) the design
                    // carries linear inequality constraints AND `H_pen` has NO
                    // numerical null space (so the residual is an active-constraint
                    // multiplier, NOT an H-null/rank-deficient defect, which the
                    // range-space certificate above already handles), the iterate is
                    // a bona fide constrained KKT point. The active-constraint
                    // multiplier mass is projected out of the KKT residual by the
                    // unified evaluator's active-constraint-aware IFT correction
                    // before the envelope gradient, exactly as for an explicitly
                    // captured multiplier, so certifying here is correct. Gated
                    // strictly on a fixed point with no H-null, so a genuinely
                    // non-converged or rank-deficient iterate is never accepted.
                    let any_block_constrained = block_constraints.iter().any(|c| c.is_some());
                    let beta_scale = states
                        .iter()
                        .flat_map(|s| s.beta.iter().copied())
                        .map(f64::abs)
                        .fold(0.0_f64, f64::max)
                        .max(1.0);
                    let fixed_point_floor = 64.0 * f64::EPSILON * beta_scale;
                    let objective_floor = 64.0 * f64::EPSILON * (1.0 + lastobjective.abs());
                    let at_numerical_fixed_point = accepted_step_inf.is_finite()
                        && accepted_step_inf <= fixed_point_floor
                        && objective_change <= objective_floor
                        && scalar_model_relerr <= 1e-3;
                    if any_block_constrained && at_numerical_fixed_point {
                        // Materialize H_pen = H + S(λ) (+ model ridge) and count its
                        // numerical null space at the shared rank tolerance: nullity == 0
                        // ⇒ the stuck residual is NOT an H-null/rank-deficient defect
                        // (that case is handled by the range-space certificate above) but
                        // a genuine active-constraint multiplier.
                        let hpen_nullity = materialize_joint_hessian_source(
                            &joint_hessian_source,
                            total_p,
                            "constrained fixed-point nullity check",
                        )
                        .ok()
                        .map(|mut h_pen| {
                            let model_diagonal_ridge =
                                if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                                    ridge
                                } else {
                                    0.0
                                };
                            add_joint_penalty_to_matrix(
                                &mut h_pen,
                                &ranges,
                                &s_lambdas,
                                model_diagonal_ridge,
                                None,
                            );
                            symmetrize_dense_in_place(&mut h_pen);
                            symmetric_penalized_hessian_nullity(&h_pen)
                        })
                        .unwrap_or(None);
                        if hpen_nullity == Some(0) {
                            log::info!(
                                "[PIRLS/joint-Newton convergence] cycle {:>3} | constrained fixed-point certificate:                                  accepted_step_inf={:.3e} ≤ {:.3e} and |Δobjective|={:.3e} ≤ {:.3e} (numerical fixed point),                                  scalar_relerr={:.3e}, linearized_rel={:.3e}; H_pen has no numerical null space so the                                  residual={:.3e} is an active-constraint Lagrange multiplier (the QP under-identified the                                  binding rows), projected out of the KKT residual by the active-constraint-aware IFT                                  correction before the envelope gradient — the iterate is a constrained KKT point",
                                cycle,
                                accepted_step_inf,
                                fixed_point_floor,
                                objective_change,
                                objective_floor,
                                scalar_model_relerr,
                                linearized_rel,
                                residual,
                            );
                            converged = true;
                            break;
                        }
                    }
                    // Still-converging guard (gam#787 duchon centers≥20). The
                    // certificates above all declined, so the iterate would be
                    // refused as a multiplier/null plateau. But the
                    // `linearized_rel ≥ 0.5` + flat-objective signature that
                    // routed us here ALSO holds for a logslope block whose
                    // objective is already at its Φ-bounded floor while the KKT
                    // residual is still polishing by a STEADY geometric factor
                    // each cycle. Refusing there rejects the seed a few cycles
                    // short of `residual_tol` (→ outer seed-rejection → raise).
                    // If the residual is in steady geometric descent over the
                    // recent window, the direction is genuinely converging, not
                    // plateaued: keep iterating (bounded by the inner cycle cap)
                    // rather than refuse. The genuine plateau (flat/oscillating
                    // residual above tol) fails this test and refuses as before.
                    if residual_in_steady_geometric_descent(&residual_descent_history) {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | certificate declined but residual in steady geometric descent (history={:?}, residual={:.3e}, tol={:.3e}); continuing to convergence rather than refusing as a plateau",
                            cycle,
                            residual_descent_history,
                            residual,
                            residual_tol,
                        );
                        continue;
                    }
                    // EARLY-CYCLE CARVE-OUT (gam#826/#872). The phantom-multiplier
                    // refusal asserts that the residual is a captured Lagrange
                    // multiplier / H-null mass that Newton genuinely cannot move —
                    // a claim that requires EVIDENCE of a plateau. The candidate
                    // conditions above (objective + step exhausted, linearized_rel ≥
                    // 0.5) are ALSO satisfied transiently when a single Newton step
                    // is small because the augmented (Firth) curvature `H_Φ` is
                    // legitimately large in the `∇Φ` direction at an oversmoothed
                    // cycle-0 seed: the step `(H+Sλ+H_Φ)⁻¹(∇L−Sβ+∇Φ)` is tiny (high
                    // curvature ⇒ short step) and ONE step undershoots the
                    // nonquadratic Firth optimum, so `step_inf` and `|Δobj|` look
                    // exhausted while the residual is still O(‖∇Φ‖) ≫ tol. Refusing
                    // there at cycle 0 (no descent history yet) aborts the coupled
                    // binomial location-scale / flexible-linkwiggle fit before the
                    // inner has taken the handful of cycles it needs to walk the
                    // curved Firth basin to its optimum. When the residual is still
                    // ORDERS above tol and we lack a full descent window to prove a
                    // genuine plateau, keep iterating — the inner cycle cap and the
                    // residual-stall / trust-region-floor guards still bound the
                    // loop and diagnose a true non-convergence. A genuine multiplier
                    // plateau (residual flat across the window) is caught once the
                    // history fills, exactly as before. The threshold is the same
                    // `RESIDUAL_DESCENT_WINDOW` the descent test uses, so this only
                    // defers the refusal until there is enough history to make it,
                    // never weakens it.
                    let residual_far_above_tol = residual.is_finite()
                        && residual_tol.is_finite()
                        && residual > cert_residual_factor * residual_tol;
                    if residual_far_above_tol
                        && residual_descent_history.len() < RESIDUAL_DESCENT_WINDOW
                    {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | constrained-stationary refusal DEFERRED: residual={:.3e} ≫ tol={:.3e} but only {} descent samples (< {} window) — too early to prove a multiplier/null plateau vs a high-curvature Firth-basin transient; continuing",
                            cycle,
                            residual,
                            residual_tol,
                            residual_descent_history.len(),
                            RESIDUAL_DESCENT_WINDOW,
                        );
                        continue;
                    }
                    // UNCONSTRAINED MODEL-STATIONARY ACCEPTANCE (gam#826/#808/#715).
                    //
                    // The phantom-multiplier refusal asserts the residual is a
                    // captured Lagrange multiplier of an active constraint that
                    // the QP could not decompose. That diagnosis is categorically
                    // IMPOSSIBLE when there is no active constraint at all: a
                    // residual cannot be a phantom multiplier of a constraint that
                    // does not exist. For a fully UNCONSTRAINED coupled fit
                    // (multinomial softmax; the location-scale flat blocks) on a
                    // near-flat Fisher surface (`diag(p)−ppᵀ → 0`, or the
                    // high-curvature/low-curvature `log_sigma` block) the
                    // Firth-augmented stationarity residual `‖∇L−Sβ+∇Φ‖` floors
                    // LEGITIMATELY above `4·residual_tol`: the absolute curvature
                    // is tiny so `residual_tol = inner_tol·(1+grad/pen/firth)` is
                    // tiny too, yet the Newton/dogleg step exhausts before the
                    // residual drops below that band — `residual_tol` is scaled by
                    // the gradient magnitude and does not see the flat-Fisher
                    // absolute-curvature floor. The well-conditioned spectrum keeps
                    // the conditioning-keyed Levenberg gate (`COND_NEWTON_SAFETY`)
                    // off, so neither LM nor the cond-armed dogleg engages, and
                    // every seed is refused as `phantom_multiplier_with_well_
                    // conditioned_H`.
                    //
                    // When the model itself certifies stationarity — the standard
                    // trust-region "predicted decrease ≈ 0" criterion, here the
                    // certificate-candidate conditions that routed us into this
                    // block (step at the `step_tol` floor, |Δobj| exhausted, scalar
                    // model exact to relerr ≤ 1e-3; the stricter machine-eps
                    // `at_numerical_fixed_point` flag is deliberately NOT tested by
                    // this branch) — AND no further progress is being
                    // made (the steady-geometric-descent test above declined) AND
                    // we have a full descent window (the early-cycle deferral above
                    // passed, so this is a proven plateau not a Firth-basin
                    // transient), an unconstrained iterate is a bona fide
                    // first-order optimum: the quadratic model says no step can
                    // reduce the residual further, and there is no constraint whose
                    // multiplier the residual could otherwise represent. The
                    // residual that remains lives where the model is flat
                    // (vanishing curvature), so it carries no `gᵀ∂β/∂ρ` envelope
                    // contribution the outer IFT could not already neutralise
                    // through its penalty-projected pseudo-inverse. Accept.
                    //
                    // This does NOT regress #729 (coupled Dirichlet): that fit
                    // converges to a genuine `residual < residual_tol` and exits
                    // via the strict KKT certificate long before this branch, and
                    // even if reached it has a curved (non-flat) Fisher surface so
                    // its model is not at a fixed point with a residual stuck above
                    // tol. It does NOT mask a real non-convergence: a still-moving
                    // iterate fails the certificate-candidate conditions that gate
                    // this block (its step / |Δobj| are above the `step_tol` /
                    // objective-exhaustion floors), and a rank-deficient H-null
                    // defect is the CONSTRAINED concern the fixed-point certificate
                    // above already handles via its nullity check.
                    // The certificate-candidate conditions that routed us into
                    // this block already PROVE model stationarity for the
                    // unconstrained case: `objective_exhausted` + `step_inf ≤
                    // step_tol` (the model's minimizer is at this β), `scalar_relerr
                    // ≤ 1e-3` (the quadratic model is exact), and `linearized_rel ≥
                    // 0.5` (‖g+Hδ‖ ≈ ‖g‖, so `Hδ ≈ 0` — the residual lives in the
                    // flat/near-null subspace of H, exactly a flat-Fisher direction
                    // for an unconstrained fit). We do NOT additionally require the
                    // far stricter machine-eps `at_numerical_fixed_point` here: on a
                    // flat Fisher surface the dogleg keeps taking a small step at
                    // the `step_tol` floor every cycle, so `accepted_step_inf` floors
                    // a hair above `64·eps·|β|` and the eps-fixed-point flag never
                    // sets even though the model is stationary. The `step_tol` floor
                    // (`inner_tol·(1+|β|∞)`) is the principled stationarity gate; the
                    // eps floor is for the constrained-multiplier certificate, where
                    // a tighter proof is warranted because a wrong accept biases the
                    // constraint-aware IFT kernel.
                    let any_active_set_rows = cached_active_sets
                        .iter()
                        .any(|maybe| maybe.as_ref().is_some_and(|rows| !rows.is_empty()));
                    let unconstrained_fit = !any_block_constrained && !any_active_set_rows;
                    if unconstrained_fit {
                        log::info!(
                            "[PIRLS/joint-Newton convergence] cycle {:>3} | unconstrained model-stationary certificate (gam#826/#808/#715): \
                             no active constraint (active_set_rows_total=0) so the residual={:.3e} cannot be a phantom multiplier; \
                             the iterate is a numerical fixed point (accepted_step_inf={:.3e}, |Δobjective|={:.3e}, scalar_relerr={:.3e}) \
                             on a flat Fisher surface where residual_tol={:.3e} sits below the absolute-curvature floor; \
                             linearized_rel={:.3e}, |Δobjective| exhausted and residual not in steady descent → genuine first-order optimum, accepting",
                            cycle,
                            residual,
                            accepted_step_inf,
                            objective_change,
                            scalar_model_relerr,
                            residual_tol,
                            linearized_rel,
                        );
                        converged = true;
                        break;
                    }
                    // Structured per-block + per-spectrum refusal report.
                    // The legacy one-line refusal log printed only aggregate
                    // numbers (linearized_rel, scalar_relerr, residual,
                    // |Δobj|) and was not actionable on models with many
                    // blocks: it could not identify WHICH smooth carried
                    // the unresolved mass, nor whether H_pen was genuinely
                    // rank-deficient (the "polynomial null space slipped
                    // past absorption" pathology). Cost: one dense
                    // materialize + symmetric eigh on H_pen at this β,
                    // sub-millisecond for typical p, executed once per
                    // refusal (the loop breaks immediately after).
                    let report = compute_kkt_refusal_report(
                        cycle,
                        &states,
                        specs,
                        &s_lambdas,
                        &ranges,
                        cached_joint_gradient.as_ref(),
                        &cached_active_sets,
                        &block_constraints,
                        Some(&joint_hessian_source),
                        total_p,
                        ridge,
                        options.ridge_policy,
                        accepted_step_inf,
                        step_inf,
                        joint_trust_radius,
                        residual_tol,
                        objective_tol,
                        step_tol,
                        objective_change,
                        residual,
                        Some(&math),
                    );
                    log::warn!(
                        "{}",
                        report.format_structured_log(cert_residual_factor * residual_tol)
                    );
                    last_kkt_refusal_report = Some(report);
                    converged = false;
                    break;
                }
            }

            // INVESTIGATION NOTE — do NOT soft-accept here.
            //
            // The outer objective is V(ρ) = f(β*(ρ), ρ), where β*(ρ)
            // satisfies g(β*,ρ)=∇_β f=0.  The envelope/IFT gradient used
            // by the outer optimizer is
            //
            //   dV/dρ_j = ∂f/∂ρ_j
            //
            // only at g=0.  At a non-stationary β, the actual chain rule is
            //
            //   d f(β(ρ),ρ)/dρ_j = ∂f/∂ρ_j + gᵀ ∂β/∂ρ_j.
            //
            // A soft certificate based only on small Δf discards the second
            // term without proving it is small.  The projected pseudo-inverse
            // in the outer trace path removes null-space components of g, but
            // any range-space component still contributes gᵀ∂β/∂ρ and gives
            // ARC/BFGS a biased outer gradient.  The `[PIRLS/JN/math]` line
            // above now prints the actual Newton identity:
            //
            //   old_kkt = ‖g‖∞,
            //   linearized_next = ‖g + Hδ‖∞ = ‖Hδ-rhs‖∞,
            //   new_kkt = ‖g(β+δ)‖∞,
            //   scalar_model relerr = |actual-pred|/max(1,|pred|).
            //
            // That is the proof surface. The diagnostic reports the measured
            // linear solve residual, post-step KKT residual, scalar model
            // error, and step sizes directly; downstream analysis should use
            // those numbers rather than this solver attaching labels.

            // Residual-stall early-exit. The strict and noise-floor
            // certificates above require the KKT residual to land within
            // a small multiple of residual_tol. On survival marginal-slope
            // at large scale the residual oscillates in a band that is
            // orders of magnitude above tol without trending down while
            // the unconstrained proposal has |prop|∞ in the 10³–10⁶ range,
            // the TR clamps it, and each clamped step moves β by O(1)
            // without driving ‖∇L − Sβ‖∞ closer to KKT.
            //
            // Spending the remaining cycle budget on this pattern hits
            // inner_max_cycles "non-converged", which then routes the
            // outer optimizer through the first-order bridge with a stale
            // same-ρ inner mode and a gradient of magnitude 10⁷ that kills
            // BFGS line search at iter 0 (the failure mode pinned in the
            // commit messages of 6578e884 and 1c181d1f).
            //
            // Track the best residual seen so far and the number of
            // cycles since any meaningful improvement (≥ 10 % drop). Once
            // the inner has burned at least RESIDUAL_STALL_MIN_CYCLES
            // without progress, the accepted step kept hitting the
            // trust-region clamp, AND every block is already inside a
            // loose stationarity band, return `converged = false` with
            // the current finite β. The per-block gate is essential for
            // block-metric trust regions: an aggregate residual plateau
            // dominated by one near-singular block must not hide an
            // unresolved marginal block that can still make progress under
            // its own radius.
            if residual.is_finite() {
                if residual < RESIDUAL_STALL_IMPROVEMENT_FACTOR * best_residual_seen {
                    best_residual_seen = residual;
                    cycles_since_residual_improved = 0;
                    tr_clamped_during_stall = false;
                } else {
                    cycles_since_residual_improved =
                        cycles_since_residual_improved.saturating_add(1);
                    if last_accepted_hit_joint_trust_boundary {
                        tr_clamped_during_stall = true;
                    }
                }
            }
            if cycle + 1 >= RESIDUAL_STALL_MIN_CYCLES
                && cycles_since_residual_improved >= RESIDUAL_STALL_NO_IMPROVE_CYCLES
                && tr_clamped_during_stall
                && range_projected_block_stationarity_small()
            {
                // Penalty-null-space certificate at the STALL exit (gam#1040).
                // The survival marginal-slope joint block carries free gauge
                // directions (the #892 flexible-regime warp family) with no
                // curvature and no constraint: the optimizer drifts along them
                // with zero objective change, the Newton step never shrinks to
                // step_tol (nothing pins it), so the constrained-stationary
                // certificate's step-exhausted precondition is UNSATISFIABLE
                // and every full-budget solve used to exit here unconverged —
                // the outer REML then rejects ρ-evaluation after ρ-evaluation
                // and cycles for hours (#1040: matern/duchon/measure-jet all
                // time out; binary-MS, which has no such direction, fits in
                // seconds). Stationarity on the identifiable subspace is the
                // honest convergence statement: if the projected residual's
                // component in the RANGE of H_pen is at tolerance, the stalled
                // mass lives in ker(H_pen) — exactly what the outer IFT
                // projects out before the envelope correction (gam#553) — and
                // the iterate is accepted. A residual with genuine range-space
                // mass (a real defect) still exits unconverged below.
                if objective_change <= objective_tol
                    && let Some(range_residual) = projected_residual_range_space_inf(
                        &projected_residual_vec,
                        &joint_hessian_source,
                        &ranges,
                        &s_lambdas,
                        ridge,
                        options.ridge_policy,
                        total_p,
                    )
                    && range_residual <= 4.0 * residual_tol
                {
                    log::info!(
                        "[PIRLS/joint-Newton convergence] cycle {:>3} | residual-stall range-space certificate (gam#1040): \
                         total projected residual={:.3e} > tol={:.3e} stalled for {} cycles, but its range-space component={:.3e} \
                         ≤ 4×tol={:.3e} and |Δobjective|={:.3e} ≤ obj_tol={:.3e}; the stalled mass is a free \
                         ker(H_pen) gauge direction the outer IFT pseudo-inverse projects out — accepting as stationary \
                         on the identifiable subspace.",
                        cycle,
                        residual,
                        residual_tol,
                        cycles_since_residual_improved,
                        range_residual,
                        4.0 * residual_tol,
                        objective_change,
                        objective_tol,
                    );
                    // Record the residual this exit actually certified on
                    // (#1040 inner-report truthfulness): the converged status is
                    // earned by `range_residual ≤ 4×tol` on the identifiable
                    // subspace, so the terminal line must report that finite
                    // certified residual — not the `inf` stall-tracker sentinel,
                    // which a cycle-1 certificate exit (head KKT non-finite, so
                    // the head-of-cycle `min` update was skipped) would otherwise
                    // leave unset, printing `converged=true … best_residual_inf=inf`.
                    if range_residual.is_finite() {
                        min_certified_residual = min_certified_residual.min(range_residual);
                    }
                    converged = true;
                    break;
                }
                let last_math_summary = last_joint_math
                    .as_ref()
                    .map(|math| {
                        format!(
                            "last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
                            math.old_kkt_inf,
                            math.linearized_next_kkt_inf,
                            math.actual_reduction,
                            math.predicted_reduction,
                            math.trust_ratio,
                            math.scalar_model_relative_error(),
                            math.step_inf,
                            math.proposal_inf,
                        )
                    })
                    .unwrap_or_else(|| "last_newton_math=<none>".to_string());
                log::warn!(
                    "[PIRLS/joint-Newton convergence] cycle {:>3} | residual-stall early-exit: residual={:.3e} best_seen={:.3e} no_improve_cycles={} accepted_step_inf={:.3e} trust_radius={:.3e} block_stationarity_inf={:?} {}; returning unconverged with finite β so the outer optimizer rejects this ρ evaluation before inner_max_cycles.",
                    cycle,
                    residual,
                    best_residual_seen,
                    cycles_since_residual_improved,
                    accepted_step_inf,
                    joint_trust_radius,
                    block_stationarity_norms,
                    last_math_summary,
                );
                converged = false;
                break;
            }

            // KKT convergence: small residual plus EITHER a small
            // Newton step (tight quadratic-rate convergence, lets β
            // polish to machine precision), confirmed stagnation
            // (`accepted_step_inf <= step_tol` AND `objective_change
            // <= objective_tol`, the rank-deficient null-mode case),
            // OR a stricter stationarity certificate where both the
            // residual and objective change are an additional factor of
            // `inner_tol` below their scale-aware tolerances. The last
            // branch is deliberately stricter than the public tolerance:
            // it handles machine-precision null directions where β can
            // still move by about `step_tol` but the KKT residual and
            // objective are already over-polished. Using objective
            // stagnation alone is not sufficient; the residual guard is
            // what preserves first-order correctness.
            let superconverged_residual_tol = inner_tol * residual_tol;
            let superconverged_objective_tol = inner_tol * objective_tol;
            let superconverged_stationarity = residual <= superconverged_residual_tol
                && objective_change <= superconverged_objective_tol;
            if residual <= residual_tol
                && (step_inf <= step_tol
                    || (accepted_step_inf <= step_tol && objective_change <= objective_tol)
                    || superconverged_stationarity)
            {
                log::info!(
                    "[JN-EXIT] cycle={cycle} reason=plateau_objective_flat residual={residual:.3e} residual_tol={residual_tol:.3e} obj_change={objective_change:.3e} objective_tol={objective_tol:.3e} consecutive_flat={} accepted_step_inf={accepted_step_inf:.3e} step_tol={step_tol:.3e}",
                    obj_flat_streak.streak(),
                );
                // This branch certifies on `residual ≤ residual_tol`; record it
                // so the terminal line reports the finite certified residual
                // rather than the `inf` stall sentinel (#1040 truthfulness).
                if residual.is_finite() {
                    min_certified_residual = min_certified_residual.min(residual);
                }
                converged = true;
                break;
            }
            // Scale-invariant objective-plateau exit (gam#1040). The flatness
            // predicate is RELATIVE — `objective_tol = inner_tol·(1+|obj|)` —
            // so it fires identically whether the survival NLL objective is
            // O(1) or O(1e4); a fixed absolute ε never trips at the ~6e4
            // magnitude of a marginal-slope survival fit. When the objective
            // has been relative-flat for the full `FlatStreak` window the
            // iterate has stopped moving in value. On a genuinely flat REML
            // valley along the weakly-identified time-wiggle ρ the Newton
            // step is tiny because the gradient is tiny (not because the
            // trust region truncated it), so the `tr_clamped_during_stall`
            // precondition of the residual-stall range-space certificate
            // above is UNSATISFIED and that exit never fires — the loop used
            // to grind to `inner_loop_hard_ceiling` every outer eval, which
            // is the #1040 hang (outer REML rejects ρ after ρ for hours).
            // The honest convergence statement is identical to the tr-clamped
            // path: if the projected residual's component in range(H_pen) is
            // at tolerance, the un-moved mass lives in ker(H_pen) — the free
            // gauge directions the outer IFT pseudo-inverse projects out
            // (gam#553) — and the iterate IS the REML optimum on the
            // identifiable subspace. Report converged.
            let plateau_verdict = obj_flat_streak.note(objective_change <= objective_tol);
            if plateau_verdict == crate::solver::loop_guard::LoopVerdict::Plateaued
                && range_projected_block_stationarity_small()
                && let Some(range_residual) = projected_residual_range_space_inf(
                    &projected_residual_vec,
                    &joint_hessian_source,
                    &ranges,
                    &s_lambdas,
                    ridge,
                    options.ridge_policy,
                    total_p,
                )
                && range_residual <= 4.0 * residual_tol
            {
                log::info!(
                    "[JN-EXIT] cycle={cycle} reason=relative_objective_plateau (gam#1040): \
                     |Δobjective|={objective_change:.3e} ≤ obj_tol={objective_tol:.3e} for {} \
                     consecutive cycles (scale-invariant rel-flat streak); total projected \
                     residual={residual:.3e} > tol={residual_tol:.3e} but its range-space \
                     component={range_residual:.3e} ≤ 4×tol={:.3e} — the un-moved mass is a free \
                     ker(H_pen) gauge direction the outer IFT projects out; accepting as stationary \
                     on the identifiable subspace.",
                    obj_flat_streak.streak(),
                    4.0 * residual_tol,
                );
                // Certified on `range_residual ≤ 4×tol`; record it so the
                // terminal report carries this finite certified residual
                // instead of the `inf` stall sentinel (#1040 truthfulness).
                if range_residual.is_finite() {
                    min_certified_residual = min_certified_residual.min(range_residual);
                }
                converged = true;
                break;
            }
            // Carry the KKT-stationarity / objective-stagnation signals
            // into the next cycle so the line-search-failure path above
            // can recognise a true KKT optimum on a rank-deficient null
            // mode. See that path for the full rationale.
            last_cycle_residual_below_tol = residual <= residual_tol;
            last_cycle_obj_change_below_tol = objective_change <= objective_tol;

            // NOTE: there is deliberately NO wall-clock-driven "adaptive
            // early-exit" here. A convergence verdict that fires when a cycle's
            // wall-clock happens to fall below a fraction of a running EMA is
            // non-deterministic — under CPU contention (a parallel sweep) the
            // same fit accepts at a different iterate than it does run alone,
            // which cascades into a different outer seed and a different
            // continuation-pre-warm fire/collapse decision (gam#979's
            // "collapses sequentially, fires in parallel" instability). It also
            // accepts iterates up to 10× outside the real KKT/objective
            // tolerance, biasing the REML/LAML criterion the inner residual
            // feeds. Convergence is certified ONLY by the mathematical tests
            // above (KKT residual / Newton step / objective change at their
            // scale-aware tolerances); whether convergence is *reachable within
            // the cycle budget* is judged by the deterministic descent-rate
            // guard alongside the residual-stall detector above.
        }

        // Explicit terminal verdict for the joint-Newton inner solve.
        //
        // The per-cycle `[PIRLS/JN] cyc=N/MAX … resid=… (tol=…)` line prints
        // the KKT/step/objective gaps at every cycle but never states which
        // criterion *terminated* the loop, so the final visible line on a
        // budget-exhausted solve looks identical to an ordinary mid-run cycle
        // (gam#744). A reader scanning a sweep log cannot tell a fit that
        // reached a stationary point from one that simply ran out of cycles
        // with the residual still orders of magnitude above tolerance and only
        // the objective stalled. Emit one authoritative line, on every exit
        // path, naming the terminating condition: `converged` is the honest
        // status the result carries downstream, `budget_exhausted` distinguishes
        // "ran the full cap" from an early certificate/divergence exit, and the
        // residual/step/objective stall flags say *why*. A budget-exhausted,
        // non-converged exit is logged at WARN so it is impossible to miss even
        // when per-cycle INFO is filtered out; a clean convergence is INFO.
        {
            let budget_exhausted = cycles_done >= inner_max_cycles;
            // Hard convergence-truthfulness invariant (#1040): a converged exit
            // is, by construction, certified on a finite stationarity residual
            // ≤ tol (every `converged = true` path above is gated on a finite
            // residual / range-space check and records it into
            // `min_certified_residual`). If — through any path — `converged` is
            // set without a finite certified residual on record, the solve has
            // NOT actually certified convergence; reporting `converged=true …
            // best_residual_inf=inf` is the self-contradicting status #1040
            // flags. The honest status is then non-converged: downgrade it so
            // the outer REML/LAML evaluation rejects this ρ rather than
            // consuming a phantom optimum certified on no finite residual.
            if !crate::solver::loop_guard::inner_convergence_is_truthful(
                converged,
                min_certified_residual,
            ) {
                log::warn!(
                    "[PIRLS/joint-Newton terminal] cycle {cycles_done}/{inner_max_cycles}: a converged \
                     exit fired without any finite certified stationarity residual on record \
                     (min_certified_residual is non-finite) — this would report \
                     converged=true with best_residual_inf=inf, a convergence-truthfulness \
                     violation (#1040). Downgrading to non-converged so the outer optimizer \
                     rejects this evaluation."
                );
                converged = false;
            }
            let terminator = if converged {
                "KKT/certificate-converged"
            } else if budget_exhausted {
                "budget-exhausted (max cycles reached)"
            } else {
                "early-exit non-converged (divergence/stall guard)"
            };
            // `solve_wall` (whole inner-solve elapsed) + `cycles` make the
            // per-solve cost explicit on ONE line: gam#979's "outer
            // multiplication" candidate is read off by counting these terminal
            // lines across a repro and summing their wall-times, and the
            // overhead candidate by comparing `solve_wall / cycles` against the
            // [joint-newton-tr] phase splits. Together with the per-cycle
            // `per_block_resid` (which block stalls) and the existing TR line
            // (ρ gain-ratio + decision: model infidelity vs TR throttling), a
            // single RUST_LOG=info run separates all four #979 candidates.
            //
            // Report `min_certified_residual` (the smallest stationarity residual
            // the solve actually computed) rather than the stall-tracker
            // `best_residual_seen`: the latter is only written at the post-step
            // residual site, so a head-of-cycle / pre-line-search certificate exit
            // (cycle-0 KKT exit on already-stationary data) left it at the sentinel
            // `inf` and the line read `converged=true … best_residual_inf=inf`, a
            // self-contradicting status (#1040 inner-report truthfulness). A
            // converged exit always certified on a finite residual ≤ tol, so the
            // reported residual is finite whenever `converged` (every converged=true
            // path is gated on a `≤ tol` check of a residual recorded above).
            let reported_residual_below_tol = last_cycle_residual_below_tol
                || (converged && min_certified_residual <= last_residual_tol);
            let verdict = format!(
                "[PIRLS/joint-Newton terminal] converged={} terminator={} cycles={}/{} \
                 solve_wall={:.3}s best_residual_inf={:.3e} (tol={:.3e}) last_residual_below_tol={} \
                 last_obj_change_below_tol={} objective={:.6e}; this is the status the inner \
                 solve reports to the outer REML/LAML evaluation — a non-converged exit \
                 (residual ≫ tol with only the objective stalled) is rejected, not accepted",
                converged,
                terminator,
                cycles_done,
                inner_max_cycles,
                inner_started.elapsed().as_secs_f64(),
                min_certified_residual,
                last_residual_tol,
                reported_residual_below_tol,
                last_cycle_obj_change_below_tol,
                lastobjective,
            );
            if converged {
                log::info!("{verdict}");
            } else {
                log::warn!("{verdict}");
            }
        }

        // If joint Newton converged, skip the blockwise loop entirely.
        if converged {
            let penalty_value = total_quadratic_penalty(
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            let (block_logdet_h, block_logdet_s) = blockwise_logdet_terms_with_workspace(
                family,
                specs,
                &mut states,
                block_log_lambdas,
                options,
                cached_joint_workspace.clone(),
            )?;
            // The IFT/outer KKT residual must be the AUGMENTED stationarity
            // `∇L − Sβ + ∇Φ` the inner Newton actually drove to zero — NOT the bare
            // `∇L − Sβ`. With the Firth term armed, `∇L − Sβ = −∇Φ` at the
            // converged β, so the bare residual's null-space component equals ∇Φ
            // (O(‖∇Φ‖), e.g. 2.49 for the coupled Dirichlet). The outer evaluator's
            // range-projected IFT validity gate (`projected_into_reduced_range`)
            // then sees that ‖∇Φ‖ of "unresolved mass outside the reduced range"
            // and rejects EVERY seed at outer startup validation ("no candidate
            // seeds passed", gam#729/#715). Folding ∇Φ into the gradient makes the
            // residual the genuinely-near-zero augmented stationarity the inner
            // certified, so the gate passes. No-op when the term is
            // condition-gated/unavailable (∇Φ=0).
            let augmented_joint_gradient: Option<Array1<f64>> = match (
                cached_joint_gradient.as_ref(),
                joint_jeffreys_subspace.as_ref(),
            ) {
                (Some(gradient), Some(z_joint)) => {
                    match custom_family_joint_jeffreys_term(
                        family, &states, specs, &ranges, z_joint,
                    )? {
                        Some((_phi, grad_phi, _hphi)) if grad_phi.len() == gradient.len() => {
                            Some(gradient + &grad_phi)
                        }
                        _ => None,
                    }
                }
                _ => None,
            };
            let ift_gradient = augmented_joint_gradient
                .as_ref()
                .or(cached_joint_gradient.as_ref());
            let kkt_residual = exact_newton_joint_kkt_residual_for_ift_from_cached_gradient(
                family,
                specs,
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                Some(cached_active_sets.as_slice()),
                ift_gradient,
            )?;
            let kkt_residual =
                require_projected_kkt_residual(kkt_residual, "joint-Newton converged exit")?;
            // Thread the cert tolerance + free subspace rank through to
            // the unified evaluator's certificate so the outer
            // optimiser's InnerStatus carrier sees honest numbers
            // instead of NaN / None.
            let active_set_rows_total: usize = cached_active_sets
                .iter()
                .map(|maybe| maybe.as_ref().map(|v| v.len()).unwrap_or(0))
                .sum();
            let free_rank_at_cert = total_p.saturating_sub(active_set_rows_total);
            let kkt_residual = kkt_residual.with_metadata(last_residual_tol, free_rank_at_cert);
            // Build the joint active-constraint block for the unified
            // evaluator's constraint-aware kernel
            // `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. Returns `None` when
            // the family has no declared inequality constraints, or when
            // no rows are currently active at the cert point; in either
            // case the consumer-side `with_active_constraints` helper
            // degrades back to the bare penalty-projected pseudo-inverse.
            let active_constraints = {
                let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
                assemble_active_constraint_block(
                    &block_constraints,
                    &cached_active_sets,
                    &ranges,
                    total_p,
                )
                .map(std::sync::Arc::new)
            };
            return Ok(BlockwiseInnerResult {
                block_states: states,
                active_sets: normalize_active_sets(cached_active_sets),
                log_likelihood: current_log_likelihood,
                penalty_value,
                cycles: cycles_done,
                converged,
                block_logdet_h,
                block_logdet_s,
                s_lambdas,
                joint_workspace: cached_joint_workspace.clone(),
                kkt_residual: Some(kkt_residual),
                active_constraints,
            });
        }
        if cycles_done >= inner_max_cycles {
            if !converged {
                // Engine-level diagnostic. Emit measured quantities only:
                // objective movement, coefficient scale, per-block dimensions,
                // per-block β and gradient scales, the unprojected stationarity
                // norm at exit, the Hessian source shape, and the last accepted
                // Newton identity diagnostics. The outer error path has no
                // access to these internals, so this line is the complete
                // numerical record needed to decide the next fix.
                let block_grad_norms: Vec<f64> = match cached_joint_gradient.as_ref() {
                    Some(joint_grad) => {
                        let mut acc = 0usize;
                        states
                            .iter()
                            .map(|s| {
                                let n = s.beta.len();
                                let end = (acc + n).min(joint_grad.len());
                                let nrm = if acc < end {
                                    joint_grad
                                        .slice(ndarray::s![acc..end])
                                        .iter()
                                        .map(|x: &f64| x.abs())
                                        .fold(0.0_f64, f64::max)
                                } else {
                                    f64::NAN
                                };
                                acc += n;
                                nrm
                            })
                            .collect()
                    }
                    None => vec![f64::NAN; states.len()],
                };
                let block_widths: Vec<usize> = states.iter().map(|s| s.beta.len()).collect();
                let block_beta_inf: Vec<f64> = states
                    .iter()
                    .map(|s| s.beta.iter().map(|x: &f64| x.abs()).fold(0.0_f64, f64::max))
                    .collect();
                let descent_total = initial_joint_objective - lastobjective;
                let beta_inf_final = states
                    .iter()
                    .flat_map(|s| s.beta.iter().copied())
                    .map(f64::abs)
                    .fold(0.0_f64, f64::max);
                let block_diag_default =
                    !family.exact_newton_joint_hessian_beta_dependent() && specs.len() >= 2;
                let exit_unprojected_kkt_inf = cached_joint_gradient
                    .as_ref()
                    .and_then(|joint_grad| {
                        exact_newton_joint_stationarity_vector_from_gradient(
                            joint_grad,
                            &states,
                            specs,
                            &s_lambdas,
                            ridge,
                            options.ridge_policy,
                        )
                        .ok()
                    })
                    .map(|residual| {
                        residual
                            .iter()
                            .map(|x: &f64| x.abs())
                            .fold(0.0_f64, f64::max)
                    })
                    .unwrap_or(f64::NAN);
                let last_math_summary = last_joint_math
                    .as_ref()
                    .map(|math| {
                        format!(
                            "last_newton_math={{old_kkt={:.3e}, linearized_next={:.3e}, actual={:+.3e}, pred={:+.3e}, rho={:+.3e}, scalar_relerr={:.3e}, step_inf={:.3e}, proposal_inf={:.3e}}}",
                            math.old_kkt_inf,
                            math.linearized_next_kkt_inf,
                            math.actual_reduction,
                            math.predicted_reduction,
                            math.trust_ratio,
                            math.scalar_model_relative_error(),
                            math.step_inf,
                            math.proposal_inf,
                        )
                    })
                    .unwrap_or_else(|| "last_newton_math=<none>".to_string());
                log::warn!(
                    "[PIRLS/joint-Newton] cycle={} budget-exhausted without KKT: objective_start={:.6e} objective_end={:.6e} objective_drop={:+.3e} beta_inf={:.3e} exit_unprojected_kkt_inf={:.3e} total_p={} total_n={} block_widths={:?} block_beta_inf={:?} block_grad_inf={:?} block_diag_hessian_default={} {}; rejecting this outer REML/LAML evaluation",
                    cycles_done,
                    initial_joint_objective,
                    lastobjective,
                    descent_total,
                    beta_inf_final,
                    exit_unprojected_kkt_inf,
                    total_p,
                    total_joint_n,
                    block_widths,
                    block_beta_inf,
                    block_grad_norms,
                    block_diag_default,
                    last_math_summary,
                );
                if coupled_exact_joint_required {
                    // Budget-exhaustion error MUST carry `block_residual_inf=…`
                    // so the carrying block survives the bubble through the
                    // outer optimiser. If no in-cycle cert refusal produced
                    // a structured report we build one here from the cached
                    // joint gradient + states. `joint_hessian_source` is
                    // per-cycle so the H_pen spectrum fields degrade to
                    // NaN/empty; per-block residual data is fully present.
                    let block_diag = if let Some(report) = last_kkt_refusal_report.as_ref() {
                        report.format_bubbled_error()
                    } else {
                        let block_constraints =
                            collect_block_linear_constraints(family, &states, specs)?;
                        let report = compute_kkt_refusal_report(
                            cycles_done,
                            &states,
                            specs,
                            &s_lambdas,
                            &ranges,
                            cached_joint_gradient.as_ref(),
                            &cached_active_sets,
                            &block_constraints,
                            None,
                            total_p,
                            ridge,
                            options.ridge_policy,
                            f64::NAN,
                            f64::NAN,
                            f64::NAN,
                            last_residual_tol,
                            f64::NAN,
                            f64::NAN,
                            f64::NAN,
                            exit_unprojected_kkt_inf,
                            last_joint_math.as_ref(),
                        );
                        report.format_bubbled_error()
                    };
                    return Err(format!(
                        "coupled exact-joint inner solve exhausted the joint Newton budget without KKT convergence after {cycles_done} cycle(s) — {block_diag}"
                    ));
                }
            }
            let penalty_value = total_quadratic_penalty(
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            let (block_logdet_h, block_logdet_s) = blockwise_logdet_terms_with_workspace(
                family,
                specs,
                &mut states,
                block_log_lambdas,
                options,
                cached_joint_workspace.clone(),
            )?;
            let active_constraints = {
                let local_ranges = block_param_ranges(specs);
                let local_total_p = local_ranges.last().map(|(_, end)| *end).unwrap_or(0);
                let block_constraints = collect_block_linear_constraints(family, &states, specs)?;
                assemble_active_constraint_block(
                    &block_constraints,
                    &cached_active_sets,
                    &local_ranges,
                    local_total_p,
                )
                .map(std::sync::Arc::new)
            };
            return Ok(BlockwiseInnerResult {
                block_states: states,
                active_sets: normalize_active_sets(cached_active_sets),
                log_likelihood: current_log_likelihood,
                penalty_value,
                cycles: cycles_done,
                converged,
                block_logdet_h,
                block_logdet_s,
                s_lambdas,
                joint_workspace: cached_joint_workspace.clone(),
                kkt_residual: None,
                active_constraints,
            });
        }
        if coupled_exact_joint_required {
            // Bubble the structured KKT refusal report (per-block residual
            // breakdown + H_pen spectrum + diagnosis) so the cause of the
            // refusal survives serialization through the outer optimizer,
            // the seed-validation cascade, and gamfit. When the cert refused
            // inside the cycle loop we already computed a `KktRefusalReport`
            // at the refusing iterate; reuse it verbatim. If a different
            // early-exit path reaches this branch, build the same structured
            // report from the last Newton math snapshot rather than routing
            // through a second diagnostic string format.
            let block_diag = last_kkt_refusal_report
                .as_ref()
                .map(KktRefusalReport::format_bubbled_error)
                .unwrap_or_else(|| {
                    "structured KKT refusal report unavailable: no joint Newton math snapshot"
                        .to_string()
                });
            return Err(format!(
                "coupled exact-joint inner solve exited the joint Newton path before convergence — {block_diag}"
            ));
        }
        // Otherwise fall through to blockwise iteration below.
    }

    let mut cached_eval = match cached_eval {
        Some(eval) => eval,
        None => family.evaluate(&states)?,
    };
    lastobjective = -cached_eval.log_likelihood + current_penalty;

    // Divergence-detection state for the blockwise loop.
    //
    // Some family parameterizations (e.g. BernoulliMarginalSlopeFamily with
    // linkwiggle + scorewarp) carry a near-null direction in the joint
    // Hessian when the link-deviation basis's empirical anchor — fixed at
    // the rigid-pilot η₀ when the basis is constructed — drifts during
    // PIRLS as the location/spatial blocks update η₀. The Newton step
    // becomes dominated by that null direction and is clamped at
    // MAX_NEWTON_STEP every cycle while β grows linearly along it; the
    // log-likelihood stays frozen, only the penalty changes (slowly).
    // Without an early-exit the loop runs to inner_max_cycles producing
    // the same -loglik over and over, which at large scale (each cycle
    // ~0.5s) burns ~50s per ρ-cost call and stacks up to a 2400s timeout.
    //
    // Detect the pattern and bail with `converged = false` so the cost
    // call returns Err / +∞, BFGS κ-optim backs off the divergent ρ
    // region, and the outer loop progresses instead of grinding.

    // Per-block trust-region radius in the block's penalized-Hessian metric.
    // Updated each cycle by `update_joint_trust_region_radius` (the same
    // function the joint-Newton path uses) on a real model-vs-truth rho
    // computed from each block's penalized quadratic. Using the curvature
    // metric here avoids the same starvation mechanism fixed in the joint
    // path: one near-null coordinate in a block must not raw-rescale every
    // other coordinate in that block. The η-overflow safety half of the
    // previous static `MAX_NEWTON_STEP = 20.0` is owned by the family's
    // `max_feasible_step_size` barrier check, called by the line search below;
    // this variable handles only the algorithmic trust-region half. The
    // initial seed value is the family-declared safe step for a fresh fit; the
    // function then adapts it freely (clamped to [1e-12, 1e6] by the function
    // itself, same as the joint path).
    const BLOCK_NEWTON_STEP_INITIAL: f64 = 20.0;
    let mut block_max_step: Vec<f64> = vec![BLOCK_NEWTON_STEP_INITIAL; specs.len()];

    let mut prev_log_likelihood_for_divergence_check = cached_eval.log_likelihood;
    // Frozen-loglik streak rides the shared window discipline
    // (loop_guard::FlatStreak, #968); the frozen-loglik predicate and the
    // clamped-step side condition below stay local — they are policy about
    // what counts as flat, which this loop rightly owns.
    let mut frozen_loglik_streak =
        crate::solver::loop_guard::FlatStreak::new(DIVERGENCE_FROZEN_LOGLIK_CYCLES);
    // Coordinate descent visits each block in turn, so `max_proposed_beta_step`
    // (the per-cycle max across blocks) only fires the cap on cycles where
    // the divergent block is the active one. On a near-null direction this
    // produces an alternation pattern (e.g. cap, cap, small, cap, small,
    // cap, …) and a strict "consecutive cycles where step is clamped"
    // requirement resets the counter every time another block's smaller
    // step dominates the per-cycle maximum. The frozen-loglik signal,
    // however, is a property of the joint state — it stays true across
    // every cycle of the alternation. Track frozen-loglik consecutively
    // and require that `step_clamped` was observed AT LEAST ONCE inside
    // the frozen run (rather than EVERY cycle).
    let mut clamped_step_in_frozen_run: bool = false;
    const DIVERGENCE_FROZEN_LOGLIK_CYCLES: usize = 8;

    let is_dynamic = family.block_geometry_is_dynamic();
    for cycle in 0..inner_max_cycles {
        // Fires at the top of each blockwise coordinate cycle so we can count
        // iterations from CI logs when a benchmark hangs inside the first
        // outer-eval. Emitted at info-level: same rationale as the joint-Newton
        // sibling above — silent-grind diagnosis without debug logs.
        log::info!(
            "[PIRLS/blockwise coord] cycle {:>3}/{} | -loglik {:.6e} | penalty {:.6e} | objective {:.6e}",
            cycle,
            inner_max_cycles,
            -cached_eval.log_likelihood,
            current_penalty,
            lastobjective,
        );
        let mut max_proposed_beta_step = 0.0_f64;
        let mut max_accepted_beta_step = 0.0_f64;
        let mut trust_boundary_hit_in_cycle = false;

        let mut objective_cycle_prev = lastobjective;
        // Reuse cached evaluation from end of previous cycle (or initial eval).
        // For dynamic families, the end-of-cycle evaluation is also reused here
        // instead of re-evaluating redundantly — the state hasn't changed since
        // the last cycle's final evaluate.
        let mut cycle_eval = std::mem::replace(
            &mut cached_eval,
            FamilyEvaluation {
                log_likelihood: 0.0,
                blockworking_sets: Vec::new(),
            },
        );
        if cycle_eval.blockworking_sets.len() != specs.len() {
            return Err(format!(
                "family returned {} block working sets, expected {}",
                cycle_eval.blockworking_sets.len(),
                specs.len()
            ));
        }
        // Track whether any block was modified this cycle (for dynamic families,
        // we only need to re-evaluate before block b if a previous block changed).
        let mut any_block_modified = false;
        for b in 0..specs.len() {
            if is_dynamic && any_block_modified {
                // Only re-evaluate if a previous block in this cycle actually
                // modified coefficients. Skips the redundant evaluate for the
                // first block (b=0) since cached_eval is still valid.
                refresh_all_block_etas(family, specs, &mut states)?;
                cycle_eval = family.evaluate(&states)?;
                if cycle_eval.blockworking_sets.len() != specs.len() {
                    return Err(format!(
                        "family returned {} block working sets, expected {}",
                        cycle_eval.blockworking_sets.len(),
                        specs.len()
                    ));
                }
            }

            let spec = &specs[b];
            let work = &cycle_eval.blockworking_sets[b];
            let linear_constraints = family.block_linear_constraints(&states, b, spec)?;
            let s_lambda = &s_lambdas[b];
            let updater = work.updater();
            let update = updater.compute_update_step(&BlockUpdateContext {
                family,
                states: &states,
                spec,
                block_idx: b,
                s_lambda,
                options,
                linear_constraints: linear_constraints.as_ref(),
                cached_active_set: cached_active_sets[b].as_deref(),
            })?;
            if let Some(active_set) = update.active_set {
                cached_active_sets[b] = Some(active_set);
            }
            let beta_new_raw = update.beta_new_raw;
            let beta_new = family.post_update_block_beta(&states, b, spec, beta_new_raw.clone())?;
            reject_constrained_post_update_repair(
                b,
                spec,
                &beta_new_raw,
                &beta_new,
                linear_constraints.as_ref(),
            )?;
            let beta_old = states[b].beta.clone();
            let raw_delta = &beta_new - &beta_old;
            // Per-block trust-region radius in the block's local
            // penalized-Hessian metric. The cap is the current value of
            // `block_max_step[b]`, updated below via
            // `update_joint_trust_region_radius` once we know rho.
            let block_cap = block_max_step[b];
            let (delta, step_metric_norm) = truncate_block_step_to_metric_radius(
                spec,
                work,
                s_lambda,
                raw_delta,
                block_cap,
                ridge,
                options.ridge_policy,
            )?;
            let step_hit_trust_boundary =
                joint_block_step_hit_trust_boundary(step_metric_norm, block_cap);
            trust_boundary_hit_in_cycle |= step_hit_trust_boundary;
            // Capture the objective at the start of this block update so
            // we can compute the true `actual_reduction` once the line
            // search has finished. `objective_cycle_prev` is the running
            // total: it advances inside the line search whenever a trial
            // is accepted, so we must snapshot it here.
            let obj_before_block = objective_cycle_prev;
            let old_block_penalty =
                block_quadratic_penalty(&beta_old, s_lambda, ridge, options.ridge_policy);
            let step_beta_inf = delta.iter().copied().map(f64::abs).fold(0.0, f64::max);
            max_proposed_beta_step = max_proposed_beta_step.max(step_beta_inf);
            if step_beta_inf <= inner_tol {
                continue;
            }

            // Damped update: require non-increasing penalized objective under dynamic geometry.
            // Precompute X * delta once so line-search eta updates are O(n) not O(np).
            // Reuse pre-allocated eta backup to avoid O(n) allocation per block per cycle.
            let eta_checkpoint = BlockEtaCheckpoint::capture_reuse(&states[b], &mut eta_backups[b]);
            let x_delta = if !is_dynamic {
                Some(spec.solver_design().matrixvectormultiply(&delta))
            } else {
                None
            };
            let mut accepted = false;
            // Barrier-aware step ceiling: families with natural log-barrier
            // terms (e.g. log(h') in transformation-normal) report the maximum
            // feasible step fraction so the line search never evaluates the
            // likelihood outside its domain.
            let barrier_ceiling = family
                .max_feasible_step_size(&states, b, &delta)?
                .unwrap_or(1.0);
            // Reuse trial_beta_buf to avoid allocation per backtracking trial.
            let mut trial_beta_buf = beta_old.clone();
            let mut accepted_bt: usize = usize::MAX;
            for bt in 0..8 {
                let alpha = (0.5f64.powi(bt)).min(barrier_ceiling);
                trial_beta_buf.assign(&beta_old);
                trial_beta_buf.scaled_add(alpha, &delta);
                let trial_beta =
                    family.post_update_block_beta(&states, b, spec, trial_beta_buf.clone())?;
                reject_constrained_post_update_repair(
                    b,
                    spec,
                    &trial_beta_buf,
                    &trial_beta,
                    linear_constraints.as_ref(),
                )?;
                states[b].beta = trial_beta;
                // Use precomputed X*delta when geometry is static and beta wasn't modified.
                if let Some(ref xd) = x_delta {
                    if states[b].beta == trial_beta_buf {
                        eta_checkpoint.restore_eta_with_step(&mut states[b], alpha, xd);
                    } else {
                        refresh_single_block_eta(family, specs, &mut states, b)?;
                    }
                } else {
                    refresh_single_block_eta(family, specs, &mut states, b)?;
                }
                let trial_block_penalty =
                    block_quadratic_penalty(&states[b].beta, s_lambda, ridge, options.ridge_policy);
                let trial_penalty = current_penalty - old_block_penalty + trial_block_penalty;
                let line_search_options = coefficient_line_search_options(
                    options,
                    objective_cycle_prev - trial_penalty + 1e-10,
                );
                let trial_ll =
                    match family.log_likelihood_only_with_options(&states, &line_search_options) {
                        Ok(value) => value,
                        Err(_) => {
                            states[b].beta.assign(&beta_old);
                            eta_checkpoint.restore_eta(&mut states[b]);
                            continue;
                        }
                    };
                let trialobjective = -trial_ll + trial_penalty;
                if trialobjective.is_finite() && trialobjective <= objective_cycle_prev + 1e-10 {
                    objective_cycle_prev = trialobjective;
                    current_penalty = trial_penalty;
                    accepted = true;
                    accepted_bt = bt as usize;
                    break;
                }
            }
            // Trust-region update for this block, using the same
            // `update_joint_trust_region_radius` strategy the
            // joint-Newton path uses. Predicted reduction is computed
            // from the per-block penalized quadratic model:
            //
            //   Q(β + αδ) ≈ Q(β) − α·rhs·δ + 0.5·α²·δ·H_pen·δ
            //   predicted_reduction(α) = α·(rhs·δ) − 0.5·α²·(δ·H_pen·δ)
            //
            // where `rhs = score − S·β (− ridge·β)` is the penalized
            // gradient (in maximize-direction) and `H_pen = H + S
            // (+ ridge·I)` is the penalized observed information.
            // Actual reduction is the true penalized objective change
            // measured by the line search; rho = actual / predicted is
            // the standard model-vs-truth ratio that drives the same
            // 0.25 / 0.75 grow-shrink rules `update_joint_trust_region_radius`
            // already implements for the joint path.
            let alpha_accepted = if accepted {
                0.5_f64.powi(accepted_bt as i32)
            } else {
                0.0
            };
            let (rhs_block, hpen_delta_full): (Array1<f64>, Array1<f64>) = match work {
                BlockWorkingSet::ExactNewton { gradient, .. } => {
                    let mut rhs = gradient - &s_lambda.dot(&beta_old);
                    if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                        rhs.scaled_add(-ridge, &beta_old);
                    }
                    let hpen = block_penalized_hessian_vector(
                        spec,
                        work,
                        s_lambda,
                        &delta,
                        ridge,
                        options.ridge_policy,
                    );
                    (rhs, hpen)
                }
                BlockWorkingSet::Diagonal {
                    working_response,
                    working_weights,
                } => {
                    // IRLS local-quadratic gradient and Hessian:
                    //   rhs = X^T W (z − Xβ) − Sβ
                    //   H_pen δ = X^T W X δ + Sδ
                    let solver_design = spec.solver_design();
                    let xb = solver_design.matrixvectormultiply(&beta_old);
                    let resid = working_response - &xb;
                    let w_resid = &resid * working_weights;
                    let mut rhs = solver_design.transpose_vector_multiply(&w_resid);
                    rhs -= &s_lambda.dot(&beta_old);
                    if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                        rhs.scaled_add(-ridge, &beta_old);
                    }
                    let hpen = block_penalized_hessian_vector(
                        spec,
                        work,
                        s_lambda,
                        &delta,
                        ridge,
                        options.ridge_policy,
                    );
                    (rhs, hpen)
                }
            };
            let rhs_dot_delta = rhs_block.dot(&delta);
            let delta_dot_hpen = delta.dot(&hpen_delta_full);
            let predicted_reduction = alpha_accepted * rhs_dot_delta
                - 0.5 * alpha_accepted * alpha_accepted * delta_dot_hpen;
            let actual_reduction = obj_before_block - objective_cycle_prev;
            let trust_update = update_joint_trust_region_radius(
                block_max_step[b],
                alpha_accepted * step_metric_norm,
                actual_reduction,
                predicted_reduction,
                obj_before_block,
            );
            block_max_step[b] = trust_update.radius;
            if !accepted {
                states[b].beta.assign(&beta_old);
                eta_checkpoint.restore_eta(&mut states[b]);
                if let BlockWorkingSet::ExactNewton { gradient, .. } = work {
                    let mut raw_descent = gradient - &s_lambda.dot(&beta_old);
                    if options.ridge_policy.include_quadratic_penalty && ridge > 0.0 {
                        raw_descent -= &beta_old.mapv(|v| ridge * v);
                    }
                    let (descent_dir, descent_metric_norm) = truncate_block_step_to_metric_radius(
                        spec,
                        work,
                        s_lambda,
                        raw_descent,
                        block_cap,
                        ridge,
                        options.ridge_policy,
                    )?;
                    trust_boundary_hit_in_cycle |=
                        joint_block_step_hit_trust_boundary(descent_metric_norm, block_cap);
                    let dir_norm = descent_dir.iter().fold(0.0_f64, |m, &v| m.max(v.abs()));
                    if dir_norm > inner_tol {
                        // Precompute X * descent_dir once for incremental eta updates.
                        let x_descent = if !is_dynamic {
                            Some(spec.solver_design().matrixvectormultiply(&descent_dir))
                        } else {
                            None
                        };
                        let descent_barrier_ceiling = family
                            .max_feasible_step_size(&states, b, &descent_dir)?
                            .unwrap_or(1.0);
                        for bt in 0..12 {
                            let alpha = (0.5f64.powi(bt)).min(descent_barrier_ceiling);
                            trial_beta_buf.assign(&beta_old);
                            trial_beta_buf.scaled_add(alpha, &descent_dir);
                            let trial_beta = family.post_update_block_beta(
                                &states,
                                b,
                                spec,
                                trial_beta_buf.clone(),
                            )?;
                            reject_constrained_post_update_repair(
                                b,
                                spec,
                                &trial_beta_buf,
                                &trial_beta,
                                linear_constraints.as_ref(),
                            )?;
                            states[b].beta = trial_beta;
                            if let Some(ref xd) = x_descent {
                                if states[b].beta == trial_beta_buf {
                                    eta_checkpoint.restore_eta_with_step(&mut states[b], alpha, xd);
                                } else {
                                    refresh_single_block_eta(family, specs, &mut states, b)?;
                                }
                            } else {
                                refresh_single_block_eta(family, specs, &mut states, b)?;
                            }
                            let trial_block_penalty = block_quadratic_penalty(
                                &states[b].beta,
                                s_lambda,
                                ridge,
                                options.ridge_policy,
                            );
                            let trial_penalty =
                                current_penalty - old_block_penalty + trial_block_penalty;
                            let line_search_options = coefficient_line_search_options(
                                options,
                                objective_cycle_prev - trial_penalty + 1e-10,
                            );
                            let trial_ll = match family
                                .log_likelihood_only_with_options(&states, &line_search_options)
                            {
                                Ok(value) => value,
                                Err(_) => {
                                    states[b].beta.assign(&beta_old);
                                    eta_checkpoint.restore_eta(&mut states[b]);
                                    continue;
                                }
                            };
                            let trialobjective = -trial_ll + trial_penalty;
                            if trialobjective.is_finite()
                                && trialobjective <= objective_cycle_prev + 1e-10
                            {
                                objective_cycle_prev = trialobjective;
                                current_penalty = trial_penalty;
                                accepted = true;
                                break;
                            }
                            states[b].beta.assign(&beta_old);
                            eta_checkpoint.restore_eta(&mut states[b]);
                        }
                    }
                }
            }
            if !accepted {
                states[b].beta.assign(&beta_old);
                eta_checkpoint.restore_eta(&mut states[b]);
            } else {
                let accepted_step = states[b]
                    .beta
                    .iter()
                    .zip(beta_old.iter())
                    .map(|(new, old)| (new - old).abs())
                    .fold(0.0_f64, f64::max);
                max_accepted_beta_step = max_accepted_beta_step.max(accepted_step);
                any_block_modified = true;
            }
            // Recycle the checkpoint's buffer back into the pre-allocated pool.
            eta_backups[b] = eta_checkpoint.into_buffer();
        }

        // For non-dynamic families, incremental eta updates within the block loop
        // maintain correct etas. Only refresh from scratch for dynamic-geometry families
        // where block interactions may require recomputation.
        if is_dynamic {
            refresh_all_block_etas(family, specs, &mut states)?;
        }
        cached_eval = family.evaluate(&states)?;
        current_penalty = total_quadratic_penalty(
            &states,
            &s_lambdas,
            ridge,
            options.ridge_policy,
            joint_bundle,
            Some(specs),
        );
        let objective = -cached_eval.log_likelihood + current_penalty;
        let objective_change = (objective - lastobjective).abs();
        lastobjective = objective;
        cycles_done = cycle + 1;

        // Divergence guard (mirrors the joint-Newton sibling, gam#554): a
        // non-finite objective / log-likelihood means a near-unidentified
        // penalized block has propagated NaN mass through the coordinate
        // descent. Every convergence and divergence-frozen exit below is a
        // finite `<=` comparison that NaN silently defeats, so without this
        // the loop grinds the full `inner_max_cycles` on every outer ρ-eval
        // and startup seed. Break unconverged so the outer optimizer rejects
        // this point immediately instead of burning the budget.
        if !objective.is_finite() || !cached_eval.log_likelihood.is_finite() {
            log::warn!(
                "[PIRLS/blockwise convergence] cycle {:>3} | divergence guard: non-finite inner state (objective={:.3e}, -loglik={:.3e}); returning unconverged so the outer optimizer rejects this ρ evaluation instead of running to inner_max_cycles.",
                cycle,
                objective,
                -cached_eval.log_likelihood,
            );
            converged = false;
            break;
        }

        // Scale-aware tolerances — see the matching joint-Newton path
        // above for the rationale. At large scale absolute step/residual
        // tolerances against `inner_tol = 1e-6` keep this loop spinning
        // long after the objective has gone flat.
        let beta_inf = states
            .iter()
            .flat_map(|s| s.beta.iter().copied())
            .map(f64::abs)
            .fold(0.0_f64, f64::max);
        let step_tol = inner_tol * (1.0 + beta_inf);
        let objective_tol = inner_tol * (1.0 + objective.abs());
        let residual_tol = objective_tol;
        // For single-block models the blockwise iteration IS the joint
        // iteration, so block-conditional convergence implies joint
        // convergence.  The exact_newton_joint_stationarity check can
        // stall at ~10x the tolerance due to numerical differences
        // between the block-conditional and joint gradient formulations,
        // causing 100s of wasted cycles on an already-converged solution.
        let exact_joint_stationarity_ok = if has_joint_exacthessian && specs.len() >= 2 {
            exact_newton_joint_stationarity_inf_norm(
                family,
                specs,
                &cached_eval,
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                None,
            )?
            .map(|residual| residual <= residual_tol)
            .unwrap_or(true)
        } else {
            true
        };
        log::info!(
            "[PIRLS/blockwise convergence] cycle {:>3} | max_proposed_step={:.3e} (tol={:.3e}) | max_accepted_step={:.3e} | obj_change={:.3e} (tol={:.3e}) | beta_inf={:.3e} | joint_stationarity_ok={}",
            cycle,
            max_proposed_beta_step,
            step_tol,
            max_accepted_beta_step,
            objective_change,
            objective_tol,
            beta_inf,
            exact_joint_stationarity_ok,
        );

        // Divergence early-exit. See the rationale block at the top of
        // this loop. We treat "log-likelihood unchanged + Newton step
        // pinned at the trust-region cap" as a near-null direction
        // signature and break out unconverged once it persists for
        // DIVERGENCE_FROZEN_LOGLIK_CYCLES consecutive iterations. Tracking
        // log-likelihood (not objective) is essential: when the null mode
        // dominates, only the penalty drifts cycle-to-cycle, so
        // `objective_change` stays above tol while -loglik is genuinely
        // frozen.
        let loglik_change_for_divergence_check =
            (cached_eval.log_likelihood - prev_log_likelihood_for_divergence_check).abs();
        let loglik_frozen_tol_for_divergence_check =
            inner_tol * (1.0 + cached_eval.log_likelihood.abs());
        let step_clamped_for_divergence_check = trust_boundary_hit_in_cycle;
        let loglik_frozen =
            loglik_change_for_divergence_check <= loglik_frozen_tol_for_divergence_check;
        let frozen_verdict = frozen_loglik_streak.note(loglik_frozen);
        if loglik_frozen {
            if step_clamped_for_divergence_check {
                clamped_step_in_frozen_run = true;
            }
        } else {
            clamped_step_in_frozen_run = false;
        }
        prev_log_likelihood_for_divergence_check = cached_eval.log_likelihood;
        if frozen_verdict == crate::solver::loop_guard::LoopVerdict::Plateaued
            && clamped_step_in_frozen_run
        {
            log::warn!(
                "[PIRLS/blockwise convergence] divergence early-exit at cycle {} | -loglik={:.6e} frozen for {} consecutive cycles | max_proposed_step={:.3e} (trust-boundary hit observed in frozen run) | step_tol={:.3e}; near-null Hessian direction detected — returning unconverged so the outer optimizer backs off this region instead of running to inner_max_cycles.",
                cycle,
                -cached_eval.log_likelihood,
                frozen_loglik_streak.streak(),
                max_proposed_beta_step,
                step_tol,
            );
            converged = false;
            break;
        }

        // NOTE: there is deliberately NO wall-clock-driven "adaptive
        // early-exit" here — the same discipline the joint-Newton sibling loop
        // documents above. A verdict that fires when a cycle's wall-clock falls
        // below a fraction of a running EMA is non-deterministic: under CPU
        // contention (a parallel sweep) the same fit accepts at a different
        // iterate than it does run alone, and it accepts iterates up to 10×
        // outside the real KKT/objective tolerance, biasing the REML/LAML
        // criterion the inner residual feeds. Convergence is certified ONLY by
        // the exact stationarity gate below.
        if max_accepted_beta_step <= step_tol && objective_change <= objective_tol {
            if exact_joint_stationarity_ok || max_proposed_beta_step <= step_tol {
                converged = true;
            }
            break;
        }
    }

    // ── Polishing joint Newton step ──
    //
    // For block-coupled multi-block families (e.g. GAMLSS wiggle), Gauss-Seidel
    // blockwise iteration can reach step_inf < inner_tol while the joint KKT
    // residual (||Sβ − grad_ℓ||_∞) remains at ~10× inner_tol. This is because
    // each block is solved conditionally on other blocks' current values —
    // block-conditional stationarity does not imply joint stationarity when
    // the likelihood couples blocks off-diagonally.
    //
    // Once blockwise has placed β near the true joint optimum, a single (or
    // a few) damped joint Newton steps can tighten the joint residual to the
    // floor set by β magnitudes. This polishing phase is essential for the
    // outer REML gradient formula (which assumes exact β̂ stationarity); a
    // non-converged β̂ produces large envelope-theorem violations in the
    // analytic outer gradient.
    if use_joint_newton && !converged {
        polish_joint_newton_step(
            family,
            specs,
            options,
            &s_lambdas,
            ridge,
            joint_bundle,
            inner_tol,
            &cached_active_sets,
            &mut states,
            &mut cached_eval,
            &mut current_penalty,
            &mut converged,
        )?;
    }

    assemble_inner_blockwise_result(
        family,
        specs,
        states,
        block_log_lambdas,
        options,
        s_lambdas,
        ridge,
        joint_bundle,
        cached_active_sets,
        &cached_eval,
        converged,
        cycles_done,
        last_residual_tol,
    )
}

/// Polishing joint-Newton step for the blockwise fall-through path of
/// [`inner_blockwise_fit`].
///
/// For block-coupled multi-block families (e.g. GAMLSS wiggle), Gauss-Seidel
/// blockwise iteration can reach `step_inf < inner_tol` while the joint KKT
/// residual (`||Sβ − grad_ℓ||_∞`) remains at ~10× `inner_tol`. Once blockwise
/// has placed β near the joint optimum, a few damped joint-Newton steps tighten
/// the joint residual to the floor set by β magnitudes; this is essential for the
/// outer REML gradient formula (which assumes exact β̂ stationarity).
///
/// Behavior is identical to the inline loop it replaced: the `?`-propagation, the
/// per-iteration `break` exits (gradient/Hessian unavailable, non-finite delta,
/// solver failure, residual-tolerance reached, line-search failure) and the
/// inner backtracking-search `continue` are preserved verbatim. Mutates `states`,
/// `cached_eval`, `current_penalty`, and `converged` in place exactly as before.
pub(crate) fn polish_joint_newton_step<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    s_lambdas: &[Array2<f64>],
    ridge: f64,
    joint_bundle: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
    inner_tol: f64,
    cached_active_sets: &[Option<Vec<usize>>],
    states: &mut Vec<ParameterBlockState>,
    cached_eval: &mut FamilyEvaluation,
    current_penalty: &mut f64,
    converged: &mut bool,
) -> Result<(), String> {
    let ranges_joint: Vec<(usize, usize)> = {
        let mut offset = 0;
        specs
            .iter()
            .map(|s| {
                let start = offset;
                offset += s.design.ncols();
                (start, offset)
            })
            .collect()
    };
    let total_p_joint: usize = ranges_joint.last().map_or(0, |r| r.1);
    let joint_mode_diagonal_ridge = if ridge > 0.0 && options.ridge_policy.include_quadratic_penalty
    {
        ridge
    } else {
        0.0
    };
    let trace_diagonal_ridge = joint_mode_diagonal_ridge + JOINT_TRACE_STABILITY_RIDGE;

    // Allow up to a few polishing steps. The blockwise endpoint is close
    // to optimum, so step sizes should be small and line search should
    // accept full steps quickly.
    const POLISH_MAX_ITER: usize = 16;
    for _polish_iter in 0..POLISH_MAX_ITER {
        // Re-evaluate at current β to get the joint gradient and Hessian.
        refresh_all_block_etas(family, specs, states)?;
        let eval_for_polish = family.evaluate(states)?;
        let grad_full =
            match exact_newton_joint_gradient_from_eval(&eval_for_polish, specs, states)? {
                Some(g) => g,
                None => break,
            };
        // Spec-aware joint Hessian: canonical coupled-curvature source
        // (see the joint-Newton availability gate). Families overriding
        // only `_with_specs` return `None` from the spec-less default.
        let h_joint_opt = family.exact_newton_joint_hessian_with_specs(states, specs)?;
        let Some(h_joint) = h_joint_opt else { break };
        let mut h_dense = match symmetrized_square_matrix(
            h_joint,
            total_p_joint,
            "joint polish Hessian shape mismatch",
        ) {
            Ok(matrix) => matrix,
            Err(_) => break,
        };
        add_joint_penalty_to_matrix(
            &mut h_dense,
            &ranges_joint,
            s_lambdas,
            trace_diagonal_ridge,
            joint_bundle,
        );

        let mut beta_joint = Array1::<f64>::zeros(total_p_joint);
        for b in 0..specs.len() {
            let (start, end) = ranges_joint[b];
            beta_joint
                .slice_mut(ndarray::s![start..end])
                .assign(&states[b].beta);
        }
        let penalty_beta = apply_joint_block_penalty(
            &ranges_joint,
            s_lambdas,
            &beta_joint,
            joint_mode_diagonal_ridge,
            joint_bundle,
        );
        let rhs = &grad_full - &penalty_beta;

        // Respect constraints that block line search on the boundary.
        // Gauss-Seidel blockwise leaves the joint KKT residual at a floor
        // around |λ_k S_k β̂| for boundary-active components. The residual
        // magnitude on FREE components is a better measure of whether we
        // should keep polishing: if β_i is clipped at the boundary and
        // KKT multiplier μ_i > 0, then rhs[i] is the multiplier, not a
        // free-space gradient violation.
        let block_constraints_now = collect_block_linear_constraints(family, states, specs)?;
        let joint_constraints_now = assemble_joint_linear_constraints(
            &block_constraints_now,
            &ranges_joint,
            total_p_joint,
        )?;
        let mut active_mask: Vec<bool> = vec![false; total_p_joint];
        if let Some(ref constraints) = joint_constraints_now
            && let Ok(Some(bounds)) = extract_simple_lower_bounds(constraints, total_p_joint)
        {
            for (idx, (bound, beta_val)) in bounds
                .lower_bounds
                .iter()
                .zip(beta_joint.iter())
                .enumerate()
            {
                if *bound > f64::NEG_INFINITY && (*beta_val - *bound).abs() < 1e-12 {
                    active_mask[idx] = true;
                }
            }
        }
        let res_inf_free = rhs
            .iter()
            .zip(active_mask.iter())
            .filter(|(_, active)| !**active)
            .map(|(v, _)| v.abs())
            .fold(0.0_f64, f64::max);
        // Scale-aware residual tolerance — the joint stationarity
        // residual ‖∇ℓ − Sβ‖_∞ scales with |obj| (≈ O(n) at large-scale
        // scale), so the historical absolute `inner_tol = 1e-6` is
        // unachievable here even at the true minimum. Same rationale
        // as the joint-Newton convergence test above.
        let polish_obj = -cached_eval.log_likelihood + *current_penalty;
        let polish_residual_tol = inner_tol * (1.0 + polish_obj.abs());
        if res_inf_free <= polish_residual_tol {
            *converged = true;
            break;
        }

        // Solve constrained Newton system if simple bounds are present,
        // else unconstrained.
        let delta = if let Some(ref constraints) = joint_constraints_now {
            let warm = flatten_joint_active_set(cached_active_sets, &block_constraints_now);
            let lower_bounds_opt = extract_simple_lower_bounds(constraints, total_p_joint)
                .ok()
                .flatten();
            if let Some(bounds) = lower_bounds_opt.as_ref() {
                match solve_quadratic_with_simple_lower_bounds(
                    &h_dense,
                    &rhs,
                    &beta_joint,
                    bounds,
                    warm.as_deref(),
                ) {
                    Ok((beta_new, _active)) => &beta_new - &beta_joint,
                    Err(_) => break,
                }
            } else {
                match solve_quadratic_with_linear_constraints(
                    &h_dense,
                    &rhs,
                    &beta_joint,
                    constraints,
                    warm.as_deref(),
                ) {
                    Ok((beta_new, _active)) => &beta_new - &beta_joint,
                    Err(_) => break,
                }
            }
        } else {
            let solver = crate::linalg::utils::StableSolver::new("joint polish");
            match solver.solvevectorwithridge_retries(&h_dense, &rhs, JOINT_TRACE_STABILITY_RIDGE) {
                Some(d) => d,
                None => break,
            }
        };
        if !delta.iter().all(|v| v.is_finite()) {
            break;
        }
        // Keep polishing until the free-space joint residual is small; a
        // tiny delta alone is not a certificate of stationarity.
        // Damped line search with projection.
        let old_states: Vec<ParameterBlockState> = states.clone();
        let old_obj = -eval_for_polish.log_likelihood + *current_penalty;
        let mut accepted_polish = false;
        for bt in 0..10 {
            let alpha = 0.5f64.powi(bt);
            for b in 0..specs.len() {
                let (start, end) = ranges_joint[b];
                let mut trial_beta = old_states[b].beta.clone();
                trial_beta.scaled_add(alpha, &delta.slice(ndarray::s![start..end]));
                let projected =
                    family.post_update_block_beta(&old_states, b, &specs[b], trial_beta.clone())?;
                reject_constrained_post_update_repair(
                    b,
                    &specs[b],
                    &trial_beta,
                    &projected,
                    block_constraints_now[b].as_ref(),
                )?;
                states[b].beta.assign(&projected);
            }
            refresh_all_block_etas(family, specs, states)?;
            let trial_ll = match family.log_likelihood_only(states) {
                Ok(v) => v,
                Err(_) => {
                    for (b, s) in old_states.iter().enumerate() {
                        states[b] = s.clone();
                    }
                    refresh_all_block_etas(family, specs, states)?;
                    continue;
                }
            };
            let trial_penalty = total_quadratic_penalty(
                states,
                s_lambdas,
                ridge,
                options.ridge_policy,
                joint_bundle,
                Some(specs),
            );
            let trial_obj = -trial_ll + trial_penalty;
            if trial_obj.is_finite() && trial_obj <= old_obj + 1e-12 {
                *current_penalty = trial_penalty;
                *cached_eval = family.evaluate(states)?;
                accepted_polish = true;
                break;
            }
        }
        if !accepted_polish {
            // Restore and stop polishing.
            for (b, s) in old_states.iter().enumerate() {
                states[b] = s.clone();
            }
            refresh_all_block_etas(family, specs, states)?;
            break;
        }
    }
    Ok(())
}

/// Builds the [`BlockwiseInnerResult`] for the blockwise / polish fall-through
/// exit of [`inner_blockwise_fit`].
///
/// Work is performed in this order (each fallible helper propagates via `?`):
/// 1. the quadratic penalty at the final `states`;
/// 2. the block log-determinant pair from `blockwise_logdet_terms` (which
///    receives `states` mutably);
/// 3. the projected KKT residual for the IFT — computed only when `converged`
///    is `true` and `exact_newton_joint_gradient_from_eval` yields a gradient,
///    otherwise `None`;
/// 4. the active-constraint block, wrapped in an `Arc` when present.
///
/// `states`, `s_lambdas`, and `cached_active_sets` are consumed and moved into
/// the returned value; `log_likelihood` is copied from `cached_eval`, and
/// `joint_workspace` is always `None` on this path.
pub(crate) fn assemble_inner_blockwise_result<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    mut states: Vec<ParameterBlockState>,
    block_log_lambdas: &[Array1<f64>],
    options: &BlockwiseFitOptions,
    s_lambdas: Vec<Array2<f64>>,
    ridge: f64,
    joint_bundle: Option<&crate::families::joint_penalty::JointPenaltyBundle>,
    cached_active_sets: Vec<Option<Vec<usize>>>,
    cached_eval: &FamilyEvaluation,
    converged: bool,
    cycles_done: usize,
    last_residual_tol: f64,
) -> Result<BlockwiseInnerResult, String> {
    // The likelihood side comes from `cached_eval`; only the penalty is
    // recomputed here from the final coefficients.
    let penalty_value = total_quadratic_penalty(
        &states,
        &s_lambdas,
        ridge,
        options.ridge_policy,
        joint_bundle,
        Some(specs),
    );

    let (block_logdet_h, block_logdet_s) =
        blockwise_logdet_terms(family, specs, &mut states, block_log_lambdas, options)?;

    // A non-converged inner solve is not a KKT point, so no gradient (and hence
    // no IFT residual) is produced for it.
    let joint_gradient = if converged {
        exact_newton_joint_gradient_from_eval(cached_eval, specs, &states)?
    } else {
        None
    };
    let kkt_residual = match joint_gradient {
        None => None,
        Some(gradient) => {
            let constraints = collect_block_linear_constraints(family, &states, specs)?;
            let coefficient_count: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
            let active_row_count: usize =
                cached_active_sets.iter().flatten().map(Vec::len).sum();
            // Total coefficients minus active constraint rows, floored at zero.
            let free_rank_at_cert = coefficient_count.saturating_sub(active_row_count);
            exact_newton_joint_projected_kkt_residual_for_ift_from_gradient(
                &gradient,
                specs,
                &states,
                &s_lambdas,
                ridge,
                options.ridge_policy,
                &constraints,
                Some(cached_active_sets.as_slice()),
            )?
            .map(|residual| residual.with_metadata(last_residual_tol, free_rank_at_cert))
        }
    };

    let param_ranges = block_param_ranges(specs);
    let param_total = param_ranges.last().map_or(0, |(_, end)| *end);
    let final_constraints = collect_block_linear_constraints(family, &states, specs)?;
    let active_constraints = assemble_active_constraint_block(
        &final_constraints,
        &cached_active_sets,
        &param_ranges,
        param_total,
    )
    .map(std::sync::Arc::new);

    Ok(BlockwiseInnerResult {
        block_states: states,
        active_sets: normalize_active_sets(cached_active_sets),
        log_likelihood: cached_eval.log_likelihood,
        penalty_value,
        cycles: cycles_done,
        converged,
        block_logdet_h,
        block_logdet_s,
        s_lambdas,
        joint_workspace: None,
        kkt_residual,
        active_constraints,
    })
}

/// Borrowed derivative provider for joint models that wraps closures with
/// non-`'static` lifetimes.
///
/// The closures borrow data from the calling stack frame (family, synced states,
/// specs) and therefore carry the non-`'static` lifetime `'a`. This struct holds
/// references to those closures and implements `HessianDerivativeProvider`
/// directly on top of them.
///
/// # Sign convention
///
/// The unified evaluator passes `v_k = H⁻¹(A_k β̂)` to `hessian_derivative_correction`.
/// By the implicit function theorem, `dβ̂/dρ_k = −v_k`. The stored `compute_dh`
/// expects the actual perturbation direction `δβ`, so we negate `v_k` before calling it.
pub(crate) struct BorrowedJointDerivProvider<'a> {
    /// First-order drift callback. The first-order trait methods call it with
    /// `−v_k`; the second-order assembly calls it with `u_kl` unchanged.
    pub(crate) compute_dh: &'a DriftDerivFn<'a>,
    /// Optional batched form of `compute_dh`. When set it receives all negated
    /// directions in one call; when `None` the provider falls back to calling
    /// `compute_dh` once per direction.
    pub(crate) compute_dh_many: Option<&'a DriftDerivManyFn<'a>>,
    /// Mixed second-derivative callback, called with `(−v_l, −v_k)` by the
    /// second-order correction assembly.
    pub(crate) compute_d2h: &'a DriftSecondDerivFn<'a>,
    /// Optional batched second-derivative callback. The unified evaluator's
    /// outer-Hessian ρ-ρ pair loop precomputes all K(K+1)/2 (v_k, v_l, u_kl)
    /// triples and calls this once per outer Hessian assembly when set, so
    /// families that fuse the per-row D²H walk across pairs (e.g. survival
    /// marginal-slope which scans n rows once per outer eval) replace
    /// K(K+1)/2 separate row-walks with one. The default `None` falls back
    /// to the per-pair `compute_d2h` dispatch and preserves the historical
    /// dispatch cost.
    pub(crate) compute_d2h_many: Option<&'a DriftSecondDerivManyFn<'a>>,
    /// Optional family-supplied outer-Hessian operator; the trait accessor of
    /// the same name returns a clone of this handle.
    pub(crate) family_outer_hessian_operator:
        Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>,
}

/// Assembles the two-term second-derivative correction shared by the borrowed
/// and owned joint derivative providers.
///
/// * term1 — `compute_dh(u_kl)`, the drift derivative `D_β H[u_kl]`, evaluated
///   at `u_kl` as given (no negation);
/// * term2 — `compute_d2h(−v_l, −v_k)`, the mixed second derivative
///   `D²_β H[−v_l, −v_k]`.
///
/// Both terms are converted to operators and wrapped in one
/// `CompositeHyperOperator` whose `dim_hint` is `u_kl.len()`.
///
/// Returns `Ok(None)` as soon as either callback yields `None`; in particular
/// `compute_d2h` is not invoked when term1 is absent. Callback errors propagate
/// unchanged.
pub(crate) fn joint_second_derivative_correction_result(
    compute_dh: &dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String>,
    compute_d2h: &dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>,
    v_k: &Array1<f64>,
    v_l: &Array1<f64>,
    u_kl: &Array1<f64>,
) -> Result<Option<DriftDerivResult>, String> {
    let drift_term = match compute_dh(u_kl)? {
        Some(term) => term,
        None => return Ok(None),
    };

    // Mode-response directions: δβ = −v.
    let delta_l = v_l.mapv(|value| -value);
    let delta_k = v_k.mapv(|value| -value);
    let mixed_term = match compute_d2h(&delta_l, &delta_k)? {
        Some(term) => term,
        None => return Ok(None),
    };

    Ok(Some(DriftDerivResult::Operator(Arc::new(
        crate::solver::estimate::reml::unified::CompositeHyperOperator {
            dense: None,
            operators: vec![drift_term.into_operator(), mixed_term.into_operator()],
            dim_hint: u_kl.len(),
        },
    ))))
}

impl HessianDerivativeProvider for BorrowedJointDerivProvider<'_> {
    /// Dense form of `hessian_derivative_correction_result`: the result (if any)
    /// is converted to an operator and materialised with `to_dense()`.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_derivative_correction_result(v_k)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// First-order drift for one direction. `compute_dh` expects the mode
    /// response `δβ = −v_k`, so `v_k` is negated before the call.
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let neg_v = -v_k;
        (self.compute_dh)(&neg_v)
    }

    /// First-order drift for many directions. All directions are negated, then
    /// dispatched to `compute_dh_many` in one call when it is set, or to
    /// `compute_dh` one direction at a time otherwise.
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let neg_vs: Vec<Array1<f64>> = v_ks.iter().map(|v_k| -v_k).collect();
        if let Some(compute_dh_many) = self.compute_dh_many {
            compute_dh_many(&neg_vs)
        } else {
            neg_vs
                .iter()
                .map(|neg_v| (self.compute_dh)(neg_v))
                .collect()
        }
    }

    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.compute_dh_many.is_some()
    }

    /// Dense form of `hessian_second_derivative_correction_result`.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// Second-order correction for a single `(v_k, v_l, u_kl)` triple, delegated
    /// to the shared `(term1, term2)` assembly helper.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        joint_second_derivative_correction_result(self.compute_dh, self.compute_d2h, v_k, v_l, u_kl)
    }

    /// Second-order corrections for a batch of `(v_k, v_l, u_kl)` triples.
    ///
    /// Without a batched D²H callback this maps the single-triple method over
    /// `triples`. With one, term1 is `compute_dh` evaluated at each `u_kl`
    /// (batched through `compute_dh_many` when available) and term2 is the
    /// batched callback evaluated at each `(−v_l, −v_k)`; the two are fused per
    /// triple into the same `CompositeHyperOperator` the single-triple path
    /// produces. A triple yields `None` when either of its terms is `None`.
    ///
    /// # Errors
    ///
    /// Propagates callback errors, and returns an error (rather than panicking
    /// on an out-of-bounds index) when a batched callback returns fewer results
    /// than there are triples.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let Some(compute_d2h_many) = self.compute_d2h_many else {
            return triples
                .iter()
                .map(|(v_k, v_l, u_kl)| {
                    self.hessian_second_derivative_correction_result(v_k, v_l, u_kl)
                })
                .collect();
        };

        // term1: D_β H[u_kl]. `compute_dh` is evaluated at `u_kl` itself, exactly
        // as in the single-triple helper, so the directions are passed through
        // without the negate/negate-back round trip.
        let term1s: Vec<Option<DriftDerivResult>> = match self.compute_dh_many {
            Some(compute_dh_many) => {
                let u_kls: Vec<Array1<f64>> =
                    triples.iter().map(|(_, _, u_kl)| u_kl.clone()).collect();
                compute_dh_many(&u_kls)?
            }
            None => triples
                .iter()
                .map(|(_, _, u_kl)| (self.compute_dh)(u_kl))
                .collect::<Result<Vec<_>, String>>()?,
        };

        // term2: D²_β H[−v_l, −v_k] for every triple in one fused pass.
        let pairs: Vec<(Array1<f64>, Array1<f64>)> =
            triples.iter().map(|(v_k, v_l, _)| (-v_l, -v_k)).collect();
        let term2s = compute_d2h_many(&pairs)?;

        // A short result vector is a callback contract violation; report it
        // instead of indexing past the end. Surplus trailing entries are
        // ignored, as before.
        if term1s.len() < triples.len() || term2s.len() < triples.len() {
            return Err(format!(
                "batched joint second-derivative correction: expected {} results per term, \
                 got {} first-derivative and {} second-derivative results",
                triples.len(),
                term1s.len(),
                term2s.len()
            ));
        }

        // Consume both result vectors so dense results are moved, not cloned.
        Ok(triples
            .iter()
            .zip(term1s)
            .zip(term2s)
            .map(|(((_, _, u_kl), term1), term2)| match (term1, term2) {
                (Some(t1), Some(t2)) => Some(DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![t1.into_operator(), t2.into_operator()],
                        dim_hint: u_kl.len(),
                    },
                ))),
                _ => None,
            })
            .collect())
    }

    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.compute_d2h_many.is_some()
    }

    fn has_corrections(&self) -> bool {
        true
    }

    /// Returns a clone of the stored family outer-Hessian operator handle.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.family_outer_hessian_operator.clone()
    }
}

/// Owned counterpart of `BorrowedJointDerivProvider`: the same set of drift
/// callbacks, held as `Arc`'d `Send + Sync` closures instead of borrowed
/// references.
///
/// Its trait impl uses the same sign convention: first-order methods negate
/// `v_k` before calling `compute_dh`.
pub(crate) struct OwnedJointDerivProvider {
    /// First-order drift callback. Called with `−v_k` by the first-order trait
    /// methods, and exposed as `first` of the `Callback` outer-Hessian kernel.
    pub(crate) compute_dh:
        Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    /// Optional batched form of `compute_dh`; when `None` the provider calls
    /// `compute_dh` once per direction.
    pub(crate) compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    /// Mixed second-derivative callback, called with `(−v_l, −v_k)` by the
    /// second-order assembly and exposed as `second` of the `Callback` kernel.
    pub(crate) compute_d2h: Arc<
        dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
            + Send
            + Sync,
    >,
    /// Optional batched second-derivative callback. See the matching field on
    /// `BorrowedJointDerivProvider` for the dispatch contract.
    pub(crate) compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    /// Optional family-supplied outer-Hessian operator; the trait accessor of
    /// the same name returns a clone of this handle.
    pub(crate) family_outer_hessian_operator:
        Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>,
}

impl HessianDerivativeProvider for OwnedJointDerivProvider {
    /// Dense form of `hessian_derivative_correction_result`: the result (if any)
    /// is converted to an operator and materialised with `to_dense()`.
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_derivative_correction_result(v_k)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// First-order drift for one direction. `compute_dh` expects the mode
    /// response `δβ = −v_k`, so `v_k` is negated before the call.
    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let neg_v = -v_k;
        (self.compute_dh)(&neg_v)
    }

    /// First-order drift for many directions. All directions are negated, then
    /// dispatched to `compute_dh_many` in one call when it is set, or to
    /// `compute_dh` one direction at a time otherwise.
    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let neg_vs: Vec<Array1<f64>> = v_ks.iter().map(|v_k| -v_k).collect();
        if let Some(compute_dh_many) = self.compute_dh_many.as_ref() {
            compute_dh_many(&neg_vs)
        } else {
            neg_vs
                .iter()
                .map(|neg_v| (self.compute_dh)(neg_v))
                .collect()
        }
    }

    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.compute_dh_many.is_some()
    }

    /// Dense form of `hessian_second_derivative_correction_result`.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        Ok(self
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)?
            .map(|result| result.into_operator().to_dense()))
    }

    /// Second-order correction for a single `(v_k, v_l, u_kl)` triple, delegated
    /// to the shared `(term1, term2)` assembly helper.
    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        joint_second_derivative_correction_result(
            &*self.compute_dh,
            &*self.compute_d2h,
            v_k,
            v_l,
            u_kl,
        )
    }

    /// Second-order corrections for a batch of `(v_k, v_l, u_kl)` triples.
    ///
    /// Without a batched D²H callback this maps the single-triple method over
    /// `triples`. With one, term1 is `compute_dh` evaluated at each `u_kl`
    /// (batched through `compute_dh_many` when available) and term2 is the
    /// batched callback evaluated at each `(−v_l, −v_k)`; the two are fused per
    /// triple into the same `CompositeHyperOperator` the single-triple path
    /// produces. A triple yields `None` when either of its terms is `None`.
    ///
    /// # Errors
    ///
    /// Propagates callback errors, and returns an error (rather than panicking
    /// on an out-of-bounds index) when a batched callback returns fewer results
    /// than there are triples.
    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        let Some(compute_d2h_many) = self.compute_d2h_many.as_ref() else {
            return triples
                .iter()
                .map(|(v_k, v_l, u_kl)| {
                    self.hessian_second_derivative_correction_result(v_k, v_l, u_kl)
                })
                .collect();
        };

        // term1: D_β H[u_kl]. `compute_dh` is evaluated at `u_kl` itself, exactly
        // as in the single-triple helper, so the directions are passed through
        // without the negate/negate-back round trip.
        let term1s: Vec<Option<DriftDerivResult>> = match self.compute_dh_many.as_ref() {
            Some(compute_dh_many) => {
                let u_kls: Vec<Array1<f64>> =
                    triples.iter().map(|(_, _, u_kl)| u_kl.clone()).collect();
                compute_dh_many(&u_kls)?
            }
            None => triples
                .iter()
                .map(|(_, _, u_kl)| (self.compute_dh)(u_kl))
                .collect::<Result<Vec<_>, String>>()?,
        };

        // term2: D²_β H[−v_l, −v_k] for every triple in one fused pass.
        let pairs: Vec<(Array1<f64>, Array1<f64>)> =
            triples.iter().map(|(v_k, v_l, _)| (-v_l, -v_k)).collect();
        let term2s = compute_d2h_many(&pairs)?;

        // A short result vector is a callback contract violation; report it
        // instead of indexing past the end. Surplus trailing entries are
        // ignored, as before.
        if term1s.len() < triples.len() || term2s.len() < triples.len() {
            return Err(format!(
                "batched joint second-derivative correction: expected {} results per term, \
                 got {} first-derivative and {} second-derivative results",
                triples.len(),
                term1s.len(),
                term2s.len()
            ));
        }

        // Consume both result vectors so dense results are moved, not cloned.
        Ok(triples
            .iter()
            .zip(term1s)
            .zip(term2s)
            .map(|(((_, _, u_kl), term1), term2)| match (term1, term2) {
                (Some(t1), Some(t2)) => Some(DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: None,
                        operators: vec![t1.into_operator(), t2.into_operator()],
                        dim_hint: u_kl.len(),
                    },
                ))),
                _ => None,
            })
            .collect())
    }

    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.compute_d2h_many.is_some()
    }

    fn has_corrections(&self) -> bool {
        true
    }

    /// Exposes the stored first- and second-derivative closures as a
    /// `Callback` kernel (shared via `Arc::clone`, not copied).
    fn outer_hessian_derivative_kernel(
        &self,
    ) -> Option<crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel> {
        Some(
            crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel::Callback {
                first: Arc::clone(&self.compute_dh),
                second: Arc::clone(&self.compute_d2h),
            },
        )
    }

    /// Returns a clone of the stored family outer-Hessian operator handle.
    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.family_outer_hessian_operator.clone()
    }
}

/// Drift closure producing the Tier-B Jeffreys-curvature drift
/// `D_β H_Φ[δβ]` for a mode-response direction `δβ = dβ̂/dρ_k`.
///
/// The closure already expects the actual perturbation direction `δβ` (NOT the
/// raw `v_k` the trait hands the provider); the wrapper negates `v_k → δβ = −v_k`
/// before calling, exactly mirroring `BorrowedJointDerivProvider`'s sign
/// convention and the inner `compute_dh` it composes with. Returns `None` when
/// the Jeffreys term is gated out or the family lacks the exact derivatives, so
/// the wrapper falls back to the inner provider's drift unchanged.
///
/// The drift is returned as a dense `Array2<f64>`; the closure is shared via
/// `Arc` and must be `Send + Sync`.
pub(crate) type JeffreysHphiDriftFn =
    Arc<dyn Fn(&Array1<f64>) -> Result<Option<Array2<f64>>, String> + Send + Sync>;

/// Jeffreys-`H_Φ`-aware joint derivative provider.
///
/// Wraps an inner Tier-B joint provider (which supplies the likelihood-Hessian
/// drift `D_β H_L[v_k]`) and ADDS the Jeffreys-curvature drift `D_β H_Φ[v_k]` to
/// the first-order trace corrections. This closes the bug where the Tier-B outer
/// LAML gradient omitted `H_Φ`'s ρ-dependence (through β̂): the objective folds
/// `H_Φ` into `½ log|H + S_λ + H_Φ|`, so its exact gradient
///   `½ tr[(H+S_λ+H_Φ)⁻¹ (∂_ρ S_λ + D_β H_L[v_k] + D_β H_Φ[v_k])]`
/// MUST include the `D_β H_Φ[v_k]` term. It is the exact analogue of the Tier-A
/// `FirthAwareGlmDerivatives` (`unified.rs`) `−D(Hφ)[B_k]` first-order term, and
/// of `BarrierDerivativeProvider`'s additive-correction composition pattern.
///
/// SIGN. The trait passes `v_k = H⁻¹(A_kβ̂)`; the mode response is `δβ = −v_k`.
/// We negate before invoking the drift closure, so `corr = + D_β H_Φ[δβ]` is
/// added on top of the inner provider's already-correct likelihood drift.
pub(crate) struct JeffreysHphiAwareJointDerivatives<'a> {
    /// Wrapped provider. First-order results are taken from it and augmented
    /// with the Jeffreys drift; second-order methods and the kernel/operator
    /// accessors are forwarded to it unchanged.
    pub(crate) inner: Box<dyn HessianDerivativeProvider + 'a>,
    /// Jeffreys drift closure, evaluated at `δβ = −v_k`.
    pub(crate) drift: JeffreysHphiDriftFn,
    /// Used as the `dim_hint` of the `CompositeHyperOperator` built when the
    /// inner result is an operator. Presumably the total joint coefficient
    /// dimension — TODO confirm against the construction site.
    pub(crate) p: usize,
}

impl<'a> JeffreysHphiAwareJointDerivatives<'a> {
    pub(crate) fn new(
        inner: Box<dyn HessianDerivativeProvider + 'a>,
        drift: JeffreysHphiDriftFn,
        p: usize,
    ) -> Self {
        Self { inner, drift, p }
    }

    /// `D_β H_Φ[δβ]` with the trait's `v_k → δβ = −v_k` mode-response convention.
    pub(crate) fn hphi_drift(&self, v_k: &Array1<f64>) -> Result<Option<Array2<f64>>, String> {
        let delta = v_k.mapv(|value| -value);
        (self.drift)(&delta)
    }
}

impl HessianDerivativeProvider for JeffreysHphiAwareJointDerivatives<'_> {
    fn hessian_derivative_correction(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        let inner = self.inner.hessian_derivative_correction(v_k)?;
        let drift = self.hphi_drift(v_k)?;
        Ok(match (inner, drift) {
            (Some(mut ic), Some(d)) => {
                ic += &d;
                Some(ic)
            }
            (Some(ic), None) => Some(ic),
            (None, Some(d)) => Some(d),
            (None, None) => None,
        })
    }

    fn hessian_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        let inner = self.inner.hessian_derivative_correction_result(v_k)?;
        let drift = self.hphi_drift(v_k)?;
        Ok(match (inner, drift) {
            (Some(DriftDerivResult::Dense(mut dense)), Some(d)) => {
                dense += &d;
                Some(DriftDerivResult::Dense(dense))
            }
            (Some(DriftDerivResult::Operator(operator)), Some(d)) => {
                Some(DriftDerivResult::Operator(Arc::new(
                    crate::solver::estimate::reml::unified::CompositeHyperOperator {
                        dense: Some(d),
                        operators: vec![operator],
                        dim_hint: self.p,
                    },
                )))
            }
            (Some(other), None) => Some(other),
            (None, Some(d)) => Some(DriftDerivResult::Dense(d)),
            (None, None) => None,
        })
    }

    fn hessian_derivative_corrections_result(
        &self,
        v_ks: &[Array1<f64>],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        // Delegate the (possibly batched) inner walk, then fold the per-direction
        // H_Φ drift into each result so the batched path stays consistent with the
        // singular one.
        let inner = self.inner.hessian_derivative_corrections_result(v_ks)?;
        inner
            .into_iter()
            .zip(v_ks.iter())
            .map(|(inner_result, v_k)| {
                let drift = self.hphi_drift(v_k)?;
                Ok(match (inner_result, drift) {
                    (Some(DriftDerivResult::Dense(mut dense)), Some(d)) => {
                        dense += &d;
                        Some(DriftDerivResult::Dense(dense))
                    }
                    (Some(DriftDerivResult::Operator(operator)), Some(d)) => {
                        Some(DriftDerivResult::Operator(Arc::new(
                            crate::solver::estimate::reml::unified::CompositeHyperOperator {
                                dense: Some(d),
                                operators: vec![operator],
                                dim_hint: self.p,
                            },
                        )))
                    }
                    (Some(other), None) => Some(other),
                    (None, Some(d)) => Some(DriftDerivResult::Dense(d)),
                    (None, None) => None,
                })
            })
            .collect()
    }

    fn has_batched_hessian_derivative_corrections(&self) -> bool {
        self.inner.has_batched_hessian_derivative_corrections()
    }

    // SECOND-ORDER (outer Hessian) RESIDUAL GAP. The full second-order Jeffreys
    // drift `D²_β H_Φ[v_k, v_l]` (the analogue of Tier-A's
    // `−D(Hφ)[B_{kl}] − D²(Hφ)[B_k, B_l]`) is NOT yet folded in here: the
    // second-derivative methods delegate to the inner likelihood drift only. This
    // leaves the OUTER HESSIAN's Jeffreys contribution first-order-incomplete, but
    // the FIRST-ORDER outer GRADIENT — the term the line search and KKT
    // certification actually consume — is now exact. ARC/Newton on the outer
    // problem still gets a consistent gradient; the Hessian is a (PD) curvature
    // surrogate as before.
    fn hessian_second_derivative_correction(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<Array2<f64>>, String> {
        self.inner
            .hessian_second_derivative_correction(v_k, v_l, u_kl)
    }

    fn hessian_second_derivative_correction_result(
        &self,
        v_k: &Array1<f64>,
        v_l: &Array1<f64>,
        u_kl: &Array1<f64>,
    ) -> Result<Option<DriftDerivResult>, String> {
        self.inner
            .hessian_second_derivative_correction_result(v_k, v_l, u_kl)
    }

    fn hessian_second_derivative_corrections_result(
        &self,
        triples: &[(Array1<f64>, Array1<f64>, Array1<f64>)],
    ) -> Result<Vec<Option<DriftDerivResult>>, String> {
        self.inner
            .hessian_second_derivative_corrections_result(triples)
    }

    fn has_batched_hessian_second_derivative_corrections(&self) -> bool {
        self.inner
            .has_batched_hessian_second_derivative_corrections()
    }

    fn has_corrections(&self) -> bool {
        true
    }

    fn outer_hessian_derivative_kernel(
        &self,
    ) -> Option<crate::solver::estimate::reml::unified::OuterHessianDerivativeKernel> {
        // Delegate to the inner provider so the matrix-free outer-HESSIAN route
        // (the `Callback { first, second }` kernel) is preserved. This kernel
        // feeds ONLY the outer Hessian, never the gradient (the gradient's
        // first-order trace flows through `hessian_derivative_correction_result`,
        // which IS wrapped above). The H_Φ SECOND-order drift is the documented
        // residual gap; routing the kernel unchanged keeps the Hessian a
        // consistent PD curvature surrogate without forcing dense assembly.
        self.inner.outer_hessian_derivative_kernel()
    }

    fn family_outer_hessian_operator(
        &self,
    ) -> Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> {
        self.inner.family_outer_hessian_operator()
    }
}

/// Optional bundle of extended (ψ) hyperparameter coordinate data to attach
/// to an `InnerSolution` before calling the unified evaluator.
///
/// Every member can be rescaled together through `ExtCoordBundle::scaled`.
pub(crate) struct ExtCoordBundle {
    /// Per-coordinate first-order data for the extended hyperparameters.
    pub(crate) coords: Vec<HyperCoord>,
    /// Optional per-pair callback producing a `HyperCoordPair` for two indices;
    /// this is the per-pair ψψ assembly referred to by `contracted_psi_fn`.
    pub(crate) ext_ext_fn: Option<Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>>,
    /// Optional per-pair callback producing a `HyperCoordPair` for two indices.
    /// Presumably the mixed ρ–ψ pairs, judging by the name — TODO confirm the
    /// index order against the caller.
    pub(crate) rho_ext_fn: Option<Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>>,
    /// Optional drift callback taking an extended-coordinate index and a
    /// direction vector.
    pub(crate) drift_fn: Option<FixedDriftDerivFn>,
    /// Direction-contracted ψψ second-order hook (#740). When `Some`, the
    /// outer-Hessian operator builder skips the `K²` per-pair ψψ assembly
    /// (`ext_ext_fn`) and applies this once per matvec. `ext_ext_fn` is still
    /// kept as the documented fallback for the dense `compute_outer_hessian`
    /// path and for outer evaluations that do not build the matrix-free
    /// operator.
    pub(crate) contracted_psi_fn: Option<ContractedPsiSecondOrderFn>,
}

/// A `HyperOperator` that multiplies every output of `inner` by the constant
/// `scale`; its dimension is that of `inner`.
pub(crate) struct ScaledHyperOperator {
    /// The operator being scaled.
    pub(crate) inner: Arc<dyn HyperOperator>,
    /// Scalar factor applied to matvec, bilinear-form, and dense outputs.
    pub(crate) scale: f64,
}

impl HyperOperator for ScaledHyperOperator {
    /// Dimension of the wrapped operator (scaling does not change it).
    fn dim(&self) -> usize {
        self.inner.dim()
    }

    /// `scale · (inner · v)`.
    fn mul_vec(&self, v: &Array1<f64>) -> Array1<f64> {
        let factor = self.scale;
        let product = self.inner.mul_vec(v);
        product.mapv_into(|entry| factor * entry)
    }

    /// `scale · inner.bilinear(v, u)`.
    fn bilinear(&self, v: &Array1<f64>, u: &Array1<f64>) -> f64 {
        let unscaled = self.inner.bilinear(v, u);
        self.scale * unscaled
    }

    /// Dense matrix of the wrapped operator with every entry multiplied by
    /// `scale`.
    fn to_dense(&self) -> Array2<f64> {
        let factor = self.scale;
        let matrix = self.inner.to_dense();
        matrix.mapv_into(|entry| factor * entry)
    }

    // NOTE(review): always reports `false`, independent of what the wrapped
    // operator reports — confirm this is intended when `inner` is implicit.
    fn is_implicit(&self) -> bool {
        false
    }
}

/// Multiplies every populated representation of `drift` by `scale`.
///
/// The dense matrix and the block-local matrix are scaled in place; an operator,
/// if present, is wrapped in a `ScaledHyperOperator`. A `scale` of exactly `1.0`
/// returns `drift` untouched (no wrapper is added).
pub(crate) fn scale_hypercoord_drift(mut drift: HyperCoordDrift, scale: f64) -> HyperCoordDrift {
    if scale != 1.0 {
        if let Some(dense) = drift.dense.as_mut() {
            *dense *= scale;
        }
        if let Some(block_local) = drift.block_local.as_mut() {
            block_local.local *= scale;
        }
        if let Some(unscaled) = drift.operator.take() {
            drift.operator = Some(Arc::new(ScaledHyperOperator {
                inner: unscaled,
                scale,
            }));
        }
    }
    drift
}

/// Multiplies a `HyperCoord` by `scale`.
///
/// Scales `g`, the optional `firth_g`, `tk_eta_fixed`, and `tk_x_fixed` members
/// in place, and rescales `drift` through `scale_hypercoord_drift`. A `scale`
/// of exactly `1.0` returns `coord` untouched.
pub(crate) fn scale_hypercoord(mut coord: HyperCoord, scale: f64) -> HyperCoord {
    if scale != 1.0 {
        coord.g *= scale;
        if let Some(firth_g) = &mut coord.firth_g {
            *firth_g *= scale;
        }
        if let Some(tk_eta_fixed) = &mut coord.tk_eta_fixed {
            *tk_eta_fixed *= scale;
        }
        if let Some(tk_x_fixed) = &mut coord.tk_x_fixed {
            *tk_x_fixed *= scale;
        }
        coord.drift = scale_hypercoord_drift(coord.drift, scale);
    }
    coord
}

/// Multiplies a `HyperCoordPair` by `scale`.
///
/// Scales `g` and `b_mat` in place; a boxed `b_operator`, if present, is moved
/// into an `Arc` and wrapped in a boxed `ScaledHyperOperator`. A `scale` of
/// exactly `1.0` returns `pair` untouched.
pub(crate) fn scale_hypercoord_pair(mut pair: HyperCoordPair, scale: f64) -> HyperCoordPair {
    if scale != 1.0 {
        pair.g *= scale;
        pair.b_mat *= scale;
        if let Some(unscaled) = pair.b_operator.take() {
            pair.b_operator = Some(Box::new(ScaledHyperOperator {
                inner: Arc::from(unscaled),
                scale,
            }));
        }
    }
    pair
}

/// Multiplies a `DriftDerivResult` by `scale`.
///
/// A dense result has every entry multiplied by `scale`; an operator result is
/// wrapped in a `ScaledHyperOperator`. A `scale` of exactly `1.0` returns
/// `result` untouched.
pub(crate) fn scale_drift_deriv_result(result: DriftDerivResult, scale: f64) -> DriftDerivResult {
    if scale == 1.0 {
        return result;
    }
    match result {
        DriftDerivResult::Operator(inner) => {
            DriftDerivResult::Operator(Arc::new(ScaledHyperOperator { inner, scale }))
        }
        DriftDerivResult::Dense(dense) => DriftDerivResult::Dense(dense * scale),
    }
}

impl ExtCoordBundle {
    pub(crate) fn scaled(self, scale: f64) -> Self {
        if scale == 1.0 {
            return self;
        }
        let coords = self
            .coords
            .into_iter()
            .map(|coord| scale_hypercoord(coord, scale))
            .collect();
        let ext_ext_fn = self.ext_ext_fn.map(|callback| {
            Box::new(move |i: usize, j: usize| scale_hypercoord_pair(callback(i, j), scale))
                as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
        });
        let rho_ext_fn = self.rho_ext_fn.map(|callback| {
            Box::new(move |i: usize, j: usize| scale_hypercoord_pair(callback(i, j), scale))
                as Box<dyn Fn(usize, usize) -> HyperCoordPair + Send + Sync>
        });
        let drift_fn = self.drift_fn.map(|callback| {
            Box::new(move |ext_idx: usize, direction: &Array1<f64>| {
                callback(ext_idx, direction).map(|result| scale_drift_deriv_result(result, scale))
            }) as FixedDriftDerivFn
        });
        // The contracted ψψ hook is a (scaled) linear functional of the same
        // family curvature `ext_ext_fn` reproduces, so the `rho_curvature_scale`
        // applies term-for-term: objective/score/ld_s by `scale`, and each
        // `hessian[i]` drift via `scale_drift_deriv_result` (matching how
        // `scale_hypercoord_pair` scales the per-pair `b_mat`/`b_operator`).
        let contracted_psi_fn = self.contracted_psi_fn.map(|callback| {
            Arc::new(move |alpha_psi: &[f64]| {
                callback(alpha_psi).map(|opt| {
                    opt.map(|contracted| ContractedPsiSecondOrder {
                        objective: contracted.objective.mapv(|v| scale * v),
                        score: contracted.score.mapv(|v| scale * v),
                        hessian: contracted
                            .hessian
                            .into_iter()
                            .map(|drift| scale_drift_deriv_result(drift, scale))
                            .collect(),
                        ld_s: contracted.ld_s.mapv(|v| scale * v),
                    })
                })
            }) as ContractedPsiSecondOrderFn
        });
        Self {
            coords,
            ext_ext_fn,
            rho_ext_fn,
            drift_fn,
            contracted_psi_fn,
        }
    }
}

/// Assemble the unified REML/LAML `InnerAssembly` for one custom-family outer
/// evaluation.
///
/// The function densifies every block penalty, attaches each dense matrix to
/// the coefficient range of its block, computes the penalty log-determinant
/// derivatives, and packs these together with the converged inner state
/// (`inner`, `beta_flat`, `hessian_op`) and the optional extended-coordinate
/// callbacks from `ext_bundle` into an `InnerAssembly`.
///
/// # Returns
///
/// The assembly plus the number of extended hyper-coordinates supplied by
/// `ext_bundle` (zero when no bundle is given).
///
/// # Errors
///
/// Propagates any error reported by `penalty_coords_from_blocks` or
/// `compute_block_penalty_logdet_derivs`.
///
/// # Panics
///
/// Panics if `ranges` has fewer entries than `specs`, because the range of
/// every block is looked up by index.
pub(crate) fn build_custom_family_inner_assembly<'dp>(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + 'dp>,
    ext_bundle: Option<ExtCoordBundle>,
    firth_value: Option<f64>,
) -> Result<(crate::estimate::reml::assembly::InnerAssembly<'dp>, usize), String> {
    use crate::estimate::reml::assembly::{
        InnerAssembly, PenaltyBlockDesc, penalty_coords_from_blocks,
    };

    // Densify every penalty first. The descriptors and slices built below only
    // borrow these matrices, so the owning vector has to live until the
    // assembler helpers have run.
    let dense_penalties_by_block: Vec<Vec<Array2<f64>>> = {
        use rayon::iter::{IntoParallelIterator, ParallelIterator};
        (0..specs.len())
            .into_par_iter()
            .map(|block_idx| {
                specs[block_idx]
                    .penalties
                    .iter()
                    .map(|penalty| penalty.to_dense())
                    .collect()
            })
            .collect()
    };

    // One descriptor per (block, penalty) pair, tagged with the block's range.
    let mut block_descs: Vec<PenaltyBlockDesc> = Vec::new();
    for (block_idx, dense_list) in dense_penalties_by_block.iter().enumerate() {
        let (range_start, range_end) = ranges[block_idx];
        for matrix in dense_list {
            block_descs.push(PenaltyBlockDesc {
                matrix,
                range_start,
                range_end,
            });
        }
    }
    let penalty_coords = penalty_coords_from_blocks(&block_descs, total)?;

    // Penalty log-determinant derivatives; the ridge only participates when
    // the ridge policy asks for it.
    let per_block_penalties: Vec<&[Array2<f64>]> = dense_penalties_by_block
        .iter()
        .map(Vec::as_slice)
        .collect();
    let logdet_ridge = if options.ridge_policy.include_penalty_logdet {
        ridge
    } else {
        0.0
    };
    let penalty_logdet =
        compute_block_penalty_logdet_derivs(per_block, &per_block_penalties, logdet_ridge)?;

    // Observation count is read off the first block state (0 if there is none).
    let n_observations = inner
        .block_states
        .first()
        .map_or(0, |state| state.eta.len());

    // Split the optional ext-coord bundle into its parts; an absent bundle
    // contributes no coordinates and no callbacks.
    let (ext_coords, ext_coord_pair_fn, rho_ext_pair_fn, fixed_drift_deriv, contracted_psi_fn) =
        match ext_bundle {
            Some(bundle) => (
                bundle.coords,
                bundle.ext_ext_fn,
                bundle.rho_ext_fn,
                bundle.drift_fn,
                bundle.contracted_psi_fn,
            ),
            None => (Vec::new(), None, None, None, None),
        };
    let ext_dim = ext_coords.len();

    let evaluator = InnerAssembly {
        log_likelihood: inner.log_likelihood,
        // `inner.penalty_value` already carries the ½ factor (½ β̂ᵀSβ̂), while
        // the unified evaluator wants the full quadratic β̂ᵀSβ̂ and applies the
        // ½ itself, hence the doubling.
        penalty_quadratic: 2.0 * inner.penalty_value,
        beta: beta_flat.clone(),
        n_observations,
        hessian_op,
        penalty_coords,
        penalty_logdet,
        dispersion: DispersionHandling::Fixed {
            phi: 1.0,
            include_logdet_h,
            include_logdet_s,
        },
        rho_curvature_scale,
        rho_prior,
        hessian_logdet_correction,
        penalty_subspace_trace,
        deriv_provider: Some(deriv_provider),
        tk_correction: 0.0,
        tk_gradient: None,
        // Tier-B Firth fold (gam#979): the inner mode minimizes
        // `−ℓ + ½βᵀSβ − Φ`, so the LAML cost has to subtract the same gated
        // `Φ(β̂)`; otherwise the value and the envelope-based analytic outer
        // gradient would describe different criteria at Firth-active modes.
        firth: firth_value.map(crate::estimate::reml::unified::ExactJeffreysTerm::value_only),
        nullspace_dim: None,
        barrier_config: None,
        ext_coords,
        ext_coord_pair_fn,
        rho_ext_pair_fn,
        fixed_drift_deriv,
        contracted_psi_second_order: contracted_psi_fn,
        kkt_residual: inner.kkt_residual.clone(),
        active_constraints: inner.active_constraints.clone(),
    };

    Ok((evaluator, ext_dim))
}

/// `HessianOperator` wrapper that pairs an inner operator with a countdown of
/// first-order logdet trace calls that are to be short-circuited.
pub(crate) struct FirstOrderTraceSkipOperator {
    /// Wrapped operator that receives every call that is not skipped.
    pub(crate) inner: Arc<dyn HessianOperator>,
    /// Number of first-order trace calls still to be skipped; decremented
    /// atomically by `consume_first_order_trace`.
    pub(crate) remaining_first_order_traces: AtomicUsize,
}

impl FirstOrderTraceSkipOperator {
    /// Wraps `inner` and arms the skip budget with `skip_count` first-order
    /// trace calls.
    pub(crate) fn new(inner: Arc<dyn HessianOperator>, skip_count: usize) -> Self {
        let remaining_first_order_traces = AtomicUsize::new(skip_count);
        Self {
            inner,
            remaining_first_order_traces,
        }
    }

    /// Reports whether at least one first-order trace call is still to be
    /// skipped. This only reads the counter; it never consumes budget.
    pub(crate) fn first_order_skip_active(&self) -> bool {
        self.remaining_first_order_traces.load(Ordering::Acquire) != 0
    }

    /// Atomically takes one unit from the skip budget.
    ///
    /// Returns `true` when a unit was available (the caller should skip its
    /// trace) and `false` once the budget is exhausted; the counter never
    /// goes below zero.
    pub(crate) fn consume_first_order_trace(&self) -> bool {
        self.remaining_first_order_traces
            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |remaining| {
                // `None` at zero aborts the update and leaves the counter as is.
                remaining.checked_sub(1)
            })
            .is_ok()
    }
}

impl HessianOperator for FirstOrderTraceSkipOperator {
    fn logdet(&self) -> f64 {
        self.inner.logdet()
    }

    fn trace_hinv_product(&self, a: &Array2<f64>) -> f64 {
        self.inner.trace_hinv_product(a)
    }

    fn as_exact_dense_spectral(&self) -> Option<&DenseSpectralOperator> {
        if self.first_order_skip_active() {
            None
        } else {
            self.inner.as_exact_dense_spectral()
        }
    }

    fn assemble_h_dense_for_tangent_projection(&self) -> Result<Array2<f64>, String> {
        if self.first_order_skip_active() {
            Err("backend does not support tangent projection".to_string())
        } else {
            self.inner.assemble_h_dense_for_tangent_projection()
        }
    }

    fn trace_hinv_operator(&self, op: &dyn HyperOperator) -> f64 {
        self.inner.trace_hinv_operator(op)
    }

    fn trace_hinv_h_k(
        &self,
        a_k: &Array2<f64>,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        self.inner.trace_hinv_h_k(a_k, third_deriv_correction)
    }

    fn solve(&self, rhs: &Array1<f64>) -> Array1<f64> {
        self.inner.solve(rhs)
    }

    fn solve_multi(&self, rhs: &Array2<f64>) -> Array2<f64> {
        self.inner.solve_multi(rhs)
    }

    fn stochastic_trace_solve(&self, rhs: &Array1<f64>, rel_tol: f64) -> Array1<f64> {
        self.inner.stochastic_trace_solve(rhs, rel_tol)
    }

    fn stochastic_trace_solve_for_probe(
        &self,
        rhs: &Array1<f64>,
        rel_tol: f64,
        probe_id: u64,
        trace_state: Option<&Arc<Mutex<StochasticTraceState>>>,
    ) -> Array1<f64> {
        self.inner
            .stochastic_trace_solve_for_probe(rhs, rel_tol, probe_id, trace_state)
    }

    fn stochastic_trace_solve_multi(&self, rhs: &Array2<f64>, rel_tol: f64) -> Array2<f64> {
        self.inner.stochastic_trace_solve_multi(rhs, rel_tol)
    }

    fn has_matrix_free_trace_cg_operator(&self) -> bool {
        self.inner.has_matrix_free_trace_cg_operator()
    }

    fn trace_hinv_product_cross(&self, a: &Array2<f64>, b: &Array2<f64>) -> f64 {
        self.inner.trace_hinv_product_cross(a, b)
    }

    fn trace_hinv_matrix_operator_cross(
        &self,
        matrix: &Array2<f64>,
        op: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_hinv_matrix_operator_cross(matrix, op)
    }

    fn trace_hinv_operator_cross(
        &self,
        left: &dyn HyperOperator,
        right: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_hinv_operator_cross(left, right)
    }

    fn trace_logdet_gradient(&self, a: &Array2<f64>) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner.trace_logdet_gradient(a)
        }
    }

    fn xt_logdet_kernel_x_diagonal(&self, x: &DesignMatrix) -> Array1<f64> {
        self.inner.xt_logdet_kernel_x_diagonal(x)
    }

    fn trace_logdet_operator(&self, op: &dyn HyperOperator) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner.trace_logdet_operator(op)
        }
    }

    fn trace_logdet_h_k(
        &self,
        a_k: &Array2<f64>,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner.trace_logdet_h_k(a_k, third_deriv_correction)
        }
    }

    fn trace_logdet_h_k_operator(
        &self,
        b_k: &dyn HyperOperator,
        third_deriv_correction: Option<&Array2<f64>>,
    ) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner
                .trace_logdet_h_k_operator(b_k, third_deriv_correction)
        }
    }

    fn trace_logdet_block_local(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        if self.consume_first_order_trace() {
            0.0
        } else {
            self.inner
                .trace_logdet_block_local(block, scale, start, end)
        }
    }

    fn trace_hinv_block_local(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        self.inner.trace_hinv_block_local(block, scale, start, end)
    }

    fn trace_hinv_block_local_cross(
        &self,
        block: &Array2<f64>,
        scale: f64,
        start: usize,
        end: usize,
    ) -> f64 {
        self.inner
            .trace_hinv_block_local_cross(block, scale, start, end)
    }

    fn trace_logdet_hessian_cross(&self, h_i: &Array2<f64>, h_j: &Array2<f64>) -> f64 {
        self.inner.trace_logdet_hessian_cross(h_i, h_j)
    }

    fn trace_logdet_hessian_cross_matrix_operator(
        &self,
        h_i: &Array2<f64>,
        h_j: &dyn HyperOperator,
    ) -> f64 {
        self.inner
            .trace_logdet_hessian_cross_matrix_operator(h_i, h_j)
    }

    fn trace_logdet_hessian_cross_operator(
        &self,
        h_i: &dyn HyperOperator,
        h_j: &dyn HyperOperator,
    ) -> f64 {
        self.inner.trace_logdet_hessian_cross_operator(h_i, h_j)
    }

    fn trace_logdet_hessian_crosses(&self, matrices: &[&Array2<f64>]) -> Array2<f64> {
        self.inner.trace_logdet_hessian_crosses(matrices)
    }

    fn active_rank(&self) -> usize {
        self.inner.active_rank()
    }

    fn dim(&self) -> usize {
        self.inner.dim()
    }

    fn is_dense(&self) -> bool {
        self.inner.is_dense()
    }

    fn prefers_stochastic_trace_estimation(&self) -> bool {
        if self.first_order_skip_active() {
            false
        } else {
            self.inner.prefers_stochastic_trace_estimation()
        }
    }

    fn logdet_traces_match_hinv_kernel(&self) -> bool {
        self.inner.logdet_traces_match_hinv_kernel()
    }

    fn as_dense_spectral(&self) -> Option<&DenseSpectralOperator> {
        if self.first_order_skip_active() {
            None
        } else {
            self.inner.as_dense_spectral()
        }
    }
}

/// Evaluate the unified REML/LAML criterion for a custom-family joint fit.
///
/// This is the bridge from the custom family's joint Hessian machinery to the
/// unified evaluator: it builds the canonical assembly through
/// `build_custom_family_inner_assembly` and evaluates it at `rho` under
/// `eval_mode`.
///
/// When `first_order_trace_skip` carries a non-empty vector, `hessian_op` is
/// wrapped in a `FirstOrderTraceSkipOperator` armed with one skip per entry.
/// Whenever the option is `Some`, half of each supplied trace value is handed
/// to the evaluator as a gradient correction (with a zero cost correction).
///
/// # Returns
///
/// `(cost, gradient, hessian)`. If the evaluator produced no gradient, a zero
/// vector of length `rho.len() + ext_dim` is returned in its place.
///
/// # Errors
///
/// Fails when the assembly cannot be built, when `rho` is not contiguous, or
/// when the evaluator itself reports an error.
pub(crate) fn unified_joint_cost_gradient(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + '_>,
    eval_mode: EvalMode,
    ext_bundle: Option<ExtCoordBundle>,
    first_order_trace_skip: Option<Array1<f64>>,
    // Gated Tier-B Jeffreys value `Φ(β̂)`. It is folded into the LAML cost
    // (`cost −= Φ`) so that the outer criterion agrees with the Φ-augmented
    // inner objective (gam#979); `None` when the term is unavailable or gated
    // to zero.
    firth_value: Option<f64>,
) -> Result<
    (
        f64,
        Array1<f64>,
        crate::solver::outer_strategy::HessianResult,
    ),
    String,
> {
    // Only a non-empty set of precomputed traces arms the skipping wrapper.
    let skip_count = first_order_trace_skip
        .as_ref()
        .map_or(0, |traces| traces.len());
    let hessian_op: Arc<dyn HessianOperator> = if skip_count > 0 {
        Arc::new(FirstOrderTraceSkipOperator::new(hessian_op, skip_count))
    } else {
        hessian_op
    };

    let (evaluator, ext_dim) = build_custom_family_inner_assembly(
        inner,
        specs,
        per_block,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        deriv_provider,
        ext_bundle,
        firth_value,
    )?;

    let rho_slice = rho
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;

    // The supplied traces enter the gradient with a factor ½; the cost
    // correction is zero and no Hessian correction is provided.
    let first_order_trace_correction =
        first_order_trace_skip.map(|traces| (0.0, traces.mapv(|trace| 0.5 * trace), None));

    let outcome = evaluator.evaluate(rho_slice, eval_mode, first_order_trace_correction)?;

    let gradient = match outcome.gradient {
        Some(gradient) => gradient,
        None => Array1::zeros(rho.len() + ext_dim),
    };

    Ok((outcome.cost, gradient, outcome.hessian))
}

/// Evaluate the unified criterion and derive the EFS update for a
/// custom-family joint fit.
///
/// The canonical assembly is built without a Firth value, turned into an
/// inner solution, and evaluated in value-and-gradient mode. If every
/// extended coordinate is penalty-like, the plain EFS update is returned;
/// otherwise the hybrid update is used and its ψ gradient / ψ indices are
/// reported (as `None` when empty).
///
/// # Errors
///
/// Fails when the assembly cannot be built, when `rho` or the resulting
/// gradient is not contiguous, when the evaluation reports an error, or when
/// the evaluation returns no gradient.
pub(crate) fn unified_joint_efs_eval(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator>,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    penalty_subspace_trace: Option<Arc<PenaltySubspaceTrace>>,
    include_logdet_h: bool,
    include_logdet_s: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    deriv_provider: Box<dyn HessianDerivativeProvider + '_>,
    ext_bundle: Option<ExtCoordBundle>,
) -> Result<crate::solver::outer_strategy::EfsEval, String> {
    let (assembly, _) = build_custom_family_inner_assembly(
        inner,
        specs,
        per_block,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        deriv_provider,
        ext_bundle,
        // No Tier-B Firth fold here: the EFS screening path works on the
        // Φ-less criterion with an unaugmented operator throughout, which is
        // self-consistent on its own.
        None,
    )?;
    let rho_slice = rho
        .as_slice()
        .ok_or_else(|| "outer rho vector must be contiguous".to_string())?;
    let inner_solution = assembly.build();
    let all_penalty_like = inner_solution
        .ext_coords
        .iter()
        .all(|coord| coord.is_penalty_like);

    // The gradient is always requested: the universal-form EFS step
    // `Δρ = log(1 − 2·g_full / q_eff)` reads it straight from the cost
    // gradient slot, so out-of-band cost terms (TK, prior, Firth, barrier,
    // SAS log-δ ridge) move the multiplicative target through their gradient
    // contribution and need no per-augmentation post-corrections.
    let result = crate::estimate::reml::assembly::evaluate_solution(
        &inner_solution,
        rho_slice,
        EvalMode::ValueAndGradient,
        None,
    )?;
    let gradient_slice = result
        .gradient
        .as_ref()
        .ok_or_else(|| "EFS evaluation did not return the required gradient".to_string())?
        .as_slice()
        .ok_or_else(|| "outer gradient must be contiguous for EFS".to_string())?;

    // Shared by both result shapes below.
    let inner_hessian_scale = crate::estimate::reml::unified::hessian_operator_geometric_scale(
        inner_solution.hessian_op.as_ref(),
    );
    let beta = Some(inner_solution.beta.clone());

    if all_penalty_like {
        // No ψ coordinates: plain EFS update, no ψ outputs.
        return Ok(crate::solver::outer_strategy::EfsEval {
            cost: result.cost,
            steps: crate::estimate::reml::unified::compute_efs_update(
                &inner_solution,
                rho_slice,
                gradient_slice,
            ),
            beta,
            psi_gradient: None,
            psi_indices: None,
            inner_hessian_scale,
            logdet_enclosure_gap: None,
        });
    }

    let hybrid = crate::estimate::reml::unified::compute_hybrid_efs_update(
        &inner_solution,
        rho_slice,
        gradient_slice,
    );
    // Empty ψ outputs are reported as absent rather than as empty containers.
    let psi_gradient = Some(hybrid.psi_gradient)
        .filter(|values| !values.is_empty())
        .map(Array1::from_vec);
    let psi_indices = Some(hybrid.psi_indices).filter(|indices| !indices.is_empty());

    Ok(crate::solver::outer_strategy::EfsEval {
        cost: result.cost,
        steps: hybrid.steps,
        beta,
        psi_gradient,
        psi_indices,
        inner_hessian_scale,
        logdet_enclosure_gap: None,
    })
}

/// Shared implementation for the joint exact-Newton and surrogate outer paths.
///
/// Both paths differ only in:
/// - how the joint Hessian source is obtained (exact vs surrogate family methods)
/// - the closure for computing D_β H_L[v] (`compute_dh`)
/// - the closure for computing D²_β H_L[u, v] (`compute_d2h`)
/// - whether a tangent-basis projection is applied to the mode inverse
///
/// This function encapsulates all shared logic: penalty assembly, mode inverse
/// computation, precomputation of joint corrections + second-order traces, and
/// routing through `unified_joint_cost_gradient`.
pub(crate) fn joint_outer_evaluate(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    h_joint_unpen: JointHessianSource,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    moderidge: f64,
    extra_logdet_ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    include_logdet_h: bool,
    include_logdet_s: bool,
    strict_spd: bool,
    project_hessian_logdet: bool,
    eval_mode: EvalMode,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    pseudo_logdet_mode: PseudoLogdetMode,
    compute_dh: &DriftDerivFn<'_>,
    compute_dh_many: Option<&DriftDerivManyFn<'_>>,
    compute_d2h: &DriftSecondDerivFn<'_>,
    compute_d2h_many: Option<&DriftSecondDerivManyFn<'_>>,
    owned_compute_dh: Option<
        Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    >,
    owned_compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    owned_compute_d2h: Option<
        Arc<
            dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
                + Send
                + Sync,
        >,
    >,
    owned_compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    ext_bundle: Option<ExtCoordBundle>,
    first_order_trace_skip: Option<Array1<f64>>,
    batched_outer_hessian_operator: Option<
        Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
    >,
    // Universal under-identification robustness (always armed when the family can
    // expose an exact joint Hessian). The
    // outer REML logdet AND its trace derivatives must run on the same
    // Jeffreys-augmented Hessian `H + S_λ + H_Φ` the inner Newton converged on,
    // or the LAML value and its analytic gradient describe different objectives.
    // Folding `H_Φ` into the operator's matvec augments the inverse/logdet, but is
    // NOT by itself sufficient: `H_Φ` depends on ρ THROUGH β̂, so the trace
    // contraction also needs its mode-response drift `D_β H_Φ[v_k]` — supplied
    // separately via `jeffreys_hphi_drift` and folded into the first-order trace
    // by `JeffreysHphiAwareJointDerivatives`. `None` means this evaluation has
    // no active Jeffreys curvature (empty system, unavailable exact derivatives,
    // or the conditioning gate proved the term zero), not a user-selected
    // robustness-off mode.
    // Gated Jeffreys VALUE `Φ(β̂)` paired with the divided-difference curvature
    // `H_Φ` and its (optional) second-order completion, all from the same term
    // evaluation. The value is folded into the LAML cost (`cost −= Φ`) so the
    // outer criterion is the Laplace approximation of the SAME Firth-augmented
    // objective the inner Newton converged on; the completion is folded into
    // the mode-response OPERATOR only (see
    // `custom_family_outer_jeffreys_hphi` for the chain-rule split) (gam#979).
    robust_jeffreys_phi_hphi: Option<(f64, Array2<f64>, Option<Array2<f64>>)>,
    // Companion mode-response drift `D_β H_Φ[δβ]` for the outer gradient's trace
    // identity. `Some` exactly when `robust_jeffreys_phi_hphi` is `Some` (same
    // under-identified span); installing it wraps the derivative provider so the
    // first-order trace gains the `½ tr[(H+S_λ+H_Φ)⁻¹ D_β H_Φ[v_k]]` term that
    // makes the analytic gradient match the augmented objective. `None` ⇒ the
    // provider is used unwrapped.
    jeffreys_hphi_drift: Option<JeffreysHphiDriftFn>,
) -> Result<OuterObjectiveEvalResult, String> {
    let joint_trace_diagonal_ridge = moderidge + if !strict_spd { extra_logdet_ridge } else { 0.0 };
    let scaled_joint_trace_diagonal_ridge = rho_curvature_scale * joint_trace_diagonal_ridge;

    let (robust_jeffreys_phi, robust_jeffreys_hphi, robust_jeffreys_completion): (
        Option<f64>,
        Option<Array2<f64>>,
        Option<Array2<f64>>,
    ) = match robust_jeffreys_phi_hphi {
        Some((phi, hphi, completion)) => (Some(phi), Some(hphi), completion),
        None => (None, None, None),
    };
    // Mode-response operator curvature: the divided-difference `H_Φ` PLUS its
    // second-order completion when available — the TRUE Hessian of the
    // Φ-augmented inner objective, which is what `v_k = ∂β̂/∂ρ_k` solves
    // against. The logdet VALUE and its trace kernel keep the bare `H_Φ`
    // (value↔drift consistency); see `custom_family_outer_jeffreys_hphi`.
    // Folded ONLY when the projected kernel will own the value and the
    // first-order traces (the same precondition as the kernel install below);
    // on the unprojected route the operator IS the value/trace object and
    // must stay on the divided-difference pair.
    let completion_in_operator = project_hessian_logdet
        && include_logdet_h
        && include_logdet_s
        && pseudo_logdet_mode == PseudoLogdetMode::Smooth;
    let robust_jeffreys_hphi_for_operator: Option<Array2<f64>> = match (
        robust_jeffreys_hphi.as_ref(),
        robust_jeffreys_completion.filter(|_| completion_in_operator),
    ) {
        (Some(hphi), Some(completion)) => Some(hphi + &completion),
        (Some(hphi), None) => Some(hphi.clone()),
        (None, _) => None,
    };
    // Pre-scale the outer-REML Jeffreys curvature into the same rescaled space as
    // the penalties so the projected-logdet path and the operator agree. `None`
    // (flag OFF / no under-identified span) keeps the released outer REML exact.
    let scaled_robust_jeffreys_hphi: Option<Array2<f64>> = robust_jeffreys_hphi
        .as_ref()
        .map(|hphi| hphi.mapv(|value| rho_curvature_scale * value));

    // Build derivative provider from the caller-supplied closures.
    let base_provider_box: Box<dyn HessianDerivativeProvider + '_> =
        if let (Some(owned_dh), Some(owned_d2h)) = (owned_compute_dh, owned_compute_d2h) {
            Box::new(OwnedJointDerivProvider {
                compute_dh: owned_dh,
                compute_dh_many: owned_compute_dh_many,
                compute_d2h: owned_d2h,
                compute_d2h_many: owned_compute_d2h_many,
                family_outer_hessian_operator: batched_outer_hessian_operator.clone(),
            })
        } else {
            Box::new(BorrowedJointDerivProvider {
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                family_outer_hessian_operator: batched_outer_hessian_operator.clone(),
            })
        };

    // Install the Jeffreys-`H_Φ` mode-response drift on top of the likelihood
    // drift whenever the Jeffreys term is active. This is the term that makes the
    // analytic outer gradient match the augmented objective `½ log|H+S_λ+H_Φ|`;
    // without it the gradient omits `D_β H_Φ[v_k]` and the line search / KKT
    // certification drifts in exactly the near-separating regime this machinery
    // exists for. `None` ⇒ provider used unwrapped (byte-identical released path).
    let provider_box: Box<dyn HessianDerivativeProvider + '_> = match jeffreys_hphi_drift {
        Some(drift) => Box::new(JeffreysHphiAwareJointDerivatives::new(
            base_provider_box,
            drift,
            total,
        )),
        None => base_provider_box,
    };

    let scaled_s_lambdas: Vec<Array2<f64>> = inner
        .s_lambdas
        .iter()
        .map(|matrix| {
            if rho_curvature_scale == 1.0 {
                matrix.clone()
            } else {
                matrix.mapv(|value| rho_curvature_scale * value)
            }
        })
        .collect();

    let hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator> =
        if use_joint_matrix_free_path(total, joint_observation_count(&inner.block_states)) {
            let ranges_vec = ranges.to_vec();
            let s_lambdas = Arc::new(scaled_s_lambdas.clone());
            let trace_diagonal_ridge = scaled_joint_trace_diagonal_ridge
                + rho_curvature_scale * JOINT_TRACE_STABILITY_RIDGE;
            match &h_joint_unpen {
                JointHessianSource::Dense(h_joint) => {
                    let h_joint = Arc::new(h_joint.clone());
                    let apply_h = Arc::clone(&h_joint);
                    let apply_ranges = ranges_vec.clone();
                    let apply_s = Arc::clone(&s_lambdas);
                    let apply_hphi = robust_jeffreys_hphi_for_operator.clone();
                    let hphi_scale = rho_curvature_scale;
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut out = apply_h.dot(v);
                            let penalty = apply_joint_block_penalty(
                                &apply_ranges,
                                apply_s.as_ref(),
                                v,
                                trace_diagonal_ridge,
                                None,
                            );
                            out += &penalty;
                            if let Some(hphi) = apply_hphi.as_ref() {
                                let jeffreys = hphi.dot(v);
                                out.scaled_add(hphi_scale, &jeffreys);
                            }
                            out
                        },
                        pseudo_logdet_mode,
                    ))
                }
                JointHessianSource::Operator { apply, .. } => {
                    let apply_h = Arc::clone(apply);
                    let apply_ranges = ranges_vec.clone();
                    let apply_s = Arc::clone(&s_lambdas);
                    let apply_hphi = robust_jeffreys_hphi_for_operator.clone();
                    let hphi_scale = rho_curvature_scale;
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut out = match apply_h(v) {
                                Ok(out) => out,
                                Err(error) => {
                                    log::warn!(
                                        "joint exact-newton operator matvec failed during outer trace construction: {error}"
                                    );
                                    Array1::<f64>::from_elem(total, f64::NAN)
                                }
                            };
                            let penalty = apply_joint_block_penalty(
                                &apply_ranges,
                                apply_s.as_ref(),
                                v,
                                trace_diagonal_ridge,
                                None,
                            );
                            out += &penalty;
                            if let Some(hphi) = apply_hphi.as_ref() {
                                let jeffreys = hphi.dot(v);
                                out.scaled_add(hphi_scale, &jeffreys);
                            }
                            out
                        },
                        pseudo_logdet_mode,
                    ))
                }
            }
        } else {
            let mut j_for_traces = materialize_joint_hessian_source(
                &h_joint_unpen,
                total,
                "joint exact-newton Hessian materialization",
            )?;
            add_joint_penalty_to_matrix(
                &mut j_for_traces,
                ranges,
                &scaled_s_lambdas,
                scaled_joint_trace_diagonal_ridge,
                None,
            );
            if let Some(hphi) = robust_jeffreys_hphi_for_operator.as_ref() {
                j_for_traces.scaled_add(rho_curvature_scale, hphi);
            }
            Arc::new(
                BlockCoupledOperator::from_joint_hessian_with_mode(
                    &j_for_traces,
                    pseudo_logdet_mode,
                )
                .map_err(|e| format!("BlockCoupledOperator from joint Hessian: {e}"))?,
            )
        };

    let (projected_logdet_correction, penalty_subspace_trace) = if project_hessian_logdet
        && include_logdet_h
        && include_logdet_s
        && pseudo_logdet_mode == PseudoLogdetMode::Smooth
    {
        let (projected_logdet, kernel) = joint_penalty_subspace_trace_parts(
            &h_joint_unpen,
            ranges,
            &scaled_s_lambdas,
            total,
            scaled_joint_trace_diagonal_ridge,
            scaled_robust_jeffreys_hphi.as_ref(),
        )?;
        let correction = projected_logdet - hessian_op.logdet();
        if kernel.is_some() {
            log::debug!(
                "[OUTER hessian-route] joint penalty subspace trace installed correction={:.6e}",
                correction
            );
        }
        (correction, kernel.map(Arc::new))
    } else {
        (0.0, None)
    };
    let hessian_logdet_correction = hessian_logdet_correction + projected_logdet_correction;

    let expected_theta_dim = rho.len()
        + ext_bundle
            .as_ref()
            .map(|bundle| bundle.coords.len())
            .unwrap_or(0);
    let has_penalty_subspace_trace = penalty_subspace_trace.is_some();

    // Option C: when the caller already has the batched first-order
    // logdet traces, let the unified VGH path keep all mode-response,
    // second-order, and Hessian work, but short-circuit only the
    // soon-discarded first-order trace calls. The projected-subspace
    // trace path is left untouched because the Hessian shares that
    // kernel and it is not routed through HessianOperator trace methods.
    // Bind the gating flag before `penalty_subspace_trace` is consumed by
    // the call below so the trace-skip choice does not depend on a moved
    // value (was: `if penalty_subspace_trace.is_none()` evaluated AFTER
    // the trace had already been forwarded to `unified_joint_cost_gradient`).
    let first_order_trace_skip = if penalty_subspace_trace.is_none() {
        first_order_trace_skip
    } else {
        None
    };
    let (objective, grad, outer_hessian) = unified_joint_cost_gradient(
        inner,
        specs,
        per_block,
        rho,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        provider_box,
        eval_mode,
        ext_bundle.map(|bundle| bundle.scaled(rho_curvature_scale)),
        // Option C: when the caller already has the batched first-order
        // logdet traces, let the unified VGH path keep all mode-response,
        // second-order, and Hessian work, but short-circuit only the
        // soon-discarded first-order trace calls. The projected-subspace
        // trace path is left untouched because the Hessian shares that
        // kernel and it is not routed through HessianOperator trace methods.
        if has_penalty_subspace_trace {
            None
        } else {
            first_order_trace_skip
        },
        robust_jeffreys_phi,
    )?;
    if !objective.is_finite() {
        log::warn!(
            "joint outer evaluation produced non-finite objective: log_likelihood={} penalty_value={} block_logdet_h={} block_logdet_s={} include_logdet_h={} include_logdet_s={} rho_curvature_scale={}",
            inner.log_likelihood,
            inner.penalty_value,
            inner.block_logdet_h,
            inner.block_logdet_s,
            include_logdet_h,
            include_logdet_s,
            rho_curvature_scale,
        );
        return Err(CustomFamilyError::NumericalFailure {
            reason: "joint outer evaluation produced a non-finite objective".to_string(),
        }
        .into());
    }
    if grad.iter().any(|value| !value.is_finite()) {
        return Err(CustomFamilyError::NumericalFailure {
            reason: "joint outer evaluation produced a non-finite gradient".to_string(),
        }
        .into());
    }
    if grad.len() != expected_theta_dim {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "joint outer evaluation returned gradient length {}, expected {}",
                grad.len(),
                expected_theta_dim
            ),
        }
        .into());
    }
    match &outer_hessian {
        crate::solver::outer_strategy::HessianResult::Analytic(hessian) => {
            if hessian.iter().any(|value| !value.is_finite()) {
                return Err(CustomFamilyError::NumericalFailure {
                    reason: "joint outer evaluation produced a non-finite Hessian".to_string(),
                }
                .into());
            }
            if hessian.nrows() != expected_theta_dim || hessian.ncols() != expected_theta_dim {
                return Err(CustomFamilyError::DimensionMismatch {
                    reason: format!(
                        "joint outer evaluation returned Hessian shape {}x{}, expected {}x{}",
                        hessian.nrows(),
                        hessian.ncols(),
                        expected_theta_dim,
                        expected_theta_dim
                    ),
                }
                .into());
            }
        }
        crate::solver::outer_strategy::HessianResult::Operator(op) => {
            if op.dim() != expected_theta_dim {
                return Err(format!(
                    "joint outer evaluation returned operator Hessian dim {}, expected {}",
                    op.dim(),
                    expected_theta_dim
                ));
            }
        }
        crate::solver::outer_strategy::HessianResult::Unavailable => {}
    }

    let warm = ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta: inner
            .block_states
            .iter()
            .map(|st| st.beta.clone())
            .collect(),
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(inner)),
    };

    Ok(OuterObjectiveEvalResult {
        objective,
        gradient: grad,
        outer_hessian,
        warm_start: warm,
        inner_converged: inner.converged,
    })
}

pub(crate) fn joint_outer_evaluate_efs(
    inner: &BlockwiseInnerResult,
    specs: &[ParameterBlockSpec],
    per_block: &[Array1<f64>],
    rho: &Array1<f64>,
    beta_flat: &Array1<f64>,
    h_joint_unpen: JointHessianSource,
    ranges: &[(usize, usize)],
    total: usize,
    ridge: f64,
    moderidge: f64,
    extra_logdet_ridge: f64,
    rho_curvature_scale: f64,
    hessian_logdet_correction: f64,
    include_logdet_h: bool,
    include_logdet_s: bool,
    strict_spd: bool,
    project_hessian_logdet: bool,
    options: &BlockwiseFitOptions,
    rho_prior: crate::types::RhoPrior,
    pseudo_logdet_mode: PseudoLogdetMode,
    compute_dh: &DriftDerivFn<'_>,
    compute_dh_many: Option<&DriftDerivManyFn<'_>>,
    compute_d2h: &DriftSecondDerivFn<'_>,
    compute_d2h_many: Option<&DriftSecondDerivManyFn<'_>>,
    owned_compute_dh: Option<
        Arc<dyn Fn(&Array1<f64>) -> Result<Option<DriftDerivResult>, String> + Send + Sync>,
    >,
    owned_compute_dh_many: Option<
        Arc<dyn Fn(&[Array1<f64>]) -> Result<Vec<Option<DriftDerivResult>>, String> + Send + Sync>,
    >,
    owned_compute_d2h: Option<
        Arc<
            dyn Fn(&Array1<f64>, &Array1<f64>) -> Result<Option<DriftDerivResult>, String>
                + Send
                + Sync,
        >,
    >,
    owned_compute_d2h_many: Option<
        Arc<
            dyn Fn(&[(Array1<f64>, Array1<f64>)]) -> Result<Vec<Option<DriftDerivResult>>, String>
                + Send
                + Sync,
        >,
    >,
    ext_bundle: Option<ExtCoordBundle>,
) -> Result<crate::solver::outer_strategy::EfsEval, String> {
    let joint_trace_diagonal_ridge = moderidge + if !strict_spd { extra_logdet_ridge } else { 0.0 };
    let scaled_joint_trace_diagonal_ridge = rho_curvature_scale * joint_trace_diagonal_ridge;

    let provider_box: Box<dyn HessianDerivativeProvider + '_> =
        if let (Some(owned_dh), Some(owned_d2h)) = (owned_compute_dh, owned_compute_d2h) {
            Box::new(OwnedJointDerivProvider {
                compute_dh: owned_dh,
                compute_dh_many: owned_compute_dh_many,
                compute_d2h: owned_d2h,
                compute_d2h_many: owned_compute_d2h_many,
                family_outer_hessian_operator: None,
            })
        } else {
            Box::new(BorrowedJointDerivProvider {
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                family_outer_hessian_operator: None,
            })
        };

    let scaled_s_lambdas: Vec<Array2<f64>> = inner
        .s_lambdas
        .iter()
        .map(|matrix| {
            if rho_curvature_scale == 1.0 {
                matrix.clone()
            } else {
                matrix.mapv(|value| rho_curvature_scale * value)
            }
        })
        .collect();

    let hessian_op: Arc<dyn crate::solver::estimate::reml::unified::HessianOperator> =
        if use_joint_matrix_free_path(total, joint_observation_count(&inner.block_states)) {
            let ranges_vec = ranges.to_vec();
            let s_lambdas = Arc::new(scaled_s_lambdas.clone());
            let trace_diagonal_ridge = scaled_joint_trace_diagonal_ridge
                + rho_curvature_scale * JOINT_TRACE_STABILITY_RIDGE;
            match &h_joint_unpen {
                JointHessianSource::Dense(h_joint) => {
                    let h_joint = Arc::new(h_joint.clone());
                    let apply_h = Arc::clone(&h_joint);
                    let apply_ranges = ranges_vec.clone();
                    let apply_s = Arc::clone(&s_lambdas);
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut out = apply_h.dot(v);
                            let penalty = apply_joint_block_penalty(
                                &apply_ranges,
                                apply_s.as_ref(),
                                v,
                                trace_diagonal_ridge,
                                None,
                            );
                            out += &penalty;
                            out
                        },
                        pseudo_logdet_mode,
                    ))
                }
                JointHessianSource::Operator { apply, .. } => {
                    let apply_h = Arc::clone(apply);
                    let apply_ranges = ranges_vec.clone();
                    let apply_s = Arc::clone(&s_lambdas);
                    Arc::new(MatrixFreeSpdOperator::new_with_mode(
                        total,
                        move |v| {
                            let mut out = match apply_h(v) {
                                Ok(out) => out,
                                Err(error) => {
                                    log::warn!(
                                        "joint exact-newton operator matvec failed during fixed-point trace construction: {error}"
                                    );
                                    Array1::<f64>::from_elem(total, f64::NAN)
                                }
                            };
                            let penalty = apply_joint_block_penalty(
                                &apply_ranges,
                                apply_s.as_ref(),
                                v,
                                trace_diagonal_ridge,
                                None,
                            );
                            out += &penalty;
                            out
                        },
                        pseudo_logdet_mode,
                    ))
                }
            }
        } else {
            let mut j_for_traces = materialize_joint_hessian_source(
                &h_joint_unpen,
                total,
                "joint exact-newton Hessian materialization for fixed-point evaluation",
            )?;
            add_joint_penalty_to_matrix(
                &mut j_for_traces,
                ranges,
                &scaled_s_lambdas,
                scaled_joint_trace_diagonal_ridge,
                None,
            );
            Arc::new(
                BlockCoupledOperator::from_joint_hessian_with_mode(
                    &j_for_traces,
                    pseudo_logdet_mode,
                )
                .map_err(|e| format!("BlockCoupledOperator from joint Hessian: {e}"))?,
            )
        };

    let (projected_logdet_correction, penalty_subspace_trace) = if project_hessian_logdet
        && include_logdet_h
        && include_logdet_s
        && pseudo_logdet_mode == PseudoLogdetMode::Smooth
    {
        let (projected_logdet, kernel) = joint_penalty_subspace_trace_parts(
            &h_joint_unpen,
            ranges,
            &scaled_s_lambdas,
            total,
            scaled_joint_trace_diagonal_ridge,
            None,
        )?;
        let correction = projected_logdet - hessian_op.logdet();
        if kernel.is_some() {
            log::debug!(
                "[OUTER hessian-route] joint EFS penalty subspace trace installed correction={:.6e}",
                correction
            );
        }
        (correction, kernel.map(Arc::new))
    } else {
        (0.0, None)
    };
    let hessian_logdet_correction = hessian_logdet_correction + projected_logdet_correction;

    unified_joint_efs_eval(
        inner,
        specs,
        per_block,
        rho,
        beta_flat,
        hessian_op,
        ranges,
        total,
        ridge,
        rho_curvature_scale,
        hessian_logdet_correction,
        penalty_subspace_trace,
        include_logdet_h,
        include_logdet_s,
        options,
        rho_prior,
        provider_box,
        ext_bundle.map(|bundle| bundle.scaled(rho_curvature_scale)),
    )
}

/// Rho-only entry point for the custom-family outer objective.
///
/// Forwards to `evaluate_custom_family_hyper_internal` with one empty
/// ψ-derivative list per entry of `specs`, so no external ψ coordinates are
/// attached, and converts the callee's error into a `String`.
pub(crate) fn outerobjectivegradienthessian_internal<
    F: CustomFamily + Clone + Send + Sync + 'static,
>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
    eval_mode: EvalMode,
) -> Result<OuterObjectiveEvalResult, String> {
    // One empty derivative list per block: the hyper surface is rho-only here.
    let no_psi_derivatives: Vec<Vec<CustomFamilyBlockPsiDerivative>> =
        specs.iter().map(|_| Vec::new()).collect();
    let evaluation = evaluate_custom_family_hyper_internal(
        family,
        specs,
        options,
        penalty_counts,
        rho,
        &no_psi_derivatives,
        warm_start,
        rho_prior,
        eval_mode,
    );
    evaluation.map_err(String::from)
}

/// Evaluate the fixed-point (EFS) outer update for a custom family at `rho`.
///
/// Splits `rho` into per-block log-lambdas using `penalty_counts`, runs
/// `inner_blockwise_fit` (optionally from `warm_start`), and then assembles
/// the EFS evaluation through `joint_outer_evaluate_efs`:
///
/// * when `build_joint_hessian_closures` yields a bundle, its Hessian source,
///   coefficient vector, derivative callbacks, curvature scale and logdet
///   correction are forwarded as-is;
/// * otherwise a single-block fallback builds the unpenalized Hessian and the
///   `dH` / `d2H` callbacks locally from the block's working set.
///
/// Returns the EFS evaluation, a warm start carrying `rho`, the per-block
/// coefficients, the active sets and the cached inner mode, and the inner
/// convergence flag.
///
/// # Errors
/// Returns an error when the fallback is needed but the family requires a
/// joint outer path or has more than one block, when Hessian / derivative
/// shapes do not match the block, when a required derivative callback is
/// missing, or when any called helper fails. A non-converged inner solve is
/// not assembled here; the result of `nonconverged_outer_efs_result` is
/// returned instead.
pub(crate) fn outerobjectiveefs<F: CustomFamily + Clone + Send + Sync + 'static>(
    family: &F,
    specs: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
    penalty_counts: &[usize],
    rho: &Array1<f64>,
    warm_start: Option<&ConstrainedWarmStart>,
    rho_prior: crate::types::RhoPrior,
) -> Result<
    (
        crate::solver::outer_strategy::EfsEval,
        ConstrainedWarmStart,
        bool,
    ),
    String,
> {
    // Which logdet terms participate, and whether strict SPD handling applies,
    // are decided by the family/options helpers.
    let include_logdet_h = include_exact_newton_logdet_h(family, options);
    let include_logdet_s = include_exact_newton_logdet_s(family, options);
    let strict_spd = use_exact_newton_strict_spd(family);
    let per_block = split_log_lambdas(rho, penalty_counts)?;
    let mut inner = inner_blockwise_fit(family, specs, &per_block, options, warm_start)?;
    // A non-converged inner solve short-circuits: no derivative assembly is
    // attempted and the dedicated non-converged result is returned.
    if !inner.converged {
        log::warn!(
            "[OUTER] custom-family EFS inner solve did not converge after {} cycle(s); \
             skipping EFS derivative assembly for theta_dim={}",
            inner.cycles,
            rho.len(),
        );
        return nonconverged_outer_efs_result(
            &inner,
            rho,
            rho.len(),
            include_logdet_h,
            include_logdet_s,
            "custom-family EFS non-converged inner solve",
        );
    }
    let ridge = effective_solverridge(options.ridge_floor);
    // `moderidge` is the solver ridge only when the ridge policy includes it
    // in the quadratic penalty.
    let moderidge = if options.ridge_policy.include_quadratic_penalty {
        ridge
    } else {
        0.0
    };
    // `extra_logdet_ridge` is the solver ridge only when the policy includes
    // it in the penalty logdet but NOT in the quadratic penalty.
    let extra_logdet_ridge = if options.ridge_policy.include_penalty_logdet
        && !options.ridge_policy.include_quadratic_penalty
    {
        ridge
    } else {
        0.0
    };

    // Refresh the block etas on the fitted states before building any
    // Hessian pieces from them.
    refresh_all_block_etas(family, specs, &mut inner.block_states)?;
    let ranges = block_param_ranges(specs);
    // Total coefficient count = end of the last block range (0 with no blocks).
    let total = ranges.last().map(|(_, end)| *end).unwrap_or(0);

    let efs_eval = {
        // Preferred route: the family-provided joint Hessian bundle.
        if let Some(joint_bundle) = build_joint_hessian_closures(
            family,
            &inner.block_states,
            specs,
            total,
            options,
            inner.joint_workspace.clone(),
        )? {
            let JointHessianBundle {
                source: h_joint_unpen,
                beta_flat,
                compute_dh,
                compute_dh_many,
                compute_d2h,
                compute_d2h_many,
                owned_compute_dh,
                owned_compute_dh_many,
                owned_compute_d2h,
                owned_compute_d2h_many,
                rho_curvature_scale,
                hessian_logdet_correction,
            } = joint_bundle;
            joint_outer_evaluate_efs(
                &inner,
                specs,
                &per_block,
                rho,
                &beta_flat,
                h_joint_unpen,
                &ranges,
                total,
                ridge,
                moderidge,
                extra_logdet_ridge,
                rho_curvature_scale,
                hessian_logdet_correction,
                include_logdet_h,
                include_logdet_s,
                strict_spd,
                family.use_projected_penalty_logdet(),
                options,
                rho_prior.clone(),
                family.pseudo_logdet_mode(),
                compute_dh.as_ref(),
                compute_dh_many.as_deref(),
                compute_d2h.as_ref(),
                compute_d2h_many.as_deref(),
                owned_compute_dh,
                owned_compute_dh_many,
                owned_compute_d2h,
                owned_compute_d2h_many,
                // No external coordinate bundle on this route.
                None,
            )
        } else {
            // Generic fallback: only valid for a single block, and only when
            // the family does not insist on the joint outer hyper path.
            if family.requires_joint_outer_hyper_path() {
                return Err(
                        "outer hyper fixed-point evaluation requires a joint exact path for this family"
                            .to_string(),
                    );
            }
            if specs.len() != 1 {
                return Err(
                        "generic fixed-point outer fallback is only valid for single-block families; multi-block families must provide a joint outer path"
                            .to_string(),
                    );
            }

            let eval = family.evaluate(&inner.block_states)?;
            let block_idx = 0;
            let spec = &specs[block_idx];
            let work = &eval.blockworking_sets[block_idx];
            let p = spec.design.ncols();
            // Filled in by the Diagonal branch below so the derivative
            // callbacks can reuse the design seen by `with_block_geometry`.
            let mut diagonal_design = None::<DesignMatrix>;
            // Unpenalized block Hessian: weighted normal equations for a
            // diagonal working set, or the family's dense exact Hessian.
            let h_joint_unpen = match work {
                BlockWorkingSet::Diagonal {
                    working_response: _,
                    working_weights,
                } => with_block_geometry(
                    family,
                    &inner.block_states,
                    spec,
                    block_idx,
                    |x_dyn, _| {
                        let w = floor_positiveworking_weights(working_weights, options.minweight);
                        let (xtwx, _) = weighted_normal_equations(x_dyn, &w, None)?;
                        diagonal_design = Some(x_dyn.clone());
                        Ok(xtwx)
                    },
                )?,
                BlockWorkingSet::ExactNewton {
                    gradient: _,
                    hessian,
                } => {
                    // The exact Hessian must be p x p for this block.
                    if hessian.nrows() != p || hessian.ncols() != p {
                        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                            "block {block_idx} exact-newton Hessian shape mismatch in fixed-point outer evaluation: got {}x{}, expected {}x{}",
                            hessian.nrows(),
                            hessian.ncols(),
                            p,
                            p
                        ) }.into());
                    }
                    hessian.to_dense()
                }
            };
            let beta_flat = inner.block_states[block_idx].beta.clone();
            // First-order Hessian drift along `direction`. Reports `None`
            // when the log|H| term is excluded from the objective.
            let compute_dh = |direction: &Array1<f64>| -> Result<Option<DriftDerivResult>, String> {
                if !include_logdet_h {
                    return Ok(None);
                }
                match work {
                    BlockWorkingSet::ExactNewton { .. } => {
                        // Exact-Newton blocks must supply the directional
                        // derivative themselves; a missing callback is an error.
                        match family.exact_newton_hessian_directional_derivative(
                            &inner.block_states,
                            block_idx,
                            direction,
                        )? {
                            Some(h_exact) => {
                                Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                                    h_exact,
                                    p,
                                    &format!(
                                        "block {block_idx} exact-newton dH shape mismatch in fixed-point outer evaluation"
                                    ),
                                )?)))
                            }
                            None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                "missing exact-newton dH callback for block {block_idx} while fixed-point evaluation requires H_beta term"
                            ) }.into()),
                        }
                    }
                    BlockWorkingSet::Diagonal {
                        working_response: _,
                        working_weights,
                    } => {
                        let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
                                    format!(
                                        "missing dynamic design for block {block_idx} diagonal fixed-point correction"
                                    )
                                })?;
                        let wwork =
                            floor_positiveworking_weights(working_weights, options.minweight);
                        let x_dense = x_dyn.to_dense();
                        let n = x_dense.nrows();

                        // Linear-predictor drift: design times direction, plus
                        // any geometry offset / design drift added below.
                        let mut d_eta = x_dyn.matrixvectormultiply(direction);
                        let geom = family.block_geometry_directional_derivative(
                            &inner.block_states,
                            block_idx,
                            spec,
                            direction,
                        )?;
                        let mut correction_mat = Array2::<f64>::zeros((p, p));

                        if let Some(geom_dir) = geom {
                            d_eta += &geom_dir.d_offset;
                            if let Some(dx) = geom_dir.d_design {
                                d_eta += &fast_av(&dx, &beta_flat);
                                // Row-scale copies of X and dX by the floored
                                // working weights (unit weights are skipped).
                                let mut wx = x_dense.clone();
                                let mut wdx = dx.clone();
                                ndarray::Zip::from(wx.rows_mut())
                                    .and(wdx.rows_mut())
                                    .and(wwork.view())
                                    .par_for_each(|mut wxr, mut wdxr, &wi| {
                                        if wi != 1.0 {
                                            wxr.mapv_inplace(|v| v * wi);
                                            wdxr.mapv_inplace(|v| v * wi);
                                        }
                                    });
                                // NOTE(review): assuming `fast_atb(a, b)` is
                                // aᵀ·b, these are the two design-drift cross
                                // terms dXᵀ(W X) + Xᵀ(W dX) — confirm.
                                correction_mat += &fast_atb(&dx, &wx);
                                correction_mat += &fast_atb(&x_dense, &wdx);
                            }
                        }

                        // Working-weight drift along d_eta; the family must
                        // provide it for the diagonal route.
                        let dw = family
                                    .diagonalworking_weights_directional_derivative(
                                        &inner.block_states,
                                        block_idx,
                                        &d_eta,
                                    )?
                                    .ok_or_else(|| {
                                        format!(
                                            "missing diagonal dW callback for block {block_idx} while fixed-point evaluation requires H_beta term"
                                        )
                                    })?;
                        if dw.len() != n {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "block {block_idx} diagonal dW length mismatch in fixed-point outer evaluation: got {}, expected {}",
                                dw.len(),
                                n
                            ) }.into());
                        }
                        // Row-scale X by dw, then add the weight-drift term
                        // (presumably Xᵀ diag(dw) X — see the note above).
                        let mut scaled_x = x_dense.clone();
                        ndarray::Zip::from(scaled_x.rows_mut())
                            .and(&dw)
                            .par_for_each(|mut sr, &dwi| sr.mapv_inplace(|v| v * dwi));
                        correction_mat += &fast_atb(&x_dense, &scaled_x);

                        Ok(Some(DriftDerivResult::Dense(correction_mat)))
                    }
                }
            };
            // Second-order Hessian drift along the pair (`u`, `v`). Reports
            // `None` when the log|H| term is excluded from the objective.
            let compute_d2h = |u: &Array1<f64>,
                               v: &Array1<f64>|
             -> Result<Option<DriftDerivResult>, String> {
                if !include_logdet_h {
                    return Ok(None);
                }
                match work {
                    BlockWorkingSet::ExactNewton { .. } => {
                        match family.exact_newton_hessian_second_directional_derivative(
                            &inner.block_states,
                            block_idx,
                            u,
                            v,
                        )? {
                            Some(h_exact) => {
                                Ok(Some(DriftDerivResult::Dense(symmetrized_square_matrix(
                                    h_exact,
                                    p,
                                    &format!(
                                        "block {block_idx} exact-newton d2H shape mismatch in fixed-point outer evaluation"
                                    ),
                                )?)))
                            }
                            None => Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                "missing exact-newton d2H callback for block {block_idx} while fixed-point evaluation requires H_beta_beta term"
                            ) }.into()),
                        }
                    }
                    BlockWorkingSet::Diagonal { .. } => {
                        let x_dyn = diagonal_design.as_ref().ok_or_else(|| {
                            format!(
                                "missing dynamic design for block {block_idx} diagonal fixed-point second correction"
                            )
                        })?;
                        let x_dense = x_dyn.to_dense();
                        let n = x_dense.nrows();
                        // The diagonal second-order route has no geometry
                        // terms: any design drift or non-zero offset drift in
                        // either direction is rejected as unsupported.
                        let reject_second_order_geometry =
                            |label: &str,
                             geom: Option<BlockGeometryDirectionalDerivative>|
                             -> Result<(), String> {
                                if let Some(geom_dir) = geom {
                                    let has_offset =
                                        geom_dir.d_offset.iter().any(|value| *value != 0.0);
                                    if geom_dir.d_design.is_some() || has_offset {
                                        return Err(CustomFamilyError::UnsupportedConfiguration { reason: format!(
                                            "block {block_idx} diagonal d2H requires second-order block-geometry derivatives for {label}; use an exact-newton or joint outer path"
                                        ) }.into());
                                    }
                                }
                                Ok(())
                            };
                        reject_second_order_geometry(
                            "first direction",
                            family.block_geometry_directional_derivative(
                                &inner.block_states,
                                block_idx,
                                spec,
                                u,
                            )?,
                        )?;
                        reject_second_order_geometry(
                            "second direction",
                            family.block_geometry_directional_derivative(
                                &inner.block_states,
                                block_idx,
                                spec,
                                v,
                            )?,
                        )?;
                        // Linear-predictor drifts for each direction feed the
                        // family's second working-weight derivative.
                        let d_eta_u = x_dyn.matrixvectormultiply(u);
                        let d_eta_v = x_dyn.matrixvectormultiply(v);
                        let d2w = family
                            .diagonalworking_weights_second_directional_derivative(
                                &inner.block_states,
                                block_idx,
                                &d_eta_u,
                                &d_eta_v,
                            )?
                            .ok_or_else(|| {
                                format!(
                                    "missing diagonal d2W callback for block {block_idx} while fixed-point evaluation requires H_beta_beta term"
                                )
                            })?;
                        if d2w.len() != n {
                            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                                "block {block_idx} diagonal d2W length mismatch in fixed-point outer evaluation: got {}, expected {}",
                                d2w.len(),
                                n
                            ) }.into());
                        }
                        // Row-scale X by d2w and form the product with X
                        // (presumably Xᵀ diag(d2w) X — confirm `fast_atb`).
                        let mut scaled_x = x_dense.clone();
                        ndarray::Zip::from(scaled_x.rows_mut())
                            .and(&d2w)
                            .par_for_each(|mut sr, &d2wi| sr.mapv_inplace(|value| value * d2wi));
                        Ok(Some(DriftDerivResult::Dense(fast_atb(&x_dense, &scaled_x))))
                    }
                }
            };
            joint_outer_evaluate_efs(
                &inner,
                specs,
                &per_block,
                rho,
                &beta_flat,
                JointHessianSource::Dense(h_joint_unpen),
                &ranges,
                total,
                ridge,
                moderidge,
                extra_logdet_ridge,
                // rho_curvature_scale: the fallback uses unit scaling.
                1.0,
                // hessian_logdet_correction: none on the fallback route.
                0.0,
                include_logdet_h,
                include_logdet_s,
                strict_spd,
                family.use_projected_penalty_logdet(),
                options,
                rho_prior.clone(),
                family.pseudo_logdet_mode(),
                &compute_dh,
                // compute_dh_many
                None,
                &compute_d2h,
                // compute_d2h_many, the four owned_* callbacks, and ext_bundle
                None,
                None,
                None,
                None,
                None,
                None,
            )
        }
    }?;

    // Capture the fitted state so the next outer evaluation can warm-start.
    let warm = ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta: inner
            .block_states
            .iter()
            .map(|state| state.beta.clone())
            .collect(),
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(&inner)),
    };

    Ok((efs_eval, warm, inner.converged))
}

/// Strip the custom-family "invalid input" wrappers from an outer-evaluation
/// error message and return the remaining detail.
///
/// A `String` that round-trips through `CustomFamilyError::From<String>` is
/// re-wrapped as `InvalidInput` with the "custom-family string boundary"
/// context, so its `Display` output gains a boundary prefix. That prefix is
/// removed first (at most once), then the historical bare
/// `custom-family invalid input: ` prefix (at most once), so summaries such as
/// `last objective error: …` show the inner reason once rather than the
/// doubly-wrapped form. Messages carrying neither prefix are returned as-is.
pub(crate) fn normalize_outer_eval_error_detail(error: &str) -> &str {
    // Order matters: the boundary wrapper sits outside the bare form.
    const WRAPPER_PREFIXES: [&str; 2] = [
        "custom-family invalid input in custom-family string boundary: ",
        "custom-family invalid input: ",
    ];
    let mut detail = error;
    for prefix in WRAPPER_PREFIXES.iter() {
        if let Some(rest) = detail.strip_prefix(*prefix) {
            detail = rest;
        }
    }
    detail
}

// ═══════════════════════════════════════════════════════════════════════════
//  Section: joint outer hyper surface — unified calculus for [rho, psi]
// ═══════════════════════════════════════════════════════════════════════════
//
// The callers have already applied the current spatial coordinates `psi` when
// constructing `family`, `specs`, and `derivative_blocks`, so the explicit
// input into the section below is still only the smoothing vector
// `rho_current`. Mathematically, however, the surface being differentiated
// is the full joint profiled/Laplace objective in
//
//     theta = [rho, psi].
//
// The exact outer calculus is unified across all hypercoordinates:
//
//     J(theta)
//     = V(beta^(theta), theta)
//       + 0.5 log|H(beta^(theta), theta)|
//       - 0.5 log|S(theta)|_+,
//
// with stationarity and joint curvature
//
//     F(beta, theta) := V_beta(beta, theta) = 0,
//     H(beta, theta) := V_beta_beta(beta, theta).
//
// For each theta_i we need the fixed-beta objects
//
//     V_i, g_i := F_i, H_i,
//
// and for each pair (i, j)
//
//     V_ij, g_ij, H_ij,
//
// together with the beta-curvature contractions
//
//     D_beta H[u], D_beta^2 H[u, v], T_i[u] := D_beta H_i[u].
//
// These determine the exact joint mode responses
//
//     beta_i  = -H^{-1} g_i,
//     beta_ij = -H^{-1}(g_ij + H_i beta_j + H_j beta_i + D_beta H[beta_i] beta_j),
//
// and the total Hessian drifts
//
//     dot H_i
//     = H_i + D_beta H[beta_i],
//
//     ddot H_ij
//     = H_ij
//       + T_i[beta_j]
//       + T_j[beta_i]
//       + D_beta H[beta_ij]
//       + D_beta^2 H[beta_i, beta_j].
//
// Therefore the exact joint outer derivatives are
//
//     J_i
//     = V_i
//       + 0.5 tr(H^{-1} dot H_i)
//       - 0.5 partial_i log|S(theta)|_+,
//
//     J_ij
//     = (V_ij - g_i^T H^{-1} g_j)
//       + 0.5 [ tr(H^{-1} ddot H_ij)
//               - tr(H^{-1} dot H_j H^{-1} dot H_i) ]
//       - 0.5 partial^2_{ij} log|S(theta)|_+.
//
// In this unified view rho and psi differ only in the likelihood-side
// fixed-beta derivative objects contributed by the family. The generic exact
// assembler always adds realized penalty motion through `S(theta)` for every
// hypercoordinate:
//
// - `rho` coordinates usually have zero likelihood-side objects and pick up
//   their fixed-beta derivatives entirely from `S_rho` / `S_{rho rho}`
// - `psi` coordinates contribute likelihood-side objects from the family's
//   joint exact psi hooks and may also pick up extra penalty terms through
//   `S_psi`, `S_{rho psi}`, and `S_{psi psi}` when realized penalties move
//   with `psi`
//
// The implementation below follows this unified calculus directly. Once a
// family supplies the joint fixed-beta psi objects and the mixed
// `D_beta H_psi[u]` contraction, exact joint hyper evaluation treats `rho`
// and `psi` identically and returns the full profiled/Laplace Hessian over
// `theta = [rho, psi]`.
//
// ═══════════════════════════════════════════════════════════════════════════
//  Unified HyperCoord builders for ψ coordinates
// ═══════════════════════════════════════════════════════════════════════════

/// Build the block-local (`p_block × p_block`) penalty derivative
/// `S_ψ = Σ_k exp(ρ_k) ∂S_k/∂ψ` for one ψ coordinate.
///
/// The derivative record is consulted in a fixed priority order, and only the
/// first source that is present contributes:
///
/// 1. `s_psi_penalty_components` — every `(penalty_idx, part)` pair is
///    accumulated through `part.add_scaled_to(exp(ρ[penalty_idx]), …)`;
/// 2. `s_psi_components` — every `(penalty_idx, part)` pair is accumulated
///    with `scaled_add`;
/// 3. `penalty_index` — the stored `s_psi` is scaled elementwise by that
///    penalty's `exp(ρ)`;
/// 4. none of the above — an all-zero matrix (the ψ coordinate moves no
///    realized penalty).
///
/// `per_block_rho` is indexed by the penalty indices carried in `deriv`; an
/// out-of-range index panics.
pub(crate) fn assemble_block_local_s_psi(
    deriv: &CustomFamilyBlockPsiDerivative,
    per_block_rho: &Array1<f64>,
    p_block: usize,
) -> Array2<f64> {
    // Source 1: component list that knows how to add itself into a target.
    if let Some(penalty_parts) = &deriv.s_psi_penalty_components {
        let mut acc = Array2::<f64>::zeros((p_block, p_block));
        for (penalty_idx, s_part) in penalty_parts {
            let lambda = per_block_rho[*penalty_idx].exp();
            s_part.add_scaled_to(lambda, &mut acc);
        }
        return acc;
    }

    // Source 2: component list accumulated with `scaled_add`.
    if let Some(matrix_parts) = &deriv.s_psi_components {
        let mut acc = Array2::<f64>::zeros((p_block, p_block));
        for (penalty_idx, s_part) in matrix_parts {
            let lambda = per_block_rho[*penalty_idx].exp();
            acc.scaled_add(lambda, s_part);
        }
        return acc;
    }

    // Source 3 / fallback: a single scaled matrix, or nothing at all.
    match deriv.penalty_index {
        Some(penalty_idx) => deriv.s_psi.mapv(|v| per_block_rho[penalty_idx].exp() * v),
        None => Array2::<f64>::zeros((p_block, p_block)),
    }
}

/// Assemble the second penalty derivative matrix S_{ψ_i ψ_j} in block-local
/// coefficient space.
///
/// This mirrors the psi/psi branch of `joint_theta_penaltysecond_matrix` but
/// returns the block-local matrix directly instead of embedding it into the
/// full flattened coefficient space.
pub(crate) fn assemble_block_local_s_psi_psi(
    deriv_i: &CustomFamilyBlockPsiDerivative,
    local_j: usize,
    per_block_rho: &Array1<f64>,
    p_block: usize,
) -> Array2<f64> {
    if let Some(ref parts) = deriv_i.s_psi_psi_penalty_components {
        let mut s = Array2::<f64>::zeros((p_block, p_block));
        if let Some(pair_parts) = parts.get(local_j) {
            for (penalty_idx, s_part) in pair_parts {
                s_part.add_scaled_to(per_block_rho[*penalty_idx].exp(), &mut s);
            }
        }
        return s;
    }
    if let Some(ref parts) = deriv_i.s_psi_psi_components {
        let mut s = Array2::<f64>::zeros((p_block, p_block));
        if let Some(pair_parts) = parts.get(local_j) {
            for (penalty_idx, s_part) in pair_parts {
                s.scaled_add(per_block_rho[*penalty_idx].exp(), s_part);
            }
        }
        s
    } else if let Some(ref parts) = deriv_i.s_psi_psi {
        if let Some(s_part) = parts.get(local_j) {
            if let Some(penalty_index) = deriv_i.penalty_index {
                s_part.mapv(|v| per_block_rho[penalty_index].exp() * v)
            } else {
                Array2::<f64>::zeros((p_block, p_block))
            }
        } else {
            Array2::<f64>::zeros((p_block, p_block))
        }
    } else {
        Array2::<f64>::zeros((p_block, p_block))
    }
}

/// Outcome of one inner blockwise solve, as returned by
/// `inner_blockwise_fit`.
///
/// `Debug` is implemented by hand (see the `impl` below) because
/// `joint_workspace` holds a trait object.
#[derive(Clone)]
pub struct BlockwiseInnerResult {
    /// Per-block solver state. The `beta` of every entry is what warm-start
    /// carriers copy out (see `constrained_warm_start_from_inner`).
    pub block_states: Vec<ParameterBlockState>,
    /// One optional index list per block.
    /// NOTE(review): presumably the active-constraint indices of each block —
    /// confirm against the inner solver.
    pub active_sets: Vec<Option<Vec<usize>>>,
    /// Log-likelihood at the returned iterate; fed to
    /// `checked_penalizedobjective` by `inner_penalized_objective`.
    pub log_likelihood: f64,
    /// Penalty value at the returned iterate; fed to
    /// `checked_penalizedobjective` by `inner_penalized_objective`.
    pub penalty_value: f64,
    /// NOTE(review): presumably the number of inner cycles executed — confirm.
    pub cycles: usize,
    /// Convergence flag reported by the inner solve.
    pub converged: bool,
    /// Enters the outer objective as `+0.5 * block_logdet_h` when the
    /// log|H| term is requested (see `inner_penalized_objective`).
    pub block_logdet_h: f64,
    /// Enters the outer objective as `-0.5 * block_logdet_s` when the
    /// log|S| term is requested (see `inner_penalized_objective`).
    pub block_logdet_s: f64,
    /// Cached assembled penalty matrices S(ρ) = Σ_k exp(ρ_k) S_k per block.
    /// Avoids redundant re-assembly in the outer objective evaluation.
    pub s_lambdas: Vec<Array2<f64>>,
    /// Optional shared joint-Hessian workspace; cloned by handle into
    /// `CachedInnerMode` so it can ride along with a warm start.
    pub joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    /// Projected KKT residual at the converged inner iterate, propagated to
    /// the unified evaluator's `InnerAssembly::kkt_residual` for the
    /// outer REML/LAML scoring path. `None` when the solver path doesn't
    /// produce a typed KKT diagnostic (blockwise NR fallback, eager-stop).
    pub kkt_residual: Option<crate::estimate::reml::unified::ProjectedKktResidual>,
    /// Active linear-inequality constraint rows at the converged inner
    /// iterate. When `Some`, the unified evaluator builds the
    /// constraint-aware kernel `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`
    /// for per-coordinate mode responses `v_k = ∂β/∂ρ_k`.
    pub active_constraints:
        Option<Arc<crate::estimate::reml::unified::ActiveLinearConstraintBlock>>,
}

// Hand-written `Debug`: `joint_workspace` is a trait object, so the impl
// cannot be derived. Fields whose payload is not printed are rendered as a
// presence marker (`Some("<…>")` / `None`) so that every field of the struct
// is visible in the output and a reader can tell whether it was populated.
// This deliberately places no `Debug` requirement on the KKT-residual or
// active-constraint payload types.
impl std::fmt::Debug for BlockwiseInnerResult {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("BlockwiseInnerResult")
            .field("block_states", &self.block_states)
            .field("active_sets", &self.active_sets)
            .field("log_likelihood", &self.log_likelihood)
            .field("penalty_value", &self.penalty_value)
            .field("cycles", &self.cycles)
            .field("converged", &self.converged)
            .field("block_logdet_h", &self.block_logdet_h)
            .field("block_logdet_s", &self.block_logdet_s)
            .field("s_lambdas", &self.s_lambdas)
            .field(
                "joint_workspace",
                &self.joint_workspace.as_ref().map(|_| "<workspace>"),
            )
            // Previously omitted entirely, which made a populated KKT
            // diagnostic indistinguishable from an absent one in logs.
            .field(
                "kkt_residual",
                &self.kkt_residual.as_ref().map(|_| "<kkt_residual>"),
            )
            .field(
                "active_constraints",
                &self
                    .active_constraints
                    .as_ref()
                    .map(|_| "<active_constraints>"),
            )
            .finish()
    }
}

/// Warm-start carrier handed from one outer evaluation to the next.
///
/// Built either from a finished inner solve
/// (`constrained_warm_start_from_inner`) or from a flat cached coefficient
/// vector (`constrained_warm_start_from_cached_beta`).
#[derive(Clone)]
pub(crate) struct ConstrainedWarmStart {
    /// Smoothing vector the seed was recorded at. Screening helpers compare
    /// its length (`screened_outer_warm_start`) or its exact values
    /// (`warm_start_matches_block_log_lambdas`) against the current request.
    /// A seed built from a cached beta carries zeros here.
    pub(crate) rho: Array1<f64>,
    /// One coefficient vector per block, in block order.
    pub(crate) block_beta: Vec<Array1<f64>>,
    /// One optional index list per block, copied from the inner result;
    /// all `None` for a seed built from a cached beta.
    pub(crate) active_sets: Vec<Option<Vec<usize>>>,
    /// Snapshot of the inner solve's summary, when one is available. Cleared
    /// by `warm_start_without_cached_inner_for_psi_derivatives`.
    pub(crate) cached_inner: Option<CachedInnerMode>,
}

/// Summary of a finished inner solve that travels inside a
/// `ConstrainedWarmStart`.
///
/// Every field is a copy (or `Arc`/`Option` clone) of the same-named field of
/// `BlockwiseInnerResult`, produced by `cached_inner_mode_from_result`. The
/// per-block states, active sets and assembled penalties are not stored here.
#[derive(Clone)]
pub(crate) struct CachedInnerMode {
    /// Copied from `BlockwiseInnerResult::log_likelihood`.
    pub(crate) log_likelihood: f64,
    /// Copied from `BlockwiseInnerResult::penalty_value`.
    pub(crate) penalty_value: f64,
    /// Copied from `BlockwiseInnerResult::cycles`.
    pub(crate) cycles: usize,
    /// Copied from `BlockwiseInnerResult::converged`.
    pub(crate) converged: bool,
    /// Copied from `BlockwiseInnerResult::block_logdet_h`.
    pub(crate) block_logdet_h: f64,
    /// Copied from `BlockwiseInnerResult::block_logdet_s`.
    pub(crate) block_logdet_s: f64,
    /// Shared handle to the joint-Hessian workspace, if the inner solve had one.
    pub(crate) joint_workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    /// Cloned from `BlockwiseInnerResult::kkt_residual`.
    pub(crate) kkt_residual: Option<crate::estimate::reml::unified::ProjectedKktResidual>,
    /// Shared handle cloned from `BlockwiseInnerResult::active_constraints`.
    pub(crate) active_constraints:
        Option<Arc<crate::estimate::reml::unified::ActiveLinearConstraintBlock>>,
}

/// Keep a warm start only when its stored `rho` has the same length as the
/// `rho` being evaluated.
///
/// Only the dimension is compared, never the values. Returns `None` when no
/// seed was supplied or when the lengths differ.
pub(crate) fn screened_outer_warm_start<'a>(
    warm_start: Option<&'a ConstrainedWarmStart>,
    rho: &Array1<f64>,
) -> Option<&'a ConstrainedWarmStart> {
    match warm_start {
        Some(seed) if seed.rho.len() == rho.len() => Some(seed),
        _ => None,
    }
}

/// Report whether `seed.rho` is exactly the concatenation of the per-block
/// log-lambda vectors, in block order.
///
/// The comparison is elementwise float equality (no tolerance), so a NaN on
/// either side makes the seed a non-match. A total-length mismatch is
/// rejected before any values are compared.
pub(crate) fn warm_start_matches_block_log_lambdas(
    seed: &ConstrainedWarmStart,
    block_log_lambdas: &[Array1<f64>],
) -> bool {
    let flattened_len: usize = block_log_lambdas.iter().map(|block| block.len()).sum();
    if seed.rho.len() != flattened_len {
        return false;
    }

    // Walk the blocks while tracking where each one starts inside `seed.rho`;
    // `all` stops at the first block that differs.
    let mut start = 0usize;
    block_log_lambdas.iter().all(|block| {
        let stop = start + block.len();
        let same = seed.rho.slice(s![start..stop]) == block.view();
        start = stop;
        same
    })
}

pub(crate) fn cached_inner_mode_from_result(result: &BlockwiseInnerResult) -> CachedInnerMode {
    CachedInnerMode {
        log_likelihood: result.log_likelihood,
        penalty_value: result.penalty_value,
        cycles: result.cycles,
        converged: result.converged,
        block_logdet_h: result.block_logdet_h,
        block_logdet_s: result.block_logdet_s,
        joint_workspace: result.joint_workspace.clone(),
        kkt_residual: result.kkt_residual.clone(),
        active_constraints: result.active_constraints.clone(),
    }
}

/// Package a finished inner solve as a warm start for the given `rho`.
///
/// The seed owns copies of `rho`, every block's `beta`, the active sets, and
/// a `CachedInnerMode` snapshot of the inner summary.
pub(crate) fn constrained_warm_start_from_inner(
    rho: &Array1<f64>,
    inner: &BlockwiseInnerResult,
) -> ConstrainedWarmStart {
    let mut block_beta = Vec::with_capacity(inner.block_states.len());
    for state in &inner.block_states {
        block_beta.push(state.beta.clone());
    }

    ConstrainedWarmStart {
        rho: rho.clone(),
        block_beta,
        active_sets: inner.active_sets.clone(),
        cached_inner: Some(cached_inner_mode_from_result(inner)),
    }
}

/// Build a warm start from a flat, previously cached coefficient vector.
///
/// `beta` is cut into consecutive per-block pieces whose widths are the
/// column counts of each spec's design, in spec order. The resulting seed has
/// a zero `rho` of length `rho_dim`, no active sets, and no cached inner
/// summary.
///
/// # Errors
///
/// Fails when `beta.len()` differs from the total design width across
/// `specs`, or when the shared non-finite check on `beta` rejects it.
pub(crate) fn constrained_warm_start_from_cached_beta(
    rho_dim: usize,
    specs: &[ParameterBlockSpec],
    beta: &Array1<f64>,
) -> Result<ConstrainedWarmStart, EstimationError> {
    let required: usize = specs.iter().map(|spec| spec.design.ncols()).sum();
    if beta.len() != required {
        crate::bail_invalid_estim!(
            "cached inner beta has length {}, but custom-family blocks require length {}",
            beta.len(),
            required
        );
    }
    crate::families::marginal_slope_shared::bail_if_cached_beta_non_finite(beta)?;

    // Carve the flat vector into owned per-block slices, advancing a cursor
    // by each block's width.
    let mut cursor = 0usize;
    let block_beta: Vec<Array1<f64>> = specs
        .iter()
        .map(|spec| {
            let stop = cursor + spec.design.ncols();
            let piece = beta.slice(s![cursor..stop]).to_owned();
            cursor = stop;
            piece
        })
        .collect();

    Ok(ConstrainedWarmStart {
        rho: Array1::zeros(rho_dim),
        block_beta,
        active_sets: vec![None; specs.len()],
        cached_inner: None,
    })
}

/// Evaluate the penalized objective of an inner result, optionally including
/// the log-determinant terms.
///
/// The REML-style correction passed on is
/// `0.5 * block_logdet_h` (when `include_logdet_h`) minus
/// `0.5 * block_logdet_s` (when `include_logdet_s`); a disabled term
/// contributes `0.0`. The final value and its validation are delegated to
/// `checked_penalizedobjective`, which also receives `context` for its error
/// message.
pub(crate) fn inner_penalized_objective(
    inner: &BlockwiseInnerResult,
    include_logdet_h: bool,
    include_logdet_s: bool,
    context: &str,
) -> Result<f64, String> {
    let hessian_half_logdet = if include_logdet_h {
        0.5 * inner.block_logdet_h
    } else {
        0.0
    };
    let penalty_half_logdet = if include_logdet_s {
        0.5 * inner.block_logdet_s
    } else {
        0.0
    };
    let reml_term = hessian_half_logdet - penalty_half_logdet;

    checked_penalizedobjective(
        inner.log_likelihood,
        inner.penalty_value,
        reml_term,
        context,
    )
}

/// Produce the EFS evaluation tuple for an inner solve that did not converge.
///
/// The returned evaluation carries the penalized objective of `inner` as its
/// cost, an all-zero step vector of length `theta_dim`, and no optional
/// payloads. It is paired with a warm start built from `inner` at `rho` and a
/// `false` convergence flag.
///
/// # Errors
///
/// Propagates the error of `inner_penalized_objective`; in that case nothing
/// else is constructed.
pub(crate) fn nonconverged_outer_efs_result(
    inner: &BlockwiseInnerResult,
    rho: &Array1<f64>,
    theta_dim: usize,
    include_logdet_h: bool,
    include_logdet_s: bool,
    context: &str,
) -> Result<
    (
        crate::solver::outer_strategy::EfsEval,
        ConstrainedWarmStart,
        bool,
    ),
    String,
> {
    let cost = inner_penalized_objective(inner, include_logdet_h, include_logdet_s, context)?;

    // No step is proposed for any hypercoordinate.
    let eval = crate::solver::outer_strategy::EfsEval {
        cost,
        steps: vec![0.0; theta_dim],
        beta: None,
        psi_gradient: None,
        psi_indices: None,
        inner_hessian_scale: None,
        logdet_enclosure_gap: None,
    };
    let warm = constrained_warm_start_from_inner(rho, inner);

    Ok((eval, warm, false))
}

/// When ψ derivatives are in play, return an owned copy of the warm start
/// with its cached inner summary removed.
///
/// Returns `None` when `has_psi_derivatives` is `false` or when no warm start
/// was supplied; the input seed is never modified.
pub(crate) fn warm_start_without_cached_inner_for_psi_derivatives(
    warm_start: Option<&ConstrainedWarmStart>,
    has_psi_derivatives: bool,
) -> Option<ConstrainedWarmStart> {
    if has_psi_derivatives {
        let mut stripped = warm_start?.clone();
        stripped.cached_inner = None;
        Some(stripped)
    } else {
        None
    }
}

/// Blockwise-specific inputs consumed by `blockwise_fit_from_parts` to build
/// a `UnifiedFitResult`.
///
/// The invariants noted on the fields are the ones `blockwise_fit_from_parts`
/// checks before assembling the result.
pub struct BlockwiseFitResultParts {
    /// One state per block; must be non-empty and match the number of specs.
    /// Every `beta` and `eta` must be finite.
    pub block_states: Vec<ParameterBlockState>,
    /// Must be finite.
    pub log_likelihood: f64,
    /// Flattened log smoothing parameters; must be finite and the same length
    /// as `lambdas`.
    pub log_lambdas: Array1<f64>,
    /// Flattened smoothing parameters; each entry must agree with
    /// `exp(log_lambdas[i])` and the length must equal the total penalty
    /// count across all specs.
    pub lambdas: Array1<f64>,
    /// When present, must be finite and square with side equal to the total
    /// coefficient count across `block_states`.
    pub covariance_conditional: Option<Array2<f64>>,
    /// Must be finite.
    pub stable_penalty_term: f64,
    /// Must be finite.
    pub penalized_objective: f64,
    pub outer_iterations: usize,
    /// `None` = no gradient measured at termination (cache-hit, gradient-free,
    /// or trivial early-exit); `Some(g)` = measured norm. `outer_converged`
    /// is the authoritative convergence signal.
    pub outer_gradient_norm: Option<f64>,
    /// First-order optimality certificate from the outer smoothing solve
    /// (#934); `None` when no outer ran (fixed-λ, one-cycle probe) or the
    /// audit could not evaluate.
    pub criterion_certificate: Option<crate::solver::outer_strategy::CriterionCertificate>,
    pub inner_cycles: usize,
    pub outer_converged: bool,
    /// When present, its penalized Hessian must be square with side equal to
    /// the total coefficient count, and its working vectors must have equal
    /// lengths that are the row count of the first spec's design or a
    /// multiple of it.
    pub geometry: Option<FitGeometry>,
    /// Effective degrees of freedom computed by the caller in the *reduced*
    /// (canonical) coefficient space, where the penalized Hessian is full rank,
    /// as `(edf_total, edf_by_penalty, block_edf)`. The trace edf is invariant
    /// under the canonical reparameterization, so computing it in the reduced
    /// space and reporting it on the raw fit is exact — and it avoids the
    /// `tr((H_raw + εI)⁻¹ S_raw)` blow-up that a rank-deficient raw-lifted
    /// Hessian (zero rows/cols on canonicalization-dropped directions) would
    /// otherwise inject. `None` when the caller has no reduced geometry (e.g.
    /// the one-cycle inner probe), in which case `blockwise_fit_from_parts`
    /// falls back to computing edf from whatever geometry it was handed.
    pub precomputed_edf: Option<(f64, Vec<f64>, Vec<f64>)>,
}

/// Check that every entry of a block state's `beta` and `eta` is finite.
///
/// `beta` is checked first, then `eta`; the first failure is returned as a
/// string. The field is identified in the validator's label as
/// `"{label}.beta"` or `"{label}.eta"`.
pub(crate) fn validate_parameter_block_state_finiteness(
    label: &str,
    state: &ParameterBlockState,
) -> Result<(), String> {
    let beta_label = format!("{label}.beta");
    validate_all_finite_estimation(&beta_label, state.beta.iter().copied())
        .map_err(|err| err.to_string())?;

    let eta_label = format!("{label}.eta");
    validate_all_finite_estimation(&eta_label, state.eta.iter().copied())
        .map_err(|err| err.to_string())?;

    Ok(())
}

/// Check that `lambdas` is the elementwise exponential of `log_lambdas`.
///
/// The two vectors must have the same length, and each pair must satisfy
/// `|lambda − exp(log_lambda)| ≤ 1e-10 · max(|exp(log_lambda)|, 1)`.
/// The first offending index is reported. Finiteness is not checked here: a
/// pair whose difference is NaN does not count as a violation.
///
/// # Errors
///
/// A length mismatch is reported as a `CustomFamilyError::DimensionMismatch`
/// converted to a string; a value mismatch is reported as a plain message.
/// Both are prefixed with `label`.
pub(crate) fn validate_lambda_pair_consistency(
    log_lambdas: &Array1<f64>,
    lambdas: &Array1<f64>,
    label: &str,
) -> Result<(), String> {
    if log_lambdas.len() != lambdas.len() {
        let reason = format!(
            "{label} length mismatch: log_lambdas={}, lambdas={}",
            log_lambdas.len(),
            lambdas.len()
        );
        return Err(CustomFamilyError::DimensionMismatch { reason }.into());
    }

    // Locate the first pair outside the relative/absolute tolerance band.
    let first_violation = log_lambdas
        .iter()
        .zip(lambdas.iter())
        .enumerate()
        .find_map(|(idx, (&log_lambda, &lambda))| {
            let expected = log_lambda.exp();
            let tolerance = 1e-10 * expected.abs().max(1.0);
            ((lambda - expected).abs() > tolerance).then(|| {
                format!(
                    "{label}[{idx}] inconsistent with exp(log_lambda): got {lambda}, expected {expected}",
                )
            })
        });

    match first_violation {
        Some(message) => Err(message),
        None => Ok(()),
    }
}

/// Effective degrees of freedom for a converged blockwise custom-family fit,
/// computed from the joint penalized Hessian `H = X'W_HX + S(λ)` and the
/// per-penalty matrices `S_k` with the trace formula used by the standard GAM
/// path and mgcv. With `cols_b` the column count of the block that owns
/// penalty `k`:
///
/// ```text
/// t_k         = λ_k · tr(H⁻¹ S_k)          (taken as 0 unless λ_k > 0)
/// edf_total   = p − Σ_k t_k                 clamped to [0, p]
/// edf_penalty = cols_b − t_k                clamped to [0, cols_b]
/// edf_block   = cols_b − Σ_{k ∈ b} t_k      clamped to [0, cols_b]
/// ```
///
/// `S_k` here is the *unscaled* penalty (its `λ_k` factor is applied here).
/// Each `S_k.to_dense()` is embedded into the joint `p × p` layout before the
/// trace solve: a `p × p` matrix is used as-is, a `cols_b × cols_b` matrix is
/// placed at its block's column range, and any other shape is an error. When
/// a single block spans the whole joint space the two cases coincide.
///
/// The custom-family path (CTN transformation-normal, Dirichlet, …) builds its
/// fit through `blockwise_fit_from_parts` and previously left `inference` at
/// `None`, so `edf_total` was unavailable for every custom family even though
/// the converged geometry already carries the penalized Hessian. This mirrors
/// the survival-path repair (`survival_transformation_edf`, #565) for the
/// blockwise engine: the same trace formula, factorized with the same
/// ridge-retry stabilization so a marginally indefinite Hessian at a boundary
/// optimum still yields a usable trace instead of dropping inference.
///
/// # Returns
///
/// `(edf_total, edf_by_penalty, block_edf)`. `edf_by_penalty` is aligned 1:1
/// with the flattened `lambdas` (one entry per penalty across all blocks),
/// matching the `FitInference::edf_by_block` ↔ `lambdas` length invariant.
/// `block_edf` has one entry per spec; a block with no penalties reports its
/// full column count.
///
/// # Errors
///
/// Returns a message when
/// - `penalized_hessian` is not square or its size differs from the total
///   design width of `specs`;
/// - `lambdas.len()` differs from the total penalty count of `specs`;
/// - the Hessian cannot be factorized within the ridge-retry budget;
/// - a penalty materializes to an unexpected shape;
/// - a trace solve fails;
/// - any resulting edf is non-finite.
pub(crate) fn custom_family_blockwise_edf(
    penalized_hessian: &Array2<f64>,
    specs: &[ParameterBlockSpec],
    lambdas: &ndarray::ArrayView1<'_, f64>,
) -> Result<(f64, Vec<f64>, Vec<f64>), String> {
    let p = penalized_hessian.nrows();
    let total_cols: usize = specs.iter().map(|s| s.design.ncols()).sum();
    if penalized_hessian.ncols() != p || total_cols != p {
        return Err(format!(
            "custom-family edf: penalized Hessian {}x{} inconsistent with total block width {}",
            penalized_hessian.nrows(),
            penalized_hessian.ncols(),
            total_cols
        ));
    }
    let expected_rho: usize = specs.iter().map(|s| s.penalties.len()).sum();
    if lambdas.len() != expected_rho {
        return Err(format!(
            "custom-family edf: lambdas length {} does not match total penalty count {}",
            lambdas.len(),
            expected_rho
        ));
    }

    let h_sym = SymmetricMatrix::Dense(penalized_hessian.clone());
    // Factorization with ridge retry (mirrors estimate.rs and
    // survival_transformation_edf): a boundary-constrained optimum can leave
    // the penalized Hessian marginally indefinite, in which case we walk a
    // ×10 ladder of diagonal shifts starting at 1e-10 · max|diag(H)| and keep
    // the first one that factorizes, so the trace solve succeeds rather than
    // dropping inference for the whole fit. At most 8 factorizations are
    // tried, the first of them without any shift.
    let factor = {
        let scale = h_sym.max_abs_diag();
        let min_step = scale * 1e-10;
        let mut ridge = 0.0_f64;
        let mut attempts = 0_usize;
        loop {
            // If adding the ridge itself fails, this attempt falls back to the
            // unshifted matrix.
            let candidate = if ridge > 0.0 {
                h_sym.addridge(ridge).unwrap_or_else(|_| h_sym.clone())
            } else {
                h_sym.clone()
            };
            if let Ok(f) = candidate.factorize() {
                break f;
            }
            attempts += 1;
            if attempts >= 8 {
                return Err(
                    "custom-family edf: penalized Hessian could not be factorized".to_string(),
                );
            }
            ridge = if ridge <= 0.0 { min_step } else { ridge * 10.0 };
        }
    };

    let mut edf_by_penalty = vec![0.0_f64; expected_rho];
    let mut block_edf = Vec::with_capacity(specs.len());
    let mut total_trace = 0.0_f64;
    // Running positions into the flattened penalty list and the joint columns.
    let mut penalty_offset = 0usize;
    let mut block_col_start = 0usize;
    for spec in specs.iter() {
        let block_cols = spec.design.ncols();
        let mut block_edf_acc = block_cols as f64;
        for (local_k, penalty) in spec.penalties.iter().enumerate() {
            let global_k = penalty_offset + local_k;
            let lambda = lambdas[global_k];
            // Embed S_k into the full p×p joint layout. `PenaltyMatrix::to_dense`
            // returns the *local* block matrix for the `Dense` variant but the
            // already-embedded full-width matrix for `Blockwise`/`Kronecker`, so
            // dispatch on the materialized dimension: a local (block_cols-wide)
            // penalty is placed at this block's column range, a full-width
            // penalty is used as-is (mirrors `survival_transformation_edf`'s
            // explicit block placement).
            let s_local = penalty.to_dense();
            let mut s_full = Array2::<f64>::zeros((p, p));
            if s_local.nrows() == p && s_local.ncols() == p {
                s_full.assign(&s_local);
            } else if s_local.nrows() == block_cols && s_local.ncols() == block_cols {
                let r = block_col_start..block_col_start + block_cols;
                s_full.slice_mut(ndarray::s![r.clone(), r]).assign(&s_local);
            } else {
                return Err(format!(
                    "custom-family edf: penalty {global_k} materialized to {}x{}, expected {p}x{p} or {block_cols}x{block_cols}",
                    s_local.nrows(),
                    s_local.ncols()
                ));
            }
            // tr(H⁻¹ S_k) via H Z = S_k, summing the diagonal of Z.
            let z = factor.solvemulti(&s_full).map_err(|e| {
                format!("custom-family edf trace solve failed for penalty {global_k}: {e}")
            })?;
            let mut trace = 0.0_f64;
            for d in 0..p {
                trace += z[[d, d]];
            }
            // A non-positive (or NaN) lambda contributes no trace at all.
            let lam_trace = if lambda > 0.0 { lambda * trace } else { 0.0 };
            total_trace += lam_trace;
            // Per-penalty edf is bounded by the columns this penalty acts on,
            // i.e. its block's column count (a `Blockwise` penalty reports the
            // full joint width from `dim()`, so cap at `block_cols`, not `dim()`).
            let penalty_cols = block_cols as f64;
            let edf_k = (penalty_cols - lam_trace).clamp(0.0, penalty_cols);
            edf_by_penalty[global_k] = edf_k;
            // The block's edf is the column count minus the total trace this
            // block's penalties spend (so multiple penalties on one block
            // compose), clamped to the block's column count.
            block_edf_acc -= lam_trace;
        }
        block_edf.push(block_edf_acc.clamp(0.0, block_cols as f64));
        penalty_offset += spec.penalties.len();
        block_col_start += block_cols;
    }

    let edf_total = (p as f64 - total_trace).clamp(0.0, p as f64);
    // `clamp` passes NaN through, so a NaN trace is caught here rather than
    // being silently reported as an edf.
    if !edf_total.is_finite()
        || edf_by_penalty.iter().any(|v| !v.is_finite())
        || block_edf.iter().any(|v| !v.is_finite())
    {
        return Err("custom-family edf: non-finite effective degrees of freedom".to_string());
    }
    Ok((edf_total, edf_by_penalty, block_edf))
}

/// Effective degrees of freedom evaluated on the reduced (canonical)
/// geometry, intended for `BlockwiseFitResultParts::precomputed_edf`.
///
/// The trace edf is computed by `custom_family_blockwise_edf` from the
/// reduced geometry's penalized Hessian and `canonical.reduced_specs`, whose
/// penalties are the pulled-back `T_iᵀ S_k T_i`. The reduced Hessian is full
/// rank, so no rank-deficiency ridge bias enters, and because the trace edf is
/// invariant under the canonical reparameterization the values can be
/// reported directly on the lifted raw fit.
///
/// Returns `None` when `reduced_geometry` is absent, or when the edf
/// computation fails (the failure is logged as a warning), so the caller can
/// leave `precomputed_edf` unset and let the raw-geometry fallback apply.
pub(crate) fn reduced_blockwise_edf(
    reduced_geometry: Option<&FitGeometry>,
    canonical: &crate::solver::identifiability_canonical::CanonicalSpecs,
    lambdas: &Array1<f64>,
) -> Option<(f64, Vec<f64>, Vec<f64>)> {
    let geom = reduced_geometry?;
    custom_family_blockwise_edf(
        geom.penalized_hessian.as_array(),
        &canonical.reduced_specs,
        &lambdas.view(),
    )
    .map_err(|err| {
        log::warn!(
            "[custom-family inference] reduced-space effective degrees of freedom unavailable: {err}"
        );
    })
    .ok()
}

/// Build a `UnifiedFitResult` from blockwise-specific fields.
pub fn blockwise_fit_from_parts(
    parts: BlockwiseFitResultParts,
    specs: &[ParameterBlockSpec],
) -> Result<crate::solver::estimate::UnifiedFitResult, String> {
    let BlockwiseFitResultParts {
        block_states,
        log_likelihood,
        log_lambdas,
        lambdas,
        covariance_conditional,
        stable_penalty_term,
        penalized_objective,
        outer_iterations,
        outer_gradient_norm,
        criterion_certificate,
        inner_cycles,
        outer_converged,
        geometry,
        precomputed_edf,
    } = parts;

    if block_states.is_empty() {
        return Err(CustomFamilyError::UnsupportedConfiguration {
            reason: "blockwise fit requires at least one block state".to_string(),
        }
        .into());
    }
    ensure_finite_scalar_estimation("blockwise_fit.log_likelihood", log_likelihood)
        .map_err(|e| e.to_string())?;
    validate_all_finite_estimation("blockwise_fit.log_lambdas", log_lambdas.iter().copied())
        .map_err(|e| e.to_string())?;
    validate_all_finite_estimation("blockwise_fit.lambdas", lambdas.iter().copied())
        .map_err(|e| e.to_string())?;
    validate_lambda_pair_consistency(&log_lambdas, &lambdas, "blockwise_fit.lambdas")?;
    ensure_finite_scalar_estimation("blockwise_fit.penalized_objective", penalized_objective)
        .map_err(|e| e.to_string())?;
    ensure_finite_scalar_estimation("blockwise_fit.stable_penalty_term", stable_penalty_term)
        .map_err(|e| e.to_string())?;
    if let Some(g) = outer_gradient_norm {
        ensure_finite_scalar_estimation("blockwise_fit.outer_gradient_norm", g)
            .map_err(|e| e.to_string())?;
    }

    if block_states.len() != specs.len() {
        return Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "blockwise_fit.block_states length ({}) does not match specs length ({})",
                block_states.len(),
                specs.len()
            ),
        }
        .into());
    }
    let n = specs[0].design.nrows();
    let total_p = block_states
        .iter()
        .map(|state| state.beta.len())
        .sum::<usize>();
    for (idx, state) in block_states.iter().enumerate() {
        validate_parameter_block_state_finiteness(
            &format!("blockwise_fit.block_states[{idx}]"),
            state,
        )?;
        let expected_rows = specs[idx].solver_design().nrows();
        if state.eta.len() != expected_rows {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.block_states[{idx}] eta length mismatch: got {}, expected {} (solver design rows)",
                state.eta.len(),
                expected_rows
            ) }.into());
        }
    }

    if let Some(cov) = covariance_conditional.as_ref() {
        validate_all_finite_estimation("blockwise_fit.covariance_conditional", cov.iter().copied())
            .map_err(|e| e.to_string())?;
        let (rows, cols) = cov.dim();
        if rows != total_p || cols != total_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "blockwise_fit.covariance_conditional must be {}x{}, got {}x{}",
                    total_p, total_p, rows, cols
                ),
            }
            .into());
        }
    }

    if let Some(geom) = geometry.as_ref() {
        geom.validate_numeric_finiteness()
            .map_err(|e| e.to_string())?;
        let (rows, cols) = geom.penalized_hessian.dim();
        if rows != total_p || cols != total_p {
            return Err(CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "blockwise_fit.geometry.penalized_hessian must be {}x{}, got {}x{}",
                    total_p, total_p, rows, cols
                ),
            }
            .into());
        }
        let geom_len = geom.working_weights.len();
        if geom_len != geom.working_response.len() {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.geometry working vector length mismatch: weights={}, response={}",
                geom.working_weights.len(),
                geom.working_response.len(),
            ) }.into());
        }
        if geom_len != n && (n == 0 || geom_len % n != 0) {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.geometry.working_weights length mismatch: got {geom_len}, expected {n} or a stacked multiple of {n}",
            ) }.into());
        }
        if geom.working_response.len() != n && (n == 0 || geom.working_response.len() % n != 0) {
            return Err(CustomFamilyError::DimensionMismatch { reason: format!(
                "blockwise_fit.geometry.working_response length mismatch: got {}, expected {n} or a stacked multiple of {n}",
                geom.working_response.len(),
            ) }.into());
        }
    }

    // Build unified blocks from the blockwise states.
    use crate::solver::estimate::{FittedBlock, FittedLinkState, UnifiedFitResultParts};
    let expected_rho: usize = specs.iter().map(|s| s.penalties.len()).sum();
    if lambdas.len() != expected_rho {
        return Err(CustomFamilyError::DimensionMismatch { reason: format!(
            "blockwise_fit.lambdas length ({}) does not match sum of per-block penalty counts ({})",
            lambdas.len(),
            expected_rho
        ) }.into());
    }
    // Effective degrees of freedom and the inference block. When the
    // converged geometry carries the joint penalized Hessian we compute the
    // mgcv trace edf `p − Σ_k λ_k·tr(H⁻¹ S_k)` here so every custom-family fit
    // (CTN transformation-normal, Dirichlet, …) reports `edf_total` /
    // per-block `edf` like the standard GAM path, instead of leaving inference
    // unpopulated. A factorization failure is non-fatal: the fit still returns
    // with `edf=0`/`inference=None` rather than aborting, but in practice the
    // ridge-retry inside `custom_family_blockwise_edf` recovers any boundary
    // indefiniteness.
    let (edf_total_opt, edf_by_penalty, block_edf): (Option<f64>, Vec<f64>, Vec<f64>) =
        match precomputed_edf {
            // Reduced-space edf supplied by the caller (the principled path:
            // the trace is computed where the Hessian is full rank, then
            // reported on the raw fit — exact because the trace edf is
            // reparameterization-invariant).
            Some((edf_total, edf_by_penalty, block_edf)) => {
                (Some(edf_total), edf_by_penalty, block_edf)
            }
            // Fallback: compute from whatever geometry we were handed. Used
            // only when the caller did not precompute (no reduced geometry);
            // the ridge-retry factorization makes this robust to a marginally
            // indefinite Hessian.
            None => match geometry.as_ref() {
                Some(geom) => {
                    match custom_family_blockwise_edf(
                        geom.penalized_hessian.as_array(),
                        specs,
                        &lambdas.view(),
                    ) {
                        Ok((edf_total, edf_by_penalty, block_edf)) => {
                            (Some(edf_total), edf_by_penalty, block_edf)
                        }
                        Err(err) => {
                            log::warn!(
                                "[custom-family inference] effective degrees of freedom unavailable: {err}"
                            );
                            (None, Vec::new(), vec![0.0; block_states.len()])
                        }
                    }
                }
                None => (None, Vec::new(), vec![0.0; block_states.len()]),
            },
        };

    let mut lambda_offset = 0usize;
    let blocks: Vec<FittedBlock> = block_states
        .iter()
        .enumerate()
        .map(|(i, bs)| {
            let role = custom_family_block_role(&specs[i].name, i, block_states.len());
            let k = specs[i].penalties.len();
            let block_lambdas = lambdas
                .slice(s![lambda_offset..lambda_offset + k])
                .to_owned();
            lambda_offset += k;
            FittedBlock {
                beta: bs.beta.clone(),
                role,
                edf: block_edf.get(i).copied().unwrap_or(0.0),
                lambdas: block_lambdas,
            }
        })
        .collect();
    let deviance = -2.0 * log_likelihood;

    // Assemble the inference block from the converged geometry. CTN and other
    // custom families estimate their own likelihood scale, so the penalized
    // Hessian is reported unscaled (dispersion = 1) — the EDF trace is
    // dispersion-free, and downstream covariance scaling pairs `H` with the
    // family's own dispersion where needed.
    let inference = match (edf_total_opt, geometry.as_ref()) {
        (Some(edf_total), Some(geom)) => Some(crate::solver::estimate::FitInference {
            edf_by_block: edf_by_penalty,
            edf_total,
            smoothing_correction: None,
            penalized_hessian: geom.penalized_hessian.clone(),
            working_weights: geom.working_weights.clone(),
            working_response: geom.working_response.clone(),
            reparam_qs: None,
            dispersion: crate::solver::estimate::Dispersion::Known(1.0),
            beta_covariance: None,
            beta_standard_errors: None,
            beta_covariance_corrected: None,
            beta_standard_errors_corrected: None,
            beta_covariance_frequentist: None,
            coefficient_influence: None,
            weighted_gram: None,
            bias_correction_beta: None,
        }),
        _ => None,
    };

    crate::solver::estimate::UnifiedFitResult::try_from_parts(UnifiedFitResultParts {
        blocks,
        log_lambdas: log_lambdas.clone(),
        lambdas: lambdas.clone(),
        likelihood_family: None,
        likelihood_scale: crate::types::LikelihoodScaleMetadata::Unspecified,
        log_likelihood_normalization: crate::types::LogLikelihoodNormalization::UserProvided,
        log_likelihood,
        deviance,
        reml_score: penalized_objective,
        stable_penalty_term,
        penalized_objective,
        outer_iterations,
        outer_converged,
        outer_gradient_norm,
        standard_deviation: 1.0,
        covariance_conditional,
        covariance_corrected: None,
        inference,
        fitted_link: FittedLinkState::Standard(None),
        geometry,
        block_states,
        // Report the inner status honestly from the threaded `outer_converged`
        // flag rather than hardcoding `Converged`. When the outer optimization
        // did not converge (e.g. it escalated to posterior sampling), surface
        // `StalledAtValidMinimum` — the same non-converged-but-usable bucket the
        // smooth-term path maps to — so downstream consumers
        // (`pirls_status.is_converged()`, `outer_converged` derivation) do not
        // report a non-converged fit as converged.
        pirls_status: if outer_converged {
            crate::pirls::PirlsStatus::Converged
        } else {
            crate::pirls::PirlsStatus::StalledAtValidMinimum
        },
        max_abs_eta: 0.0,
        constraint_kkt: None,
        artifacts: crate::solver::estimate::FitArtifacts {
            pirls: None,
            criterion_certificate,
            ..Default::default()
        },
        inner_cycles,
    })
    .map_err(|e| e.to_string())
}

/// Forms the penalized objective `-log_likelihood + penalty_value + reml_term`
/// and refuses to hand back a non-finite total.
///
/// # Errors
/// When the sum is NaN or infinite, returns the string form of a
/// `CustomFamilyError::NumericalFailure` whose reason starts with `context`
/// and lists all three inputs together with the resulting objective.
pub(crate) fn checked_penalizedobjective(
    log_likelihood: f64,
    penalty_value: f64,
    reml_term: f64,
    context: &str,
) -> Result<f64, String> {
    let objective = -log_likelihood + penalty_value + reml_term;
    if !objective.is_finite() {
        let failure = CustomFamilyError::NumericalFailure {
            reason: format!(
                "{context}: non-finite penalized objective \
             (log_likelihood={log_likelihood}, penalty_value={penalty_value}, \
             reml_term={reml_term}, objective={objective})"
            ),
        };
        return Err(failure.into());
    }
    Ok(objective)
}

/// Public, cloneable wrapper around a crate-private `ConstrainedWarmStart`.
///
/// The payload itself stays `pub(crate)`; code outside the crate can hold,
/// clone and pass this handle around, and can build one from a flat cached β
/// via `from_cached_beta`.
#[derive(Clone)]
pub struct CustomFamilyWarmStart {
    /// The wrapped warm-start payload (rho, per-block β, active sets and the
    /// optional cached inner state, as populated by `from_cached_beta`).
    pub(crate) inner: ConstrainedWarmStart,
}

impl CustomFamilyWarmStart {
    /// Reports whether `screened_outer_warm_start` accepts the wrapped warm
    /// start when it is screened against `rho`.
    pub(crate) fn compatible_with_rho(&self, rho: &Array1<f64>) -> bool {
        let screened = screened_outer_warm_start(Some(&self.inner), rho);
        screened.is_some()
    }

    /// View of the stored coefficient vector of block `block_idx`, or `None`
    /// when no such block is stored.
    ///
    /// Intended for callers that want to evaluate the block's fitted linear
    /// predictor `X·β` rather than inspect raw coefficient magnitudes.
    pub(crate) fn block_beta_view(&self, block_idx: usize) -> Option<ArrayView1<'_, f64>> {
        let beta = self.inner.block_beta.get(block_idx)?;
        Some(beta.view())
    }

    /// Assemble a warm start from a flat cached β plus the coefficient width
    /// of every block.
    ///
    /// `beta` is cut into consecutive per-block slices following
    /// `block_col_counts`. The resulting payload carries a length-zero `rho`
    /// (the outer cache is expected to overwrite it on the next eval), no
    /// active sets (`None` for every block) and no cached inner state; only
    /// the per-block β slices feed the next inner PIRLS / Newton solve. Used
    /// by the spatial-joint outer cache to seed the family-owned warm-start
    /// slot on cache hits so the inner solve opens at the prior converged
    /// iterate instead of cold β.
    ///
    /// # Errors
    /// Fails when `beta.len()` differs from the sum of `block_col_counts`, or
    /// when `bail_if_cached_beta_non_finite` rejects `beta`.
    pub fn from_cached_beta(
        block_col_counts: &[usize],
        beta: &Array1<f64>,
    ) -> Result<Self, EstimationError> {
        let expected: usize = block_col_counts.iter().sum();
        if beta.len() != expected {
            crate::bail_invalid_estim!(
                "cached inner beta has length {}, but spatial-joint blocks require length {}",
                beta.len(),
                expected
            );
        }
        crate::families::marginal_slope_shared::bail_if_cached_beta_non_finite(beta)?;

        // Walk the flat vector once, peeling off one owned slice per block.
        let n_blocks = block_col_counts.len();
        let mut start = 0usize;
        let block_beta: Vec<_> = block_col_counts
            .iter()
            .map(|&width| {
                let stop = start + width;
                let piece = beta.slice(s![start..stop]).to_owned();
                start = stop;
                piece
            })
            .collect();

        let inner = ConstrainedWarmStart {
            rho: Array1::zeros(0),
            block_beta,
            active_sets: vec![None; n_blocks],
            cached_inner: None,
        };
        Ok(CustomFamilyWarmStart { inner })
    }
}

/// Mutable bookkeeping for the custom-family outer driver: the live warm
/// start, the snapshot it can be rolled back to, and two diagnostics slots.
pub(crate) struct CustomOuterState {
    /// Warm start currently in use; `reset()` overwrites it with a clone of
    /// `reset_warm_cache`.
    pub(crate) warm_cache: Option<ConstrainedWarmStart>,
    /// Rollback snapshot. Set from the constructor argument and replaced by
    /// `seed_cached_beta`.
    pub(crate) reset_warm_cache: Option<ConstrainedWarmStart>,
    /// Last recorded error message, if any; cleared by `seed_cached_beta`.
    pub(crate) last_error: Option<String>,
    /// Starts as `None`. NOTE(review): presumably the outer gradient norm at
    /// the first evaluation — the writer is not in this section; confirm.
    pub(crate) initial_gradient_norm: Option<f64>,
}

impl CustomOuterState {
    /// Fresh state whose live warm start and rollback snapshot both equal
    /// `warm_start`; no error and no initial gradient norm are recorded yet.
    pub(crate) fn new(warm_start: Option<ConstrainedWarmStart>) -> Self {
        let reset_warm_cache = warm_start;
        let warm_cache = reset_warm_cache.clone();
        Self {
            warm_cache,
            reset_warm_cache,
            last_error: None,
            initial_gradient_norm: None,
        }
    }

    /// Roll the live warm start back to the stored snapshot. The other
    /// fields are left untouched.
    pub(crate) fn reset(&mut self) {
        self.warm_cache = self.reset_warm_cache.as_ref().cloned();
    }

    /// Build a warm start from a flat cached `beta` (through
    /// `constrained_warm_start_from_cached_beta`) and install it as both the
    /// live warm start and the rollback snapshot, clearing `last_error`.
    ///
    /// # Errors
    /// Propagates the helper's error; the state is not modified in that case.
    pub(crate) fn seed_cached_beta(
        &mut self,
        rho_dim: usize,
        specs: &[ParameterBlockSpec],
        beta: &Array1<f64>,
    ) -> Result<(), EstimationError> {
        let seeded = constrained_warm_start_from_cached_beta(rho_dim, specs, beta)?;
        self.warm_cache = Some(seeded.clone());
        self.reset_warm_cache = Some(seeded);
        self.last_error = None;
        Ok(())
    }
}

/// Public result of one joint hyper-parameter evaluation: value, gradient,
/// outer Hessian, the warm start to reuse, and the inner convergence flag.
///
/// Produced from an `OuterObjectiveEvalResult` by
/// `outer_eval_result_to_joint_hyper_result`, which copies every field across
/// and wraps the warm start in a `CustomFamilyWarmStart`.
pub struct CustomFamilyJointHyperResult {
    /// Objective value reported by the evaluation.
    pub objective: f64,
    /// Gradient reported by the evaluation.
    /// NOTE(review): presumably indexed by outer hyper-coordinate — confirm.
    pub gradient: Array1<f64>,
    /// Outer Hessian in whichever `HessianResult` form the evaluation produced.
    pub outer_hessian: crate::solver::outer_strategy::HessianResult,
    /// Warm start handed back for seeding a later evaluation.
    pub warm_start: CustomFamilyWarmStart,
    /// `false` when the inner blockwise/Newton solve hit its divergence
    /// early-exit or its max-cycle cap. Envelope-theorem outer gradients
    /// and analytic outer Hessians are valid only at a stationary β̂ —
    /// callers that consume `gradient`/`outer_hessian` MUST gate on this
    /// flag and treat non-converged evaluations as inexact (e.g. let ARC
    /// back off the trust region) rather than feeding pathological
    /// derivatives into the outer optimizer.
    pub inner_converged: bool,
}

/// Public result of one EFS-style joint hyper-parameter evaluation.
///
/// Assembled by `outer_efs_result_to_joint_hyper_efs_result` from an
/// `EfsEval`, a `ConstrainedWarmStart` and the inner convergence flag.
pub struct CustomFamilyJointHyperEfsResult {
    /// The EFS evaluation payload, passed through unchanged.
    pub efs_eval: crate::solver::outer_strategy::EfsEval,
    /// Warm start handed back for seeding a later evaluation.
    pub warm_start: CustomFamilyWarmStart,
    /// See [`CustomFamilyJointHyperResult::inner_converged`]. EFS gradients
    /// also assume a stationary inner solve.
    pub inner_converged: bool,
}

/// Crate-internal counterpart of `CustomFamilyJointHyperResult`: the same
/// fields, but carrying the raw `ConstrainedWarmStart` instead of the public
/// `CustomFamilyWarmStart` wrapper.
pub(crate) struct OuterObjectiveEvalResult {
    /// Objective value reported by the evaluation.
    pub(crate) objective: f64,
    /// Gradient reported by the evaluation.
    pub(crate) gradient: Array1<f64>,
    /// Outer Hessian in whichever `HessianResult` form the evaluation produced.
    pub(crate) outer_hessian: crate::solver::outer_strategy::HessianResult,
    /// Unwrapped warm start for a later evaluation.
    pub(crate) warm_start: ConstrainedWarmStart,
    /// Inner-solve convergence flag; forwarded verbatim to
    /// `CustomFamilyJointHyperResult::inner_converged`.
    pub(crate) inner_converged: bool,
}

pub(crate) fn outer_eval_result_to_joint_hyper_result(
    result: OuterObjectiveEvalResult,
) -> CustomFamilyJointHyperResult {
    CustomFamilyJointHyperResult {
        objective: result.objective,
        gradient: result.gradient,
        outer_hessian: result.outer_hessian,
        warm_start: CustomFamilyWarmStart {
            inner: result.warm_start,
        },
        inner_converged: result.inner_converged,
    }
}

/// Outer-Hessian operator backed by an owned dense matrix.
///
/// Used to present a dense (`HessianResult::Analytic`) outer Hessian through
/// the `OuterHessianOperator` interface.
pub(crate) struct OwnedDenseOuterHessianOperator {
    /// The dense matrix applied by the operator. Its row count is reported as
    /// the operator dimension; inputs must match its column count.
    pub(crate) matrix: Array2<f64>,
}

impl crate::solver::outer_strategy::OuterHessianOperator for OwnedDenseOuterHessianOperator {
    /// Operator dimension, taken from the matrix row count.
    fn dim(&self) -> usize {
        self.matrix.nrows()
    }

    /// Allocating product `matrix · v`.
    ///
    /// # Errors
    /// Dimension-mismatch message when `v` does not have one entry per
    /// matrix column.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        let got = v.len();
        let expected = self.matrix.ncols();
        if got == expected {
            return Ok(self.matrix.dot(v));
        }
        Err(CustomFamilyError::DimensionMismatch {
            reason: format!(
                "batched dense outer Hessian matvec length mismatch: got {}, expected {}",
                got, expected
            ),
        }
        .into())
    }

    /// Zero-alloc override: stores `matrix · v` in `out`, one row dot product
    /// per output cell, instead of allocating through `matrix.dot(v)`.
    ///
    /// # Errors
    /// Dimension-mismatch message when `v` does not match the column count,
    /// or (checked second) when `out` does not match the row count.
    fn apply_into(&self, v: &Array1<f64>, out: &mut Array1<f64>) -> Result<(), String> {
        let n_cols = self.matrix.ncols();
        if v.len() != n_cols {
            let err = CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "batched dense outer Hessian apply_into input length mismatch: got {}, expected {}",
                    v.len(),
                    n_cols
                ),
            };
            return Err(err.into());
        }
        let n_rows = self.matrix.nrows();
        if out.len() != n_rows {
            let err = CustomFamilyError::DimensionMismatch {
                reason: format!(
                    "batched dense outer Hessian apply_into output length mismatch: got {}, expected {}",
                    out.len(),
                    n_rows
                ),
            };
            return Err(err.into());
        }
        for (cell, row) in out.iter_mut().zip(self.matrix.rows()) {
            *cell = row.dot(v);
        }
        Ok(())
    }

    /// A dense matrix is already materialized, so this is always `true`.
    fn is_cheap_to_materialize(&self) -> bool {
        true
    }
}

/// Adapter that exposes a Hessian operator defined over the physical penalty
/// slots as an operator over the labeled outer coordinates.
///
/// Inputs are scattered from outer coordinates to physical slots through
/// `physical_to_outer` (slots mapped to `None` receive 0), the base operator
/// is applied, and the physical outputs are summed back per outer index.
pub(crate) struct LabeledOuterHessianOperator {
    /// Wrapped operator acting on vectors of length `physical_to_outer.len()`.
    pub(crate) base: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
    /// For each physical slot, the outer coordinate it is tied to, or `None`
    /// when the slot has no outer coordinate.
    pub(crate) physical_to_outer: Vec<Option<usize>>,
    /// Number of outer coordinates; reported as the operator dimension.
    pub(crate) outer_dim: usize,
    /// Scratch buffers reused across `apply_into` calls to avoid
    /// per-call allocation of the permuted input and output vectors.
    /// `(physical_in, physical_out)`, each of length `physical_to_outer.len()`.
    pub(crate) scratch: std::sync::Mutex<(ndarray::Array1<f64>, ndarray::Array1<f64>)>,
}

impl LabeledOuterHessianOperator {
    /// Wrap `base` using the slot→coordinate map and outer dimension taken
    /// from `layout`.
    ///
    /// The map is cloned from `layout.physical_to_outer`, the outer dimension
    /// is `layout.initial_rho.len()`, and both scratch buffers start as zero
    /// vectors with one entry per physical slot.
    pub(crate) fn new(
        base: Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>,
        layout: &PenaltyLabelLayout,
    ) -> Self {
        let physical_to_outer = layout.physical_to_outer.clone();
        let outer_dim = layout.initial_rho.len();
        let scratch_in = ndarray::Array1::<f64>::zeros(physical_to_outer.len());
        let scratch_out = scratch_in.clone();
        Self {
            base,
            physical_to_outer,
            outer_dim,
            scratch: std::sync::Mutex::new((scratch_in, scratch_out)),
        }
    }
}

impl crate::solver::outer_strategy::OuterHessianOperator for LabeledOuterHessianOperator {
    /// Dimension of the labeled (outer) coordinate space.
    fn dim(&self) -> usize {
        self.outer_dim
    }

    /// Allocating product in outer coordinates.
    ///
    /// `v` is scattered to the physical slots (a slot tied to outer index `k`
    /// receives `v[k]`, an untied slot receives 0), the base operator is
    /// applied, and the physical result is summed back per outer index.
    ///
    /// # Errors
    /// Returns a message when `v` does not have `outer_dim` entries, when the
    /// base operator fails, or when the base result does not have one entry
    /// per physical slot.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        if v.len() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian input length mismatch: got {}, expected {}",
                v.len(),
                self.outer_dim
            ));
        }
        let mut physical = Array1::<f64>::zeros(self.physical_to_outer.len());
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            physical[physical_idx] = outer_idx.map(|idx| v[idx]).unwrap_or(0.0);
        }
        let physical_out = self.base.matvec(&physical)?;
        if physical_out.len() != self.physical_to_outer.len() {
            return Err(format!(
                "labeled outer Hessian physical matvec length mismatch: got {}, expected {}",
                physical_out.len(),
                self.physical_to_outer.len()
            ));
        }
        let mut out = Array1::<f64>::zeros(self.outer_dim);
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                // Tied slots share an outer index, so contributions add up.
                out[outer_idx] += physical_out[physical_idx];
            }
        }
        Ok(out)
    }

    /// Zero-alloc override: reuses hoisted scratch buffers to avoid the
    /// per-call `physical` and `out` allocations in `matvec`.
    ///
    /// The scratch mutex is held for the duration of the base call.
    ///
    /// # Errors
    /// Returns a message when `v` or `out` does not have `outer_dim` entries,
    /// when the scratch lock is poisoned, when the base operator fails, or
    /// when the physical output buffer no longer has one entry per physical
    /// slot after the base call.
    fn apply_into(
        &self,
        v: &ndarray::Array1<f64>,
        out: &mut ndarray::Array1<f64>,
    ) -> Result<(), String> {
        if v.len() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian apply_into input length mismatch: got {}, expected {}",
                v.len(),
                self.outer_dim
            ));
        }
        if out.len() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian apply_into output length mismatch: got {}, expected {}",
                out.len(),
                self.outer_dim
            ));
        }
        let mut guard = self
            .scratch
            .lock()
            .map_err(|_| "labeled outer Hessian scratch lock poisoned".to_string())?;
        let (physical_in, physical_out) = &mut *guard;
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            physical_in[physical_idx] = outer_idx.map(|idx| v[idx]).unwrap_or(0.0);
        }
        self.base.apply_into(physical_in, physical_out)?;
        if physical_out.len() != self.physical_to_outer.len() {
            return Err(format!(
                "labeled outer Hessian physical apply_into length mismatch: got {}, expected {}",
                physical_out.len(),
                self.physical_to_outer.len()
            ));
        }
        // `out` may hold stale values from the caller; clear before accumulating.
        out.fill(0.0);
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                out[outer_idx] += physical_out[physical_idx];
            }
        }
        Ok(())
    }

    /// Matrix product in outer coordinates: each column of `factor` is
    /// treated like the input of `matvec`.
    ///
    /// # Errors
    /// Returns a message when `factor` does not have `outer_dim` rows, when
    /// the base operator fails, or when the base result's shape is not
    /// `(physical slots, factor.ncols())`. The column check matters because
    /// the row-wise accumulation below would otherwise panic on a shape
    /// mismatch, or silently broadcast a single-column result.
    fn mul_mat(&self, factor: ndarray::ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
        if factor.nrows() != self.outer_dim {
            return Err(format!(
                "labeled outer Hessian factor row mismatch: got {}, expected {}",
                factor.nrows(),
                self.outer_dim
            ));
        }
        let mut physical_factor =
            Array2::<f64>::zeros((self.physical_to_outer.len(), factor.ncols()));
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                physical_factor
                    .row_mut(physical_idx)
                    .assign(&factor.row(outer_idx));
            }
        }
        let physical_out = self.base.mul_mat(physical_factor.view())?;
        if physical_out.nrows() != self.physical_to_outer.len() {
            return Err(format!(
                "labeled outer Hessian physical output row mismatch: got {}, expected {}",
                physical_out.nrows(),
                self.physical_to_outer.len()
            ));
        }
        if physical_out.ncols() != factor.ncols() {
            return Err(format!(
                "labeled outer Hessian physical output column mismatch: got {}, expected {}",
                physical_out.ncols(),
                factor.ncols()
            ));
        }
        let mut out = Array2::<f64>::zeros((self.outer_dim, factor.ncols()));
        for (physical_idx, outer_idx) in self.physical_to_outer.iter().enumerate() {
            if let Some(outer_idx) = *outer_idx {
                let physical_row = physical_out.row(physical_idx);
                out.row_mut(outer_idx).scaled_add(1.0, &physical_row);
            }
        }
        Ok(out)
    }

    /// Delegates to the wrapped operator.
    fn is_cheap_to_materialize(&self) -> bool {
        self.base.is_cheap_to_materialize()
    }

    /// Delegates to the wrapped operator.
    fn materialization_capability(
        &self,
    ) -> crate::solver::outer_strategy::OuterHessianMaterialization {
        self.base.materialization_capability()
    }
}

/// Asks the family for its batched outer-Hessian terms and returns them as an
/// operator, when one is available.
///
/// Yields `Ok(None)` without calling the family unless `eval_mode` is
/// `EvalMode::ValueGradientHessian`. Otherwise the family's
/// `batched_outer_hessian_terms` decides: no terms or an `Unavailable`
/// Hessian gives `Ok(None)`, an `Operator` is passed through, and an
/// `Analytic` dense matrix is wrapped in `OwnedDenseOuterHessianOperator`.
///
/// # Errors
/// Propagates any error from `batched_outer_hessian_terms`.
pub(crate) fn custom_family_batched_outer_hessian_operator<F: CustomFamily>(
    family: &F,
    states: &[ParameterBlockState],
    specs: &[ParameterBlockSpec],
    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
    rho: &Array1<f64>,
    workspace: Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
    eval_mode: EvalMode,
) -> Result<Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>>, String> {
    use crate::solver::outer_strategy::HessianResult;

    if eval_mode != EvalMode::ValueGradientHessian {
        return Ok(None);
    }
    let maybe_terms =
        family.batched_outer_hessian_terms(states, specs, derivative_blocks, rho, workspace)?;
    let Some(terms) = maybe_terms else {
        return Ok(None);
    };
    let operator: Option<Arc<dyn crate::solver::outer_strategy::OuterHessianOperator>> =
        match terms.outer_hessian {
            HessianResult::Unavailable => None,
            HessianResult::Analytic(matrix) => {
                Some(Arc::new(OwnedDenseOuterHessianOperator { matrix }))
            }
            HessianResult::Operator(operator) => Some(operator),
        };
    Ok(operator)
}

/// Packages an EFS evaluation, its warm start and the inner convergence flag
/// into the public `CustomFamilyJointHyperEfsResult`.
///
/// The raw warm start is wrapped in a `CustomFamilyWarmStart`; the other two
/// values are stored unchanged.
pub(crate) fn outer_efs_result_to_joint_hyper_efs_result(
    efs_eval: crate::solver::outer_strategy::EfsEval,
    warm_start: ConstrainedWarmStart,
    inner_converged: bool,
) -> CustomFamilyJointHyperEfsResult {
    let warm_start = CustomFamilyWarmStart { inner: warm_start };
    CustomFamilyJointHyperEfsResult {
        efs_eval,
        warm_start,
        inner_converged,
    }
}

// Unified exact joint hyper-calculus over theta = [rho, psi].
//
// The correct outer problem is not “a rho objective plus a separate psi
// objective”. It is one profiled/Laplace surface over one flattened hypervector
//
//   theta = [rho, psi],
//
// one flattened joint coefficient vector
//
//   beta = [beta_1; ...; beta_B],
//
// and one joint exact mode system
//
//   F(beta, theta) := V_beta(beta, theta) = 0,
//   H(beta, theta) := V_beta_beta(beta, theta).
//
// For every hypercoordinate theta_i we need the fixed-beta objects
//
//   V_i = partial_{theta_i} V,
//   g_i = partial_{theta_i} F,
//   H_i = partial_{theta_i} H,
//
// and for every pair (i, j)
//
//   V_ij, g_ij, H_ij,
//
// together with the beta-curvature contractions
//
//   D_beta H[u],
//   D_beta^2 H[u, v],
//   T_i[u] := D_beta H_i[u].
//
// The exact profiled mode response and total Hessian drifts are then
//
//   beta_i  = -H^{-1} g_i,
//   beta_ij = -H^{-1}(g_ij + H_i beta_j + H_j beta_i + D_beta H[beta_i] beta_j),
//
//   dot H_i
//   = H_i + D_beta H[beta_i],
//
//   ddot H_ij
//   = H_ij
//     + T_i[beta_j]
//     + T_j[beta_i]
//     + D_beta H[beta_ij]
//     + D_beta^2 H[beta_i, beta_j].
//
// Hence the exact joint profiled/Laplace derivatives are
//
//   J_i
//   = V_i + 0.5 tr(H^{-1} dot H_i) - 0.5 partial_i log|S(theta)|_+,
//
//   J_ij
//   = (V_ij - g_i^T H^{-1} g_j)
//     + 0.5 [ tr(H^{-1} ddot H_ij)
//             - tr(H^{-1} dot H_j H^{-1} dot H_i) ]
//     - 0.5 partial^2_{ij} log|S(theta)|_+.
//
// In this unified view rho and psi are the same outer calculus. They differ
// only in where their fixed-beta derivative objects come from:
//
// - rho coordinates often contribute only through the penalty surface, but the
//   generic assembler intentionally treats the penalty as S(theta), not S(rho),
//   so mixed rho/psi penalty terms are allowed whenever realized component
//   penalties move with psi:
//     V_i  = D_i + 0.5 beta^T S_i beta
//     g_i  = D_beta_i + S_i beta
//     H_i  = D_beta_beta_i + S_i
//     V_ij = D_ij + 0.5 beta^T S_ij beta
//     g_ij = D_beta_ij + S_ij beta
//     H_ij = D_beta_beta_ij + S_ij.
//
// - psi coordinates come from the family-specific joint exact psi hooks, while
//   the generic assembler still owns any realized-penalty motion through
//   S_i / S_ij:
//     objective_psi            <-> V_i
//     score_psi                <-> g_i
//     hessian_psi              <-> H_i
//     objective_psi_psi        <-> V_ij
//     score_psi_psi            <-> g_ij
//     hessian_psi_psi          <-> H_ij
//     D_beta H_psi[u]          <-> T_i[u].
//
// For coupled families this means any block-local psi path is wrong. Even when
// g_i is sparse or penalty-local, beta_i is defined by the full joint solve
//
//   beta_i = -H^{-1} g_i,
//
// so every exact outer derivative must be assembled in this joint flattened
// space.

/// Runs `f` against the design matrix and offset that apply to one block.
///
/// For families with static geometry the spec's solver design and solver
/// offset are passed straight to `f`. For families reporting dynamic
/// geometry, `family.block_geometry` is queried first and its result is
/// validated before `f` sees it: the design must have as many rows as the
/// spec's solver design and as many columns as `spec.design`, and the offset
/// must have one entry per row.
///
/// `block_idx` is used only to label error messages.
///
/// # Errors
/// Propagates errors from `block_geometry` and from `f`; returns a
/// dimension-mismatch message when the dynamic geometry fails validation.
pub(crate) fn with_block_geometry<F: CustomFamily + ?Sized, T>(
    family: &F,
    block_states: &[ParameterBlockState],
    spec: &ParameterBlockSpec,
    block_idx: usize,
    f: impl FnOnce(&DesignMatrix, &Array1<f64>) -> Result<T, String>,
) -> Result<T, String> {
    if !family.block_geometry_is_dynamic() {
        return f(spec.solver_design(), spec.solver_offset());
    }

    let (x_dyn, off_dyn) = family.block_geometry(block_states, spec)?;
    let expected_rows = spec.solver_design().nrows();
    let mismatch =
        |reason: String| -> String { CustomFamilyError::DimensionMismatch { reason }.into() };

    if x_dyn.nrows() != expected_rows {
        return Err(mismatch(format!(
            "block {block_idx} dynamic design row mismatch: got {}, expected {}",
            x_dyn.nrows(),
            expected_rows
        )));
    }
    if x_dyn.ncols() != spec.design.ncols() {
        return Err(mismatch(format!(
            "block {block_idx} dynamic design col mismatch: got {}, expected {}",
            x_dyn.ncols(),
            spec.design.ncols()
        )));
    }
    if off_dyn.len() != expected_rows {
        return Err(mismatch(format!(
            "block {block_idx} dynamic offset length mismatch: got {}, expected {}",
            off_dyn.len(),
            expected_rows
        )));
    }
    f(&x_dyn, &off_dyn)
}

/// Concatenates every block's `initial_log_lambdas`, in block order, into a
/// single flat vector. Blocks without any entries contribute nothing.
pub(crate) fn flatten_log_lambdas(specs: &[ParameterBlockSpec]) -> Array1<f64> {
    let total: usize = specs
        .iter()
        .map(|spec| spec.initial_log_lambdas.len())
        .sum();
    let mut flat = Vec::<f64>::with_capacity(total);
    for spec in specs {
        flat.extend(spec.initial_log_lambdas.iter().copied());
    }
    Array1::from_vec(flat)
}

/// Mapping between the physical penalty slots (one per penalty of every
/// block, in block order) and the outer optimization coordinates.
///
/// As built by `penalty_label_layout`: penalties that share a precision label
/// share one outer coordinate, and penalties with a fixed log-lambda have no
/// outer coordinate at all.
#[derive(Clone, Debug)]
pub(crate) struct PenaltyLabelLayout {
    /// Per-block penalty counts, stored exactly as supplied by the caller and
    /// used to split an expanded vector back into per-block pieces.
    pub(crate) penalty_counts: Vec<usize>,
    /// For each physical slot, its outer coordinate, or `None` for a fixed
    /// penalty.
    pub(crate) physical_to_outer: Vec<Option<usize>>,
    /// For each physical slot, the fixed log-lambda (`Some`) or `None` when
    /// the slot is driven by an outer coordinate.
    pub(crate) fixed_log_lambdas: Vec<Option<f64>>,
    /// Initial value of every outer coordinate: the initial log-lambda of the
    /// first penalty that introduced the coordinate's label.
    pub(crate) initial_rho: Array1<f64>,
}

impl PenaltyLabelLayout {
    /// Number of physical penalty slots described by the layout.
    pub(crate) fn physical_count(&self) -> usize {
        let slots = &self.physical_to_outer;
        slots.len()
    }

    /// True when the number of outer coordinates differs from the number of
    /// physical slots. With a layout from `penalty_label_layout` that happens
    /// when penalties share a label or when some penalties are fixed.
    pub(crate) fn has_tied_coordinates(&self) -> bool {
        let outer_count = self.initial_rho.len();
        outer_count != self.physical_to_outer.len()
    }
}

/// Builds the physical-slot → outer-coordinate layout for all penalties.
///
/// Penalties are visited block by block, in order. For each one:
/// - a penalty with a fixed log-lambda gets no outer coordinate; its value is
///   recorded in `fixed_log_lambdas`;
/// - otherwise the penalty is keyed by its precision label, or by a synthetic
///   per-penalty label when it has none. The first penalty seen for a label
///   creates a new outer coordinate initialised from that penalty's initial
///   log-lambda; later penalties with the same label reuse the coordinate.
///
/// `penalty_counts` is stored in the layout as given; it is not checked
/// against `specs` here.
///
/// # Errors
/// - `ConstraintViolation` when a fixed log-lambda is non-finite, or when two
///   penalties sharing a label have finite initial log-lambdas that differ by
///   more than `1e-10`.
/// - `DimensionMismatch` when a block supplies fewer initial log-lambdas than
///   it has non-fixed penalties to initialise (previously an out-of-bounds
///   panic).
pub(crate) fn penalty_label_layout(
    specs: &[ParameterBlockSpec],
    penalty_counts: Vec<usize>,
) -> Result<PenaltyLabelLayout, String> {
    let mut label_to_outer = BTreeMap::<String, usize>::new();
    let mut physical_to_outer = Vec::<Option<usize>>::new();
    let mut fixed_log_lambdas = Vec::<Option<f64>>::new();
    let mut initial = Vec::<f64>::new();

    for (block_idx, spec) in specs.iter().enumerate() {
        for penalty_idx in 0..spec.penalties.len() {
            let penalty = &spec.penalties[penalty_idx];
            if let Some(fixed) = penalty.fixed_log_lambda() {
                if !fixed.is_finite() {
                    return Err(CustomFamilyError::ConstraintViolation {
                        reason: format!(
                            "block {block_idx} penalty {penalty_idx} fixed log-precision is non-finite: {fixed}"
                        ),
                    }
                    .into());
                }
                physical_to_outer.push(None);
                fixed_log_lambdas.push(Some(fixed));
                continue;
            }
            let label = penalty
                .precision_label()
                .map(str::to_owned)
                .unwrap_or_else(|| format!("__block_{block_idx}_penalty_{penalty_idx}"));
            // Checked lookup: a spec with too few initial log-lambdas is a
            // caller error that should surface as `Err`, not as a panic.
            let rho0 = spec
                .initial_log_lambdas
                .get(penalty_idx)
                .copied()
                .ok_or_else(|| -> String {
                    CustomFamilyError::DimensionMismatch {
                        reason: format!(
                            "block {block_idx} has {} penalties but only {} initial log-lambdas (no entry for penalty {penalty_idx})",
                            spec.penalties.len(),
                            spec.initial_log_lambdas.len()
                        ),
                    }
                    .into()
                })?;
            let outer = if let Some(&outer) = label_to_outer.get(&label) {
                // Tied penalties must agree on their starting value; the
                // comparison is skipped when either value is non-finite.
                let first = initial[outer];
                if first.is_finite() && rho0.is_finite() && (first - rho0).abs() > 1e-10 {
                    return Err(CustomFamilyError::ConstraintViolation { reason: format!(
                        "precision label '{label}' has inconsistent initial log-precisions: {first} and {rho0}"
                    ) }.into());
                }
                outer
            } else {
                let outer = initial.len();
                label_to_outer.insert(label, outer);
                initial.push(rho0);
                outer
            };
            physical_to_outer.push(Some(outer));
            fixed_log_lambdas.push(None);
        }
    }

    Ok(PenaltyLabelLayout {
        penalty_counts,
        physical_to_outer,
        fixed_log_lambdas,
        initial_rho: Array1::from_vec(initial),
    })
}

/// Expands outer coordinates `rho` to one log-lambda per physical penalty
/// slot.
///
/// A slot tied to outer coordinate `k` receives `rho[k]`; a slot without an
/// outer coordinate receives the fixed log-lambda recorded in the layout.
///
/// # Errors
/// Dimension-mismatch message when `rho` does not have one entry per outer
/// coordinate; constraint-violation message when a slot has neither an outer
/// coordinate nor a fixed value.
pub(crate) fn expand_labeled_log_lambdas(
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array1<f64>, String> {
    let outer_count = layout.initial_rho.len();
    if rho.len() != outer_count {
        let err = CustomFamilyError::DimensionMismatch {
            reason: format!(
                "log-lambda label coordinate mismatch: got {}, expected {}",
                rho.len(),
                outer_count
            ),
        };
        return Err(err.into());
    }

    let mut expanded = Array1::<f64>::zeros(layout.physical_count());
    for (physical, slot) in expanded.iter_mut().enumerate() {
        *slot = if let Some(outer) = layout.physical_to_outer[physical] {
            rho[outer]
        } else if let Some(fixed) = layout.fixed_log_lambdas[physical] {
            fixed
        } else {
            return Err(CustomFamilyError::ConstraintViolation {
                reason: format!(
                    "fixed penalty layout missing value at physical slot {physical}"
                ),
            }
            .to_string());
        };
    }
    Ok(expanded)
}

/// Expands `rho` to physical slots with `expand_labeled_log_lambdas`, then
/// cuts the result into per-block vectors with `split_log_lambdas` using the
/// layout's `penalty_counts`.
///
/// # Errors
/// Propagates the error of whichever step fails.
pub(crate) fn split_labeled_log_lambdas(
    rho: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Vec<Array1<f64>>, String> {
    expand_labeled_log_lambdas(rho, layout)
        .and_then(|physical| split_log_lambdas(&physical, &layout.penalty_counts))
}

/// Folds a per-physical-slot gradient into outer coordinates.
///
/// Entries of slots tied to the same outer coordinate are summed; entries of
/// slots without an outer coordinate are dropped.
///
/// # Errors
/// Dimension-mismatch message when `gradient` does not have one entry per
/// physical slot.
pub(crate) fn aggregate_labeled_gradient(
    gradient: &Array1<f64>,
    layout: &PenaltyLabelLayout,
) -> Result<Array1<f64>, String> {
    let n_physical = layout.physical_count();
    if gradient.len() != n_physical {
        let err = CustomFamilyError::DimensionMismatch {
            reason: format!(
                "physical gradient length mismatch: got {}, expected {}",
                gradient.len(),
                n_physical
            ),
        };
        return Err(err.into());
    }

    let mut folded = Array1::<f64>::zeros(layout.initial_rho.len());
    // Lengths were just checked equal, so the zip visits every slot.
    for (slot, &value) in layout.physical_to_outer.iter().zip(gradient.iter()) {
        if let Some(outer) = *slot {
            folded[outer] += value;
        }
    }
    Ok(folded)
}