// gam 0.3.104 - Docs.rs

//! Generic penalized vector-response GLM Newton solver (fixed λ).
//!
//! This is the shared scaffold extracted from
//! [`crate::families::multinomial::fit_penalized_multinomial`] (dense softmax
//! Fisher block) and
//! [`crate::families::binomial_multi::fit_penalized_binomial_multi`]
//! (row-diagonal independent-binomial Fisher block). Both families fit a
//! penalized vector-response GLM with a shared design `X ∈ ℝ^{N×P}` and a
//! shared penalty `S ∈ ℝ^{P×P}` replicated per output, differing **only** in
//! the per-row Fisher-block algebra and the likelihood/residual. Everything
//! else — input validation, penalized objective / gradient / Hessian assembly,
//! damped Newton with backtracking, the relative-step convergence test, and the
//! final penalized-objective / deviance tally — is written once here.
//!
//! # Fit problem
//!
//! With `β = [β_0; β_1; …; β_{M-1}]` stacked in output-major order
//! (`β_a ∈ ℝ^P` is the coefficient block for output `a`), minimise the
//! penalized negative log-likelihood
//!
//! ```text
//!   F(β) = − log L(β) + ½ Σ_{a=0}^{M-1} λ_a · β_aᵀ S β_a
//! ```
//!
//! where `log L` and its η-derivatives are supplied by the family's
//! [`VectorLikelihood`] adapter and `λ_a` is a per-output smoothing parameter
//! scaling the shared penalty `S`. The active linear predictor is
//! `η_{n,a} = (X β_a)_n`, shape `(N, M)`.
//!
//! # Newton step
//!
//! Each iteration assembles the coupled penalized Hessian and gradient in
//! output-major coefficient ordering `flat[a·P + i] = β[i, a]` (matching
//! [`crate::solver::pirls::dense_block_xtwx`]):
//!
//! ```text
//!   H[a·P + i, b·P + j] = Σ_n W_{n,a,b} · X[n,i] · X[n,j]   (+ δ_{ab} λ_a S[i,j])
//!   g[a·P + i]          = Σ_n r_{n,a} · X[n,i]              (+ λ_a (S β_a)[i])
//! ```
//!
//! with the per-row Fisher block `W_{n,·,·} = −∂² log L / ∂η ∂η` (the family's
//! [`VectorLikelihood::hess_block`], or a caller override) and the residual
//! `r_{n,a} = −∂ log L / ∂η_a` (`−`[`VectorLikelihood::grad_eta`]). The step
//! `δ = − (H + τ·I)^{-1} g` is solved through faer's symmetric-PD-with-fallback
//! factorisation under an adaptive Levenberg–Marquardt ridge `τ·I`. Every solve
//! starts from a minimal ridge `τ₀ = 1e-10 · max_i |H_ii|` (scaled by the
//! Hessian's largest diagonal so it is curvature-scale invariant; plain `1e-10`
//! when that diagonal is zero or non-finite): a rank-deficient block
//! (collinear / quasi-separated columns under a small per-output λ) would
//! otherwise let the Bunch–Kaufman fallback back-substitute through near-zero
//! pivots into an arbitrary or non-finite δ. While δ is non-finite, τ is
//! doubled and the system re-solved, for at most 30 escalations. The step is
//! then accepted by a backtracking line search on `F` (full step first, halve
//! up to 8 times). Because the line search validates against the *unridged*
//! objective `F`, the ridge does not bias the converged β̂ (at the optimum the
//! gradient vanishes and δ → 0 for any τ). Convergence requires both the
//! relative coefficient step `‖Δβ‖ / (1 + ‖β‖) ≤ tol`, with `Δβ` the accepted
//! (possibly damped) step, and a first-order gate on the unridged penalized
//! gradient at the pre-step β: `‖g‖ ≤ 1e-6 · (1 + max_i |H_ii|)`.
//!
//! # Fisher-block override
//!
//! When `fisher_w_override` is `Some`, each Newton step uses the supplied
//! per-row `(N, M, M)` curvature block in place of the analytic
//! [`VectorLikelihood::hess_block`]; the gradient/residual path stays analytic
//! (issue #349). The two families differ in what they accept off the diagonal:
//! multinomial admits a full dense block, while independent-binomial columns
//! only consume the per-output diagonal (a non-zero cross term cannot be
//! represented by the separable columns). That family-specific precondition is
//! enforced by the adapter before it constructs the override view; the engine
//! consumes whatever block it is given.

use crate::faer_ndarray::{FaerArrayView, array2_to_matmut, factorize_symmetricwith_fallback};
use crate::families::vector_response::VectorLikelihood;
use crate::pirls::dense_block_xtwx;
use crate::solver::estimate::EstimationError;
use faer::Side;
use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayView3};

/// Inputs to [`fit_penalized_vector_glm`].
///
/// `M` (the number of active outputs / linear-predictor columns) is taken from
/// `lambdas.len()`; the engine validates it against the design and override
/// shapes. The response `y` is passed verbatim to the [`VectorLikelihood`]
/// adapter, which owns its own `(N, ·)` shape contract (binomial columns use
/// `K = M`; multinomial one-hot uses `K = M + 1`), so the engine does not
/// constrain its column count beyond `y.nrows() == N`.
///
/// All array fields are borrowed views; the engine consumes this struct by
/// value but never takes ownership of the underlying data.
pub struct PenalizedVectorGlmInputs<'a> {
    /// Design matrix `X ∈ ℝ^{N×P}` (one row per observation, shared across
    /// every output column). Must be nonempty (`N > 0`, `P > 0`) and every
    /// entry must be finite, otherwise the fit is rejected up front.
    pub design: ArrayView2<'a, f64>,
    /// Response `Y ∈ ℝ^{N×·}`, interpreted by the [`VectorLikelihood`]. Only
    /// its row count is checked by the engine.
    pub y: ArrayView2<'a, f64>,
    /// Shared smoothing penalty `S ∈ ℝ^{P×P}` (symmetric, PSD). The engine
    /// checks the shape only; symmetry / definiteness are the caller's
    /// responsibility.
    pub penalty: ArrayView2<'a, f64>,
    /// Per-output smoothing parameter `λ_a`, length `M ≥ 1`. Every entry must
    /// be finite and `≥ 0`; an exact `0.0` skips the penalty for that output.
    pub lambdas: ArrayView1<'a, f64>,
    /// Optional per-row Fisher-block override, shape `(N, M, M)`. When `Some`,
    /// it replaces the analytic [`VectorLikelihood::hess_block`] as the Newton
    /// curvature; the gradient/residual path stays analytic (issue #349). The
    /// adapter is responsible for any family-specific structural precondition
    /// on the block (e.g. zero off-diagonals for independent columns).
    pub fisher_w_override: Option<ArrayView3<'a, f64>>,
    /// Maximum Newton iterations; recommend 50. With `0` the Newton loop is
    /// skipped entirely and the all-zero starting coefficients are returned
    /// with `converged = false`.
    pub max_iter: usize,
    /// Relative-step convergence tolerance; recommend 1e-7. Compared against
    /// the accepted step norm divided by `1 + ‖β‖`. The engine does not
    /// validate this value.
    pub tol: f64,
}

/// Outputs of [`fit_penalized_vector_glm`].
///
/// All quantities are evaluated at the returned `coefficients`, whether or not
/// the iteration converged; check `converged` before trusting them as an
/// optimum.
pub struct PenalizedVectorGlmOutputs {
    /// Coefficient matrix, shape `(P, M)` (column `a` is `β_a`).
    pub coefficients: Array2<f64>,
    /// Final active linear predictor `η = X β̂`, shape `(N, M)`. The adapter
    /// turns this into fitted probabilities via its own inverse link.
    pub eta: Array2<f64>,
    /// Number of Newton iterations executed (including the final step that
    /// satisfied the convergence test). `0` only when `max_iter == 0`.
    pub iterations: usize,
    /// `true` if, within `max_iter` iterations, both the relative-step test
    /// and the first-order gradient gate (penalized gradient norm small
    /// relative to the Hessian's largest diagonal) were satisfied.
    pub converged: bool,
    /// Unpenalized log-likelihood `log L(β̂)`.
    pub log_likelihood: f64,
    /// Penalty term `½ Σ_a λ_a · β̂_aᵀ S β̂_a` at the returned `β̂`.
    pub penalty_term: f64,
}

/// Total smoothing penalty `½ Σ_a λ_a · β_aᵀ S β_a`.
///
/// `beta` is `(P, M)` with one coefficient column per output, `penalty` is the
/// shared `S`, and `lambdas[a]` weights output `a`. Outputs whose `λ_a` is
/// exactly zero contribute nothing and are skipped without touching `S`. Used
/// both by the line-search objective and by the final tally.
fn weighted_penalty_sum(
    beta: &Array2<f64>,
    penalty: ArrayView2<'_, f64>,
    lambdas: ArrayView1<'_, f64>,
) -> f64 {
    let (n_coef, n_out) = beta.dim();
    (0..n_out).fold(0.0_f64, |total, out| {
        let lambda = lambdas[out];
        if lambda == 0.0 {
            return total;
        }
        let coef = beta.column(out);
        // βᵀ S β accumulated row by row: Σ_r β_r · (S β)_r.
        let quad = (0..n_coef).fold(0.0_f64, |acc, r| {
            let s_beta_r = (0..n_coef).fold(0.0_f64, |dot, c| dot + penalty[[r, c]] * coef[c]);
            acc + coef[r] * s_beta_r
        });
        total + 0.5 * lambda * quad
    })
}

/// Fit a penalized vector-response GLM at fixed `λ` via damped Newton.
///
/// The `likelihood` adapter supplies the per-row Fisher block, the residual
/// gradient, and the log-likelihood; the engine owns the entire optimisation
/// scaffold. See the module docs for the optimisation problem, the
/// output-major coefficient ordering, and the convergence semantics.
///
/// `context` is woven into every diagnostic message so each family keeps its
/// own error prefix (e.g. `"fit_penalized_multinomial"`).
pub fn fit_penalized_vector_glm<L: VectorLikelihood>(
    inputs: PenalizedVectorGlmInputs<'_>,
    likelihood: &L,
    context: &str,
) -> Result<PenalizedVectorGlmOutputs, EstimationError> {
    let PenalizedVectorGlmInputs {
        design,
        y,
        penalty,
        lambdas,
        fisher_w_override,
        max_iter,
        tol,
    } = inputs;

    // ────────────────────────────── shape checks ──────────────────────────
    let n_obs = design.nrows();
    let p = design.ncols();
    if n_obs == 0 || p == 0 {
        crate::bail_invalid_estim!("{context}: design must be nonempty (got {n_obs}x{p})");
    }
    let m = lambdas.len();
    if m == 0 {
        crate::bail_invalid_estim!("{context}: need at least one active output (got M=0)");
    }
    if y.nrows() != n_obs {
        crate::bail_invalid_estim!("{context}: y rows {} ≠ design rows {n_obs}", y.nrows());
    }
    if penalty.dim() != (p, p) {
        crate::bail_invalid_estim!(
            "{context}: penalty shape {:?} ≠ (P, P) = ({p}, {p})",
            penalty.dim()
        );
    }
    for (i, &v) in lambdas.iter().enumerate() {
        if !(v.is_finite() && v >= 0.0) {
            crate::bail_invalid_estim!("{context}: lambdas[{i}] must be finite and ≥ 0 (got {v})");
        }
    }
    if let Some(fw) = fisher_w_override.as_ref() {
        if fw.dim() != (n_obs, m, m) {
            crate::bail_invalid_estim!(
                "{context}: fisher_w_override shape {:?} ≠ (N, M, M) = ({n_obs}, {m}, {m})",
                fw.dim()
            );
        }
    }
    for ((i, j), &v) in design.indexed_iter() {
        if !v.is_finite() {
            crate::bail_invalid_estim!("{context}: design[{i},{j}] must be finite (got {v})");
        }
    }

    // ────────────────────────── Newton iteration ──────────────────────────
    // β stored as (P, M) column-major-per-output; flat index uses output-major
    // ordering `flat[a · P + i] = β[i, a]` to align with `dense_block_xtwx`.
    let mut beta = Array2::<f64>::zeros((p, m));
    let mut eta = Array2::<f64>::zeros((n_obs, m));
    let beta_flat_dim = p * m;

    let mut iterations = 0usize;
    let mut converged = false;
    let mut last_objective = f64::INFINITY;

    // η = X · β for the current β, reused by the analytic Fisher / gradient.
    let recompute_eta = |beta: &Array2<f64>, eta: &mut Array2<f64>| {
        for a in 0..m {
            let beta_col = beta.column(a);
            for row in 0..n_obs {
                let mut eta_val = 0.0_f64;
                for i in 0..p {
                    eta_val += design[[row, i]] * beta_col[i];
                }
                eta[[row, a]] = eta_val;
            }
        }
    };

    // Penalized objective F(β) = − log L(X β) + ½ Σ_a λ_a β_aᵀ S β_a.
    let evaluate_objective = |beta_trial: &Array2<f64>| -> f64 {
        let mut eta_trial = Array2::<f64>::zeros((n_obs, m));
        for a in 0..m {
            let beta_col = beta_trial.column(a);
            for row in 0..n_obs {
                let mut v = 0.0_f64;
                for i in 0..p {
                    v += design[[row, i]] * beta_col[i];
                }
                eta_trial[[row, a]] = v;
            }
        }
        let ll = likelihood.log_lik(eta_trial.view(), y);
        let pen = weighted_penalty_sum(beta_trial, penalty, lambdas);
        -ll + pen
    };

    for iter in 0..max_iter {
        iterations = iter + 1;

        recompute_eta(&beta, &mut eta);

        // Per-row dense Fisher block W_{n,a,b} = −∂² log L / ∂η_a ∂η_b: either
        // the caller-supplied curvature override (issue #349 escape-hatch —
        // curvature only) or the analytic [`VectorLikelihood::hess_block`]. The
        // residual r_{n,a} = −∂ log L / ∂η_a stays analytic in both cases.
        let analytic_fisher = fisher_w_override
            .as_ref()
            .map_or_else(|| Some(likelihood.hess_block(eta.view(), y)), |_| None);
        let fisher_blocks = match fisher_w_override.as_ref() {
            Some(fw) => *fw,
            None => analytic_fisher
                .as_ref()
                .expect("analytic Fisher computed when no override")
                .view(),
        };
        let residual = likelihood.grad_eta(eta.view(), y).mapv(|v| -v);

        // Penalized Hessian: H = block(XᵀWX) + diag_a(λ_a S).
        let mut hessian = dense_block_xtwx(design, fisher_blocks, None)?;
        if hessian.nrows() != beta_flat_dim || hessian.ncols() != beta_flat_dim {
            crate::bail_invalid_estim!(
                "{context}: assembled Hessian shape {:?} ≠ ({beta_flat_dim}, {beta_flat_dim})",
                hessian.dim()
            );
        }
        for a in 0..m {
            let la = lambdas[a];
            if la == 0.0 {
                continue;
            }
            let base = a * p;
            for i in 0..p {
                for j in 0..p {
                    hessian[[base + i, base + j]] += la * penalty[[i, j]];
                }
            }
        }

        // Penalized gradient: g_a = Xᵀ r_{·,a} + λ_a S β_a.
        let mut grad_flat = Array1::<f64>::zeros(beta_flat_dim);
        for a in 0..m {
            for i in 0..p {
                let mut acc = 0.0_f64;
                for row in 0..n_obs {
                    acc += design[[row, i]] * residual[[row, a]];
                }
                grad_flat[a * p + i] = acc;
            }
        }
        for a in 0..m {
            let la = lambdas[a];
            if la == 0.0 {
                continue;
            }
            let beta_col = beta.column(a);
            for i in 0..p {
                let mut s_beta_i = 0.0_f64;
                for j in 0..p {
                    s_beta_i += penalty[[i, j]] * beta_col[j];
                }
                grad_flat[a * p + i] += la * s_beta_i;
            }
        }

        // δ = − H^{-1} · grad, solved through an adaptive Levenberg–Marquardt
        // ridge. The penalized Hessian `H = block(XᵀWX) + diag_a(λ_a S)` can be
        // rank-deficient — a multinomial class block with quasi-separated /
        // collinear columns and a small per-class λ leaves `XᵀW_aX + λ_a S`
        // singular. faer's symmetric fallback chain ends at Bunch–Kaufman
        // (LBLᵀ), which factorizes indefinite/singular matrices "successfully"
        // and then back-substitutes through near-zero pivots, yielding a
        // non-finite δ. Rather than aborting the whole fit on one bad block, we
        // add a small ridge `τ·I` (Levenberg style) to the diagonal and
        // re-factorize, escalating τ geometrically until the step is finite.
        //
        // The base ridge is scaled by the Hessian's largest diagonal entry so
        // it is invariant to the problem's overall curvature scale: a tiny
        // nudge relative to the dominant curvature, large enough to lift the
        // null directions off zero. A finite δ from the ridged system is a
        // descent direction for the *unridged* penalized objective `F`
        // (ridging only shrinks the step toward the gradient direction), and
        // the backtracking line search below validates it against `F` itself,
        // so the ridge never biases the converged β̂ — at the optimum the
        // gradient vanishes and the step → 0 regardless of τ.
        let max_diag =
            (0..beta_flat_dim).fold(0.0_f64, |acc, idx| acc.max(hessian[[idx, idx]].abs()));
        let base_ridge = if max_diag.is_finite() && max_diag > 0.0 {
            max_diag * 1.0e-10
        } else {
            1.0e-10
        };
        // 30 doublings span ~9 orders of magnitude over the base ridge, which
        // covers any conditioning a finite-curvature softmax/binomial block can
        // present.
        //
        // The ridge floors at `base_ridge` (not 0) for every solve. An exactly
        // rank-deficient block (e.g. duplicate / collinear design columns under
        // a near-zero λ) leaves `H = block(XᵀWX) + diag_a(λ_a S)` singular along
        // a null direction. faer's Bunch–Kaufman fallback factorizes a singular
        // matrix "successfully" and back-substitutes through the zero pivot to a
        // *finite but arbitrary* component in the null space, so the resulting
        // Newton direction is not a descent direction in the identified
        // subspace — the line search then shrinks α toward 0 and the step-norm
        // test declares a false convergence at a point where the unridged
        // penalized gradient on identified directions is still large (gam#856).
        // A minimal Tikhonov ridge `base_ridge·I = max_diag·1e-10·I` resolves the
        // null direction to its minimum-norm representative, giving a true
        // descent direction. Because `base_ridge` is ~1e-10 of the dominant
        // curvature, it is negligible relative to identified-direction curvature,
        // so it never biases the identified optimum: at β̂ the unridged gradient
        // still vanishes there.
        const MAX_RIDGE_ESCALATIONS: usize = 30;
        let mut delta = Array1::<f64>::zeros(beta_flat_dim);
        let mut ridge = base_ridge;
        let mut solved = false;
        for attempt in 0..=MAX_RIDGE_ESCALATIONS {
            let mut ridged = hessian.clone();
            if ridge > 0.0 {
                for idx in 0..beta_flat_dim {
                    ridged[[idx, idx]] += ridge;
                }
            }
            let factor = match factorize_symmetricwith_fallback(
                FaerArrayView::new(&ridged).as_ref(),
                Side::Lower,
            ) {
                Ok(factor) => factor,
                Err(err) => {
                    // A genuine factorization failure (not just a singular
                    // pivot) — escalate the ridge and retry; only give up after
                    // exhausting the escalation budget.
                    if attempt == MAX_RIDGE_ESCALATIONS {
                        return Err(EstimationError::InvalidInput(format!(
                            "{context}: Hessian factorization failed at iter {iter} \
                             even with ridge {ridge:.3e}: {err}"
                        )));
                    }
                    ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
                    continue;
                }
            };
            let mut rhs = Array2::<f64>::zeros((beta_flat_dim, 1));
            for i in 0..beta_flat_dim {
                rhs[[i, 0]] = -grad_flat[i];
            }
            {
                let rhs_view = array2_to_matmut(&mut rhs);
                factor.solve_in_place(rhs_view);
            }
            if (0..beta_flat_dim).all(|i| rhs[[i, 0]].is_finite()) {
                for i in 0..beta_flat_dim {
                    delta[i] = rhs[[i, 0]];
                }
                solved = true;
                break;
            }
            // Singular pivots back-substituted to ±inf/NaN: escalate the ridge.
            ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
        }
        assert!(
            solved,
            "{context}: Newton step remained non-finite at iter {iter} after {} ridge \
             escalations up to {ridge:.3e}; the penalized Hessian is pathologically \
             rank-deficient (grad_norm={:.3e}, max_diag={max_diag:.3e})",
            MAX_RIDGE_ESCALATIONS,
            grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt(),
        );

        // Damped acceptance: full step first, halve up to 8 times if the
        // penalized negative log-likelihood fails to decrease. The first
        // iteration seeds `last_objective` from the initial β.
        let proposed_beta = |alpha: f64| -> Array2<f64> {
            let mut out = beta.clone();
            for a in 0..m {
                for i in 0..p {
                    out[[i, a]] += alpha * delta[a * p + i];
                }
            }
            out
        };
        if iter == 0 {
            last_objective = evaluate_objective(&beta);
            if !last_objective.is_finite() {
                crate::bail_invalid_estim!("{context}: non-finite objective at β = 0");
            }
        }
        let mut alpha = 1.0_f64;
        let mut accepted_beta = proposed_beta(alpha);
        let mut new_objective = evaluate_objective(&accepted_beta);
        let mut backtrack = 0usize;
        while (!new_objective.is_finite() || new_objective > last_objective + 1.0e-12)
            && backtrack < 8
        {
            alpha *= 0.5;
            accepted_beta = proposed_beta(alpha);
            new_objective = evaluate_objective(&accepted_beta);
            backtrack += 1;
        }

        let mut step_norm_sq = 0.0_f64;
        let mut beta_norm_sq = 0.0_f64;
        for a in 0..m {
            for i in 0..p {
                let d = accepted_beta[[i, a]] - beta[[i, a]];
                step_norm_sq += d * d;
                let v = accepted_beta[[i, a]];
                beta_norm_sq += v * v;
            }
        }

        beta = accepted_beta;
        last_objective = new_objective;

        let step_norm = step_norm_sq.sqrt();
        let beta_norm = beta_norm_sq.sqrt();
        // First-order optimality gate (gam#856): the step-norm test alone can
        // fire prematurely when a backtracking line search has shrunk α on a
        // poor direction, leaving a point that is NOT stationary. `grad_flat`
        // is the unridged penalized gradient ∇F(β) at the pre-step β; with a
        // small step it is ≈ ∇F at the accepted β. Its norm reflects only
        // identified directions (it is exactly zero along an unidentified null
        // direction such as a duplicate-column e₁−e₂ split), so requiring it to
        // be small certifies first-order optimality on the identified subspace
        // without penalizing legitimate non-identifiability. Scale the gate by
        // the data magnitude so it is invariant to problem scale.
        let grad_norm = grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt();
        // Curvature-scaled optimality threshold: `max_diag` is the dominant
        // penalized-Hessian diagonal entry, so `1e-6·max_diag` is a tiny
        // gradient relative to the problem's curvature scale and is reached by
        // a few quadratically-converging Newton steps on this smooth, bounded
        // softmax/binomial likelihood.
        let grad_optimal = grad_norm <= 1.0e-6 * (1.0 + max_diag);
        if step_norm <= tol * (1.0 + beta_norm) && grad_optimal {
            converged = true;
            break;
        }
    }

    // ──────────────────────────── post-process ────────────────────────────
    recompute_eta(&beta, &mut eta);
    let log_likelihood = likelihood.log_lik(eta.view(), y);
    let penalty_term = weighted_penalty_sum(&beta, penalty, lambdas);

    Ok(PenalizedVectorGlmOutputs {
        coefficients: beta,
        eta,
        iterations,
        converged,
        log_likelihood,
        penalty_term,
    })
}

#[cfg(test)]
mod parity_tests {
    //! Parity tests for the shared scaffold across both Fisher-block families
    //! (issue #409). The engine is exercised through the two public adapters —
    //! [`crate::families::binomial_multi::fit_penalized_binomial_multi`]
    //! (row-diagonal block) and
    //! [`crate::families::multinomial::fit_penalized_multinomial`] (dense
    //! softmax block) — and we assert, with un-weakened bounds, that:
    //!
    //!   1. each fit hits the first-order optimality condition `∇F(β̂) = 0`,
    //!      verified by a central finite difference of the penalized objective
    //!      (the engine never sees this gradient, so this is an independent
    //!      check that the shared Newton scaffold converged correctly);
    //!   2. the reported fitted probabilities are consistent with `β̂` and the
    //!      reported deviance equals `−2 · log L(β̂)`;
    //!   3. for the binomial family, the `K`-column joint solve reproduces a
    //!      from-scratch single-column penalized logistic Newton solve column
    //!      for column (the row-diagonal block must decouple exactly).

    use crate::families::binomial_multi::{BinomialMultiFitInputs, fit_penalized_binomial_multi};
    use crate::families::multinomial::{MultinomialFitInputs, fit_penalized_multinomial};
    use ndarray::{Array1, Array2};

    /// Overflow-safe logistic function `σ(η) = 1 / (1 + e^{−η})`.
    ///
    /// Only `e^{−|η|}` is ever exponentiated, so the intermediate never
    /// overflows; which closed form is used depends on the sign of `η`.
    fn sigmoid(eta: f64) -> f64 {
        let decay = (-eta.abs()).exp();
        if eta >= 0.0 {
            1.0 / (1.0 + decay)
        } else {
            decay / (1.0 + decay)
        }
    }

    /// Softmax over `M` active linear predictors plus an implicit reference
    /// class pinned at `η_ref = 0`.
    ///
    /// Returns `M + 1` probabilities: the active classes in input order, then
    /// the reference class last. Everything is shifted by `max(0, max η)`
    /// before exponentiation so no term overflows.
    fn softmax_ref(eta_active: &[f64]) -> Vec<f64> {
        // The reference η = 0 takes part in the max, hence the 0.0 seed.
        let shift = eta_active
            .iter()
            .fold(0.0_f64, |hi, &v| if v > hi { v } else { hi });
        let ref_weight = (-shift).exp();

        let mut probs: Vec<f64> = eta_active.iter().map(|&v| (v - shift).exp()).collect();
        let total = probs.iter().fold(ref_weight, |acc, &w| acc + w);
        probs.push(ref_weight);

        for w in probs.iter_mut() {
            *w /= total;
        }
        probs
    }

    /// Reference penalized objective for the independent-binomial family,
    /// `F(β) = −log L(β) + ½ Σ_a λ_a β_aᵀ S β_a`, for `β ∈ ℝ^{P×K}`.
    ///
    /// Evaluated straight from the definition (no engine internals) so the
    /// tests have an independent oracle. Fitted means are clamped to
    /// `[1e-12, 1 − 1e-12]` before taking logs.
    fn binomial_objective(
        design: &Array2<f64>,
        y: &Array2<f64>,
        penalty: &Array2<f64>,
        lambdas: &Array1<f64>,
        beta: &Array2<f64>,
    ) -> f64 {
        let (n_rows, n_coef) = design.dim();
        let n_out = y.ncols();

        // Bernoulli log-likelihood summed over every (row, output) cell.
        let log_lik = (0..n_rows).fold(0.0_f64, |row_acc, r| {
            (0..n_out).fold(row_acc, |acc, c| {
                let eta = (0..n_coef).fold(0.0_f64, |s, i| s + design[[r, i]] * beta[[i, c]]);
                let mu = sigmoid(eta).clamp(1.0e-12, 1.0 - 1.0e-12);
                let target = y[[r, c]];
                acc + (target * mu.ln() + (1.0 - target) * (1.0 - mu).ln())
            })
        });

        // ½ λ_a β_aᵀ S β_a, accumulated one (output, row-of-S) term at a time.
        let ridge = (0..n_out).fold(0.0_f64, |out_acc, c| {
            let lambda = lambdas[c];
            (0..n_coef).fold(out_acc, |acc, i| {
                let s_beta_i = (0..n_coef).fold(0.0_f64, |s, j| s + penalty[[i, j]] * beta[[j, c]]);
                acc + 0.5 * lambda * beta[[i, c]] * s_beta_i
            })
        });

        -log_lik + ridge
    }

    /// Reference penalized objective for the multinomial family at an
    /// active-class coefficient matrix `β ∈ ℝ^{P×(K-1)}`; the last one-hot
    /// column is the implicit reference class.
    ///
    /// Probabilities are floored at `1e-300` before the log, and only non-zero
    /// one-hot entries contribute to the log-likelihood.
    fn multinomial_objective(
        design: &Array2<f64>,
        y_one_hot: &Array2<f64>,
        penalty: &Array2<f64>,
        lambdas: &Array1<f64>,
        beta: &Array2<f64>,
    ) -> f64 {
        let (n_rows, n_coef) = design.dim();
        let n_classes = y_one_hot.ncols();
        let n_active = n_classes - 1;

        let log_lik = (0..n_rows).fold(0.0_f64, |row_acc, r| {
            let eta_active: Vec<f64> = (0..n_active)
                .map(|a| (0..n_coef).fold(0.0_f64, |s, i| s + design[[r, i]] * beta[[i, a]]))
                .collect();
            let probs = softmax_ref(&eta_active);
            (0..n_classes).fold(row_acc, |acc, c| {
                let weight = y_one_hot[[r, c]];
                if weight == 0.0 {
                    acc
                } else {
                    acc + weight * probs[c].max(1.0e-300).ln()
                }
            })
        });

        // ½ λ_a β_aᵀ S β_a over the active classes only.
        let ridge = (0..n_active).fold(0.0_f64, |class_acc, a| {
            let lambda = lambdas[a];
            (0..n_coef).fold(class_acc, |acc, i| {
                let s_beta_i = (0..n_coef).fold(0.0_f64, |s, j| s + penalty[[i, j]] * beta[[j, a]]);
                acc + 0.5 * lambda * beta[[i, a]] * s_beta_i
            })
        });

        -log_lik + ridge
    }

    /// Largest absolute component of the central finite-difference gradient of
    /// `f` at `beta`, probing every entry of the `(P, C)` matrix with step
    /// `1e-6`.
    ///
    /// At an optimum every component should be ~0, so callers assert the
    /// returned maximum against an un-weakened bound.
    fn fd_grad<F: Fn(&Array2<f64>) -> f64>(beta: &Array2<f64>, f: F) -> f64 {
        const STEP: f64 = 1.0e-6;
        // One scratch copy, perturbed in place and restored after each probe.
        let mut probe = beta.clone();
        let mut worst = 0.0_f64;
        for ((i, a), &center) in beta.indexed_iter() {
            probe[[i, a]] = center + STEP;
            let f_up = f(&probe);
            probe[[i, a]] = center - STEP;
            let f_dn = f(&probe);
            probe[[i, a]] = center;
            let slope = (f_up - f_dn) / (2.0 * STEP);
            worst = worst.max(slope.abs());
        }
        worst
    }

    /// Deterministic independent-binomial fixture: `(design, y, penalty, λ)`
    /// with 40 rows, 3 design columns (intercept, sine, cosine), 3 binary
    /// response columns, an identity penalty and distinct per-output λ.
    fn binomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
        const N_ROWS: usize = 40;
        const N_COEF: usize = 3;
        const N_OUT: usize = 3;

        let design = Array2::<f64>::from_shape_fn((N_ROWS, N_COEF), |(row, col)| {
            let t = (row + 1) as f64;
            if col == 0 {
                1.0
            } else if col == 1 {
                (t * 0.37).sin()
            } else {
                (t * 0.11).cos()
            }
        });
        // Deterministic but non-degenerate {0,1} labels per column.
        let y = Array2::<f64>::from_shape_fn((N_ROWS, N_OUT), |(row, out)| {
            match (row * 7 + out * 13 + 3) % 5 {
                0..=2 => 1.0,
                _ => 0.0,
            }
        });

        (
            design,
            y,
            Array2::<f64>::eye(N_COEF),
            Array1::from(vec![0.3_f64, 1.2, 2.5]),
        )
    }

    /// Deterministic multinomial fixture: `(design, one-hot y, penalty, λ)`
    /// with 45 rows, 3 design columns (intercept, sine, cosine), 4 classes
    /// (so 3 active λ) and an identity penalty.
    fn multinomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
        const N_ROWS: usize = 45;
        const N_COEF: usize = 3;
        const N_CLASSES: usize = 4;

        let design = Array2::<f64>::from_shape_fn((N_ROWS, N_COEF), |(row, col)| {
            let t = (row + 2) as f64;
            if col == 0 {
                1.0
            } else if col == 1 {
                (t * 0.29).sin()
            } else {
                (t * 0.17).cos()
            }
        });
        // Exactly one class per row, cycling deterministically through all K.
        let y = Array2::<f64>::from_shape_fn((N_ROWS, N_CLASSES), |(row, class)| {
            if class == (row * 3 + 1) % N_CLASSES {
                1.0
            } else {
                0.0
            }
        });

        (
            design,
            y,
            Array2::<f64>::eye(N_COEF),
            Array1::from(vec![0.5_f64, 1.0, 2.0]),
        )
    }

    /// The binomial adapter must converge to a stationary point of the
    /// penalized objective and report probabilities / deviance consistent
    /// with the returned coefficients.
    #[test]
    fn binomial_engine_hits_optimum_and_is_self_consistent() {
        let (design, y, penalty, lambdas) = binomial_fixture();
        let inputs = BinomialMultiFitInputs {
            design: design.view(),
            y: y.view(),
            penalty: penalty.view(),
            lambdas: lambdas.view(),
            row_weights: None,
            fisher_w_override: None,
            max_iter: 100,
            tol: 1.0e-12,
        };
        let fit = fit_penalized_binomial_multi(inputs).expect("binomial fit must succeed");
        assert!(fit.converged, "binomial fit must converge");

        // First-order optimality: the finite-difference gradient of the
        // independently evaluated objective must vanish at β̂.
        let g = fd_grad(&fit.coefficients, |b| {
            binomial_objective(&design, &y, &penalty, &lambdas, b)
        });
        assert!(
            g < 1.0e-6,
            "binomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
        );

        // Rebuild σ(X β̂) cell by cell; it must match the reported
        // probabilities, and the clamped log-likelihood must give the deviance.
        let (n, p) = design.dim();
        let k = y.ncols();
        let mut log_lik = 0.0_f64;
        for (row, a) in (0..n).flat_map(|r| (0..k).map(move |c| (r, c))) {
            let eta = (0..p).fold(0.0_f64, |acc, i| {
                acc + design[[row, i]] * fit.coefficients[[i, a]]
            });
            let mu = sigmoid(eta);
            let gap = (fit.fitted_probabilities[[row, a]] - mu).abs();
            assert!(gap < 1.0e-10, "fitted probability must equal σ(X β̂)");

            let muc = mu.clamp(1.0e-12, 1.0 - 1.0e-12);
            let yv = y[[row, a]];
            log_lik += yv * muc.ln() + (1.0 - yv) * (1.0 - muc).ln();
        }
        let deviance_gap = (fit.deviance - (-2.0 * log_lik)).abs();
        assert!(deviance_gap < 1.0e-9, "deviance must equal −2 log L");
    }

    /// Parity: with a row-diagonal Fisher block the K-column joint solve must
    /// reproduce, column for column, an independent single-column penalized
    /// logistic solve. This is the defining property the shared engine
    /// preserves for the independent-binomial family.
    #[test]
    fn binomial_joint_solve_decouples_into_single_column_solves() {
        let (design, y, penalty, lambdas) = binomial_fixture();
        let joint_inputs = BinomialMultiFitInputs {
            design: design.view(),
            y: y.view(),
            penalty: penalty.view(),
            lambdas: lambdas.view(),
            row_weights: None,
            fisher_w_override: None,
            max_iter: 100,
            tol: 1.0e-12,
        };
        let joint = fit_penalized_binomial_multi(joint_inputs).expect("joint fit must succeed");

        let n_coef = design.ncols();
        for a in 0..y.ncols() {
            // Single-column problem: one binomial response, one λ.
            let y_col = y.column(a).to_owned().insert_axis(ndarray::Axis(1));
            let lam = Array1::from_elem(1, lambdas[a]);
            let single_inputs = BinomialMultiFitInputs {
                design: design.view(),
                y: y_col.view(),
                penalty: penalty.view(),
                lambdas: lam.view(),
                row_weights: None,
                fisher_w_override: None,
                max_iter: 100,
                tol: 1.0e-12,
            };
            let single = fit_penalized_binomial_multi(single_inputs)
                .expect("single-column fit must succeed");

            for i in 0..n_coef {
                let dj = joint.coefficients[[i, a]];
                let ds = single.coefficients[[i, 0]];
                let gap = (dj - ds).abs();
                assert!(
                    gap < 1.0e-8,
                    "joint column {a} coef {i} ({dj}) must match single-column solve ({ds})"
                );
            }
        }
    }

    #[test]
    fn multinomial_engine_hits_optimum_and_is_self_consistent() {
        let (design, y, penalty, lambdas) = multinomial_fixture();
        let inputs = MultinomialFitInputs {
            design: design.view(),
            y_one_hot: y.view(),
            penalty: penalty.view(),
            lambdas: lambdas.view(),
            row_weights: None,
            fisher_w_override: None,
            max_iter: 100,
            tol: 1.0e-12,
        };
        let fit = fit_penalized_multinomial(inputs).expect("multinomial fit must succeed");
        assert!(fit.converged, "multinomial fit must converge");

        // Stationarity: the finite-difference gradient of the independently
        // written reference objective must vanish at the fitted coefficients.
        let max_abs_grad = fd_grad(&fit.coefficients_active, |b| {
            multinomial_objective(&design, &y, &penalty, &lambdas, b)
        });
        assert!(
            max_abs_grad < 1.0e-6,
            "multinomial penalized gradient at β̂ must vanish (max |∂F| = {g})",
            g = max_abs_grad
        );

        // Self-consistency: recompute the active linear predictors row by row,
        // push them through the reference softmax, and require that the fit's
        // probabilities match, that each row sums to one, and that the reported
        // deviance equals −2 × the log-likelihood accumulated here.
        let (n_rows, n_coef) = design.dim();
        let n_classes = y.ncols();
        let n_active = n_classes - 1;
        let mut eta_active = vec![0.0_f64; n_active];
        let mut log_lik = 0.0_f64;
        for row in 0..n_rows {
            // η_{row,a} = Σ_i X[row,i] · β̂[i,a], accumulated left to right.
            for (a, slot) in eta_active.iter_mut().enumerate() {
                *slot = (0..n_coef).fold(0.0_f64, |acc, i| {
                    acc + design[[row, i]] * fit.coefficients_active[[i, a]]
                });
            }
            let probs = softmax_ref(&eta_active);

            let mut row_sum = 0.0_f64;
            for c in 0..n_classes {
                let fitted = fit.fitted_probabilities[[row, c]];
                assert!(
                    (fitted - probs[c]).abs() < 1.0e-10,
                    "fitted probability must equal softmax(X β̂)"
                );
                row_sum += fitted;

                // Only classes with a nonzero response contribute; the floor
                // keeps ln() finite if a reference probability underflows.
                let yc = y[[row, c]];
                if yc != 0.0 {
                    log_lik += yc * probs[c].max(1.0e-300).ln();
                }
            }
            assert!(
                (row_sum - 1.0).abs() < 1.0e-10,
                "fitted probabilities must sum to 1 per row"
            );
        }

        let expected_deviance = -2.0 * log_lik;
        assert!(
            (fit.deviance - expected_deviance).abs() < 1.0e-9,
            "deviance must equal −2 log L"
        );
    }

    #[test]
    fn multinomial_rank_deficient_block_recovers_via_ridge_not_crash() {
        // Regression test for issue #557. With a rank-deficient class block and
        // a tiny per-class λ, faer's Bunch–Kaufman fallback used to
        // back-substitute through near-zero pivots, yielding a non-finite Newton
        // step δ and the abort "Newton step is non-finite". The adaptive
        // Levenberg–Marquardt ridge is expected to lift the null direction off
        // zero so δ stays finite and the backtracking line search can proceed.
        //
        // Setup: design column 2 is an exact copy of column 1, hence XᵀWX is
        // singular along (e₁ − e₂) for every class. The penalty below has a
        // single nonzero entry at (3, 3), so it never touches that direction,
        // and λ = 1e-10 makes the remaining penalty negligible as well.
        let n = 50;
        let p = 4;
        let k = 4;
        let design = Array2::<f64>::from_shape_fn((n, p), |(row, col)| {
            let t = (row + 1) as f64;
            match col {
                0 => 1.0,
                // Columns 1 and 2 are deliberately identical.
                1 | 2 => (t * 0.23).sin(),
                _ => (t * 0.19).cos(),
            }
        });
        // One-hot response: row i is assigned to class (5i + 2) mod k.
        let y = Array2::<f64>::from_shape_fn((n, k), |(row, class)| {
            if class == (row * 5 + 2) % k {
                1.0
            } else {
                0.0
            }
        });
        // Only the last coefficient (the cosine column) is penalized; the
        // duplicated pair (columns 1 and 2) is left completely unregularized.
        let mut penalty = Array2::<f64>::zeros((p, p));
        penalty[[p - 1, p - 1]] = 1.0;
        // One smoothing parameter per active (non-reference) class.
        let lambdas = Array1::from(vec![1.0e-10_f64; k - 1]);

        let inputs = MultinomialFitInputs {
            design: design.view(),
            y_one_hot: y.view(),
            penalty: penalty.view(),
            lambdas: lambdas.view(),
            row_weights: None,
            fisher_w_override: None,
            max_iter: 200,
            tol: 1.0e-10,
        };
        let fit = fit_penalized_multinomial(inputs)
            .expect("rank-deficient multinomial fit must NOT crash (#557): the ridge path recovers it");

        // Nothing non-finite may leak out of the near-singular solve.
        fit.coefficients_active.iter().for_each(|&coef| {
            assert!(coef.is_finite(), "coefficient must be finite, got {c}", c = coef);
        });
        let simplex_band = -1.0e-9..=1.0 + 1.0e-9;
        for &prob in fit.fitted_probabilities.iter() {
            assert!(
                prob.is_finite() && simplex_band.contains(&prob),
                "fitted probability must be a finite simplex entry, got {pr}",
                pr = prob
            );
        }
        // Each row of fitted probabilities must still sum to one.
        for (row, row_probs) in fit.fitted_probabilities.outer_iter().enumerate() {
            let total: f64 = row_probs.iter().sum();
            assert!(
                (total - 1.0).abs() < 1.0e-9,
                "row {row} probabilities must sum to 1, got {s}",
                s = total
            );
        }

        // First-order optimality of the *unridged* penalized objective. Moving
        // along (e₁ − e₂) changes neither X β nor the penalty term, so F is flat
        // in that direction and only the identified directions can contribute
        // to the finite-difference gradient. The ridge is meant to bias the
        // Newton step, not the stationary point, so the gradient is required to
        // be small here, with a looser tolerance than in the full-rank tests.
        let max_abs_grad = fd_grad(&fit.coefficients_active, |b| {
            multinomial_objective(&design, &y, &penalty, &lambdas, b)
        });
        assert!(
            max_abs_grad < 1.0e-4,
            "penalized objective gradient at the ridge-recovered β̂ must (near-)vanish \
             along identified directions (max |∂F| = {g})",
            g = max_abs_grad
        );
    }
}