use crate::faer_ndarray::{FaerArrayView, array2_to_matmut, factorize_symmetricwith_fallback};
use crate::families::vector_response::VectorLikelihood;
use crate::pirls::dense_block_xtwx;
use crate::solver::estimate::EstimationError;
use faer::Side;
use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayView3};
pub struct PenalizedVectorGlmInputs<'a> {
pub design: ArrayView2<'a, f64>,
pub y: ArrayView2<'a, f64>,
pub penalty: ArrayView2<'a, f64>,
pub lambdas: ArrayView1<'a, f64>,
pub fisher_w_override: Option<ArrayView3<'a, f64>>,
pub max_iter: usize,
pub tol: f64,
}
pub struct PenalizedVectorGlmOutputs {
pub coefficients: Array2<f64>,
pub eta: Array2<f64>,
pub iterations: usize,
pub converged: bool,
pub log_likelihood: f64,
pub penalty_term: f64,
}
fn weighted_penalty_sum(
beta: &Array2<f64>,
penalty: ArrayView2<'_, f64>,
lambdas: ArrayView1<'_, f64>,
) -> f64 {
let (p, m) = beta.dim();
let mut pen = 0.0_f64;
for a in 0..m {
let la = lambdas[a];
if la == 0.0 {
continue;
}
let beta_col = beta.column(a);
let mut quad = 0.0_f64;
for i in 0..p {
let mut s_beta_i = 0.0_f64;
for j in 0..p {
s_beta_i += penalty[[i, j]] * beta_col[j];
}
quad += beta_col[i] * s_beta_i;
}
pen += 0.5 * la * quad;
}
pen
}
pub fn fit_penalized_vector_glm<L: VectorLikelihood>(
inputs: PenalizedVectorGlmInputs<'_>,
likelihood: &L,
context: &str,
) -> Result<PenalizedVectorGlmOutputs, EstimationError> {
let PenalizedVectorGlmInputs {
design,
y,
penalty,
lambdas,
fisher_w_override,
max_iter,
tol,
} = inputs;
let n_obs = design.nrows();
let p = design.ncols();
if n_obs == 0 || p == 0 {
crate::bail_invalid_estim!("{context}: design must be nonempty (got {n_obs}x{p})");
}
let m = lambdas.len();
if m == 0 {
crate::bail_invalid_estim!("{context}: need at least one active output (got M=0)");
}
if y.nrows() != n_obs {
crate::bail_invalid_estim!("{context}: y rows {} ≠ design rows {n_obs}", y.nrows());
}
if penalty.dim() != (p, p) {
crate::bail_invalid_estim!(
"{context}: penalty shape {:?} ≠ (P, P) = ({p}, {p})",
penalty.dim()
);
}
for (i, &v) in lambdas.iter().enumerate() {
if !(v.is_finite() && v >= 0.0) {
crate::bail_invalid_estim!("{context}: lambdas[{i}] must be finite and ≥ 0 (got {v})");
}
}
if let Some(fw) = fisher_w_override.as_ref() {
if fw.dim() != (n_obs, m, m) {
crate::bail_invalid_estim!(
"{context}: fisher_w_override shape {:?} ≠ (N, M, M) = ({n_obs}, {m}, {m})",
fw.dim()
);
}
}
for ((i, j), &v) in design.indexed_iter() {
if !v.is_finite() {
crate::bail_invalid_estim!("{context}: design[{i},{j}] must be finite (got {v})");
}
}
let mut beta = Array2::<f64>::zeros((p, m));
let mut eta = Array2::<f64>::zeros((n_obs, m));
let beta_flat_dim = p * m;
let mut iterations = 0usize;
let mut converged = false;
let mut last_objective = f64::INFINITY;
let recompute_eta = |beta: &Array2<f64>, eta: &mut Array2<f64>| {
for a in 0..m {
let beta_col = beta.column(a);
for row in 0..n_obs {
let mut eta_val = 0.0_f64;
for i in 0..p {
eta_val += design[[row, i]] * beta_col[i];
}
eta[[row, a]] = eta_val;
}
}
};
let evaluate_objective = |beta_trial: &Array2<f64>| -> f64 {
let mut eta_trial = Array2::<f64>::zeros((n_obs, m));
for a in 0..m {
let beta_col = beta_trial.column(a);
for row in 0..n_obs {
let mut v = 0.0_f64;
for i in 0..p {
v += design[[row, i]] * beta_col[i];
}
eta_trial[[row, a]] = v;
}
}
let ll = likelihood.log_lik(eta_trial.view(), y);
let pen = weighted_penalty_sum(beta_trial, penalty, lambdas);
-ll + pen
};
for iter in 0..max_iter {
iterations = iter + 1;
recompute_eta(&beta, &mut eta);
let analytic_fisher = fisher_w_override
.as_ref()
.map_or_else(|| Some(likelihood.hess_block(eta.view(), y)), |_| None);
let fisher_blocks = match fisher_w_override.as_ref() {
Some(fw) => *fw,
None => analytic_fisher
.as_ref()
.expect("analytic Fisher computed when no override")
.view(),
};
let residual = likelihood.grad_eta(eta.view(), y).mapv(|v| -v);
let mut hessian = dense_block_xtwx(design, fisher_blocks, None)?;
if hessian.nrows() != beta_flat_dim || hessian.ncols() != beta_flat_dim {
crate::bail_invalid_estim!(
"{context}: assembled Hessian shape {:?} ≠ ({beta_flat_dim}, {beta_flat_dim})",
hessian.dim()
);
}
for a in 0..m {
let la = lambdas[a];
if la == 0.0 {
continue;
}
let base = a * p;
for i in 0..p {
for j in 0..p {
hessian[[base + i, base + j]] += la * penalty[[i, j]];
}
}
}
let mut grad_flat = Array1::<f64>::zeros(beta_flat_dim);
for a in 0..m {
for i in 0..p {
let mut acc = 0.0_f64;
for row in 0..n_obs {
acc += design[[row, i]] * residual[[row, a]];
}
grad_flat[a * p + i] = acc;
}
}
for a in 0..m {
let la = lambdas[a];
if la == 0.0 {
continue;
}
let beta_col = beta.column(a);
for i in 0..p {
let mut s_beta_i = 0.0_f64;
for j in 0..p {
s_beta_i += penalty[[i, j]] * beta_col[j];
}
grad_flat[a * p + i] += la * s_beta_i;
}
}
let max_diag =
(0..beta_flat_dim).fold(0.0_f64, |acc, idx| acc.max(hessian[[idx, idx]].abs()));
let base_ridge = if max_diag.is_finite() && max_diag > 0.0 {
max_diag * 1.0e-10
} else {
1.0e-10
};
const MAX_RIDGE_ESCALATIONS: usize = 30;
let mut delta = Array1::<f64>::zeros(beta_flat_dim);
let mut ridge = base_ridge;
let mut solved = false;
for attempt in 0..=MAX_RIDGE_ESCALATIONS {
let mut ridged = hessian.clone();
if ridge > 0.0 {
for idx in 0..beta_flat_dim {
ridged[[idx, idx]] += ridge;
}
}
let factor = match factorize_symmetricwith_fallback(
FaerArrayView::new(&ridged).as_ref(),
Side::Lower,
) {
Ok(factor) => factor,
Err(err) => {
if attempt == MAX_RIDGE_ESCALATIONS {
return Err(EstimationError::InvalidInput(format!(
"{context}: Hessian factorization failed at iter {iter} \
even with ridge {ridge:.3e}: {err}"
)));
}
ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
continue;
}
};
let mut rhs = Array2::<f64>::zeros((beta_flat_dim, 1));
for i in 0..beta_flat_dim {
rhs[[i, 0]] = -grad_flat[i];
}
{
let rhs_view = array2_to_matmut(&mut rhs);
factor.solve_in_place(rhs_view);
}
if (0..beta_flat_dim).all(|i| rhs[[i, 0]].is_finite()) {
for i in 0..beta_flat_dim {
delta[i] = rhs[[i, 0]];
}
solved = true;
break;
}
ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
}
assert!(
solved,
"{context}: Newton step remained non-finite at iter {iter} after {} ridge \
escalations up to {ridge:.3e}; the penalized Hessian is pathologically \
rank-deficient (grad_norm={:.3e}, max_diag={max_diag:.3e})",
MAX_RIDGE_ESCALATIONS,
grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt(),
);
let proposed_beta = |alpha: f64| -> Array2<f64> {
let mut out = beta.clone();
for a in 0..m {
for i in 0..p {
out[[i, a]] += alpha * delta[a * p + i];
}
}
out
};
if iter == 0 {
last_objective = evaluate_objective(&beta);
if !last_objective.is_finite() {
crate::bail_invalid_estim!("{context}: non-finite objective at β = 0");
}
}
let mut alpha = 1.0_f64;
let mut accepted_beta = proposed_beta(alpha);
let mut new_objective = evaluate_objective(&accepted_beta);
let mut backtrack = 0usize;
while (!new_objective.is_finite() || new_objective > last_objective + 1.0e-12)
&& backtrack < 8
{
alpha *= 0.5;
accepted_beta = proposed_beta(alpha);
new_objective = evaluate_objective(&accepted_beta);
backtrack += 1;
}
let mut step_norm_sq = 0.0_f64;
let mut beta_norm_sq = 0.0_f64;
for a in 0..m {
for i in 0..p {
let d = accepted_beta[[i, a]] - beta[[i, a]];
step_norm_sq += d * d;
let v = accepted_beta[[i, a]];
beta_norm_sq += v * v;
}
}
beta = accepted_beta;
last_objective = new_objective;
let step_norm = step_norm_sq.sqrt();
let beta_norm = beta_norm_sq.sqrt();
let grad_norm = grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt();
let grad_optimal = grad_norm <= 1.0e-6 * (1.0 + max_diag);
if step_norm <= tol * (1.0 + beta_norm) && grad_optimal {
converged = true;
break;
}
}
recompute_eta(&beta, &mut eta);
let log_likelihood = likelihood.log_lik(eta.view(), y);
let penalty_term = weighted_penalty_sum(&beta, penalty, lambdas);
Ok(PenalizedVectorGlmOutputs {
coefficients: beta,
eta,
iterations,
converged,
log_likelihood,
penalty_term,
})
}
#[cfg(test)]
mod parity_tests {
use crate::families::binomial_multi::{BinomialMultiFitInputs, fit_penalized_binomial_multi};
use crate::families::multinomial::{MultinomialFitInputs, fit_penalized_multinomial};
use ndarray::{Array1, Array2};
fn sigmoid(eta: f64) -> f64 {
if eta >= 0.0 {
1.0 / (1.0 + (-eta).exp())
} else {
let e = eta.exp();
e / (1.0 + e)
}
}
fn softmax_ref(eta_active: &[f64]) -> Vec<f64> {
let m = eta_active.len();
let mut out = vec![0.0_f64; m + 1];
let mut max_eta = 0.0_f64;
for &v in eta_active {
if v > max_eta {
max_eta = v;
}
}
let baseline = (-max_eta).exp();
let mut denom = baseline;
for (idx, &v) in eta_active.iter().enumerate() {
let e = (v - max_eta).exp();
out[idx] = e;
denom += e;
}
for v in out.iter_mut().take(m) {
*v /= denom;
}
out[m] = baseline / denom;
out
}
fn binomial_objective(
design: &Array2<f64>,
y: &Array2<f64>,
penalty: &Array2<f64>,
lambdas: &Array1<f64>,
beta: &Array2<f64>,
) -> f64 {
let (n, p) = design.dim();
let k = y.ncols();
let mut ll = 0.0_f64;
for row in 0..n {
for a in 0..k {
let mut eta = 0.0_f64;
for i in 0..p {
eta += design[[row, i]] * beta[[i, a]];
}
let mu = sigmoid(eta).clamp(1.0e-12, 1.0 - 1.0e-12);
let yv = y[[row, a]];
ll += yv * mu.ln() + (1.0 - yv) * (1.0 - mu).ln();
}
}
let mut pen = 0.0_f64;
for a in 0..k {
let la = lambdas[a];
for i in 0..p {
let mut sbi = 0.0_f64;
for j in 0..p {
sbi += penalty[[i, j]] * beta[[j, a]];
}
pen += 0.5 * la * beta[[i, a]] * sbi;
}
}
-ll + pen
}
fn multinomial_objective(
design: &Array2<f64>,
y_one_hot: &Array2<f64>,
penalty: &Array2<f64>,
lambdas: &Array1<f64>,
beta: &Array2<f64>,
) -> f64 {
let (n, p) = design.dim();
let k = y_one_hot.ncols();
let m = k - 1;
let mut ll = 0.0_f64;
let mut eta_active = vec![0.0_f64; m];
for row in 0..n {
for a in 0..m {
let mut eta = 0.0_f64;
for i in 0..p {
eta += design[[row, i]] * beta[[i, a]];
}
eta_active[a] = eta;
}
let probs = softmax_ref(&eta_active);
for c in 0..k {
let yc = y_one_hot[[row, c]];
if yc != 0.0 {
ll += yc * probs[c].max(1.0e-300).ln();
}
}
}
let mut pen = 0.0_f64;
for a in 0..m {
let la = lambdas[a];
for i in 0..p {
let mut sbi = 0.0_f64;
for j in 0..p {
sbi += penalty[[i, j]] * beta[[j, a]];
}
pen += 0.5 * la * beta[[i, a]] * sbi;
}
}
-ll + pen
}
fn fd_grad<F: Fn(&Array2<f64>) -> f64>(beta: &Array2<f64>, f: F) -> f64 {
let (p, c) = beta.dim();
let h = 1.0e-6;
let mut max_abs = 0.0_f64;
for i in 0..p {
for a in 0..c {
let mut up = beta.clone();
let mut dn = beta.clone();
up[[i, a]] += h;
dn[[i, a]] -= h;
let g = (f(&up) - f(&dn)) / (2.0 * h);
max_abs = max_abs.max(g.abs());
}
}
max_abs
}
fn binomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
let n = 40;
let p = 3;
let k = 3;
let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
0 => 1.0,
1 => ((i + 1) as f64 * 0.37).sin(),
_ => ((i + 1) as f64 * 0.11).cos(),
});
let y = Array2::<f64>::from_shape_fn((n, k), |(i, a)| {
if ((i * 7 + a * 13 + 3) % 5) < 3 {
1.0
} else {
0.0
}
});
let penalty = Array2::<f64>::eye(p);
let lambdas = Array1::from(vec![0.3_f64, 1.2, 2.5]);
(design, y, penalty, lambdas)
}
fn multinomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
let n = 45;
let p = 3;
let k = 4;
let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
0 => 1.0,
1 => ((i + 2) as f64 * 0.29).sin(),
_ => ((i + 2) as f64 * 0.17).cos(),
});
let mut y = Array2::<f64>::zeros((n, k));
for i in 0..n {
y[[i, (i * 3 + 1) % k]] = 1.0;
}
let penalty = Array2::<f64>::eye(p);
let lambdas = Array1::from(vec![0.5_f64, 1.0, 2.0]);
(design, y, penalty, lambdas)
}
#[test]
fn binomial_engine_hits_optimum_and_is_self_consistent() {
let (design, y, penalty, lambdas) = binomial_fixture();
let fit = fit_penalized_binomial_multi(BinomialMultiFitInputs {
design: design.view(),
y: y.view(),
penalty: penalty.view(),
lambdas: lambdas.view(),
row_weights: None,
fisher_w_override: None,
max_iter: 100,
tol: 1.0e-12,
})
.expect("binomial fit must succeed");
assert!(fit.converged, "binomial fit must converge");
let g = fd_grad(&fit.coefficients, |b| {
binomial_objective(&design, &y, &penalty, &lambdas, b)
});
assert!(
g < 1.0e-6,
"binomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
);
let (n, p) = design.dim();
let k = y.ncols();
let mut log_lik = 0.0_f64;
for row in 0..n {
for a in 0..k {
let mut eta = 0.0_f64;
for i in 0..p {
eta += design[[row, i]] * fit.coefficients[[i, a]];
}
let mu = sigmoid(eta);
assert!(
(fit.fitted_probabilities[[row, a]] - mu).abs() < 1.0e-10,
"fitted probability must equal σ(X β̂)"
);
let muc = mu.clamp(1.0e-12, 1.0 - 1.0e-12);
let yv = y[[row, a]];
log_lik += yv * muc.ln() + (1.0 - yv) * (1.0 - muc).ln();
}
}
assert!(
(fit.deviance - (-2.0 * log_lik)).abs() < 1.0e-9,
"deviance must equal −2 log L"
);
}
#[test]
fn binomial_joint_solve_decouples_into_single_column_solves() {
let (design, y, penalty, lambdas) = binomial_fixture();
let joint = fit_penalized_binomial_multi(BinomialMultiFitInputs {
design: design.view(),
y: y.view(),
penalty: penalty.view(),
lambdas: lambdas.view(),
row_weights: None,
fisher_w_override: None,
max_iter: 100,
tol: 1.0e-12,
})
.expect("joint fit must succeed");
let k = y.ncols();
for a in 0..k {
let y_col = y.column(a).to_owned().insert_axis(ndarray::Axis(1));
let lam = Array1::from(vec![lambdas[a]]);
let single = fit_penalized_binomial_multi(BinomialMultiFitInputs {
design: design.view(),
y: y_col.view(),
penalty: penalty.view(),
lambdas: lam.view(),
row_weights: None,
fisher_w_override: None,
max_iter: 100,
tol: 1.0e-12,
})
.expect("single-column fit must succeed");
for i in 0..design.ncols() {
let dj = joint.coefficients[[i, a]];
let ds = single.coefficients[[i, 0]];
assert!(
(dj - ds).abs() < 1.0e-8,
"joint column {a} coef {i} ({dj}) must match single-column solve ({ds})"
);
}
}
}
#[test]
fn multinomial_engine_hits_optimum_and_is_self_consistent() {
let (design, y, penalty, lambdas) = multinomial_fixture();
let fit = fit_penalized_multinomial(MultinomialFitInputs {
design: design.view(),
y_one_hot: y.view(),
penalty: penalty.view(),
lambdas: lambdas.view(),
row_weights: None,
fisher_w_override: None,
max_iter: 100,
tol: 1.0e-12,
})
.expect("multinomial fit must succeed");
assert!(fit.converged, "multinomial fit must converge");
let g = fd_grad(&fit.coefficients_active, |b| {
multinomial_objective(&design, &y, &penalty, &lambdas, b)
});
assert!(
g < 1.0e-6,
"multinomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
);
let (n, p) = design.dim();
let k = y.ncols();
let m = k - 1;
let mut log_lik = 0.0_f64;
let mut eta_active = vec![0.0_f64; m];
for row in 0..n {
for a in 0..m {
let mut eta = 0.0_f64;
for i in 0..p {
eta += design[[row, i]] * fit.coefficients_active[[i, a]];
}
eta_active[a] = eta;
}
let probs = softmax_ref(&eta_active);
let mut row_sum = 0.0_f64;
for c in 0..k {
assert!(
(fit.fitted_probabilities[[row, c]] - probs[c]).abs() < 1.0e-10,
"fitted probability must equal softmax(X β̂)"
);
row_sum += fit.fitted_probabilities[[row, c]];
let yc = y[[row, c]];
if yc != 0.0 {
log_lik += yc * probs[c].max(1.0e-300).ln();
}
}
assert!(
(row_sum - 1.0).abs() < 1.0e-10,
"fitted probabilities must sum to 1 per row"
);
}
assert!(
(fit.deviance - (-2.0 * log_lik)).abs() < 1.0e-9,
"deviance must equal −2 log L"
);
}
#[test]
fn multinomial_rank_deficient_block_recovers_via_ridge_not_crash() {
let n = 50;
let p = 4;
let k = 4;
let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
0 => 1.0,
1 => ((i + 1) as f64 * 0.23).sin(),
2 => ((i + 1) as f64 * 0.23).sin(), _ => ((i + 1) as f64 * 0.19).cos(),
});
let mut y = Array2::<f64>::zeros((n, k));
for i in 0..n {
y[[i, (i * 5 + 2) % k]] = 1.0;
}
let mut penalty = Array2::<f64>::zeros((p, p));
penalty[[3, 3]] = 1.0;
let lambdas = Array1::from(vec![1.0e-10_f64, 1.0e-10, 1.0e-10]);
let fit = fit_penalized_multinomial(MultinomialFitInputs {
design: design.view(),
y_one_hot: y.view(),
penalty: penalty.view(),
lambdas: lambdas.view(),
row_weights: None,
fisher_w_override: None,
max_iter: 200,
tol: 1.0e-10,
})
.expect("rank-deficient multinomial fit must NOT crash (#557): the ridge path recovers it");
for &c in fit.coefficients_active.iter() {
assert!(c.is_finite(), "coefficient must be finite, got {c}");
}
for &pr in fit.fitted_probabilities.iter() {
assert!(
pr.is_finite() && (-1.0e-9..=1.0 + 1.0e-9).contains(&pr),
"fitted probability must be a finite simplex entry, got {pr}"
);
}
let (nn, kk) = fit.fitted_probabilities.dim();
for row in 0..nn {
let s: f64 = (0..kk).map(|c| fit.fitted_probabilities[[row, c]]).sum();
assert!(
(s - 1.0).abs() < 1.0e-9,
"row {row} probabilities must sum to 1, got {s}"
);
}
let g = fd_grad(&fit.coefficients_active, |b| {
multinomial_objective(&design, &y, &penalty, &lambdas, b)
});
assert!(
g < 1.0e-4,
"penalized objective gradient at the ridge-recovered β̂ must (near-)vanish \
along identified directions (max |∂F| = {g})"
);
}
}