gam_models/binomial_multi.rs
1//! Penalized multi-output binomial-logit fitter at fixed λ.
2//!
3//! This is the row-diagonal sibling of [`crate::multinomial`]: the
4//! same shared design `X ∈ ℝ^{N×P}` and shared penalty `S ∈ ℝ^{P×P}` are
5//! reused across `K` independent binomial-logit response columns. Per-column
6//! smoothing parameters `λ_a` (length `K`) scale `S` independently for each
7//! response. Because the Fisher information has no cross-column coupling
8//! (`H_{n,a,b} = δ_{ab} · w_n · μ_{n,a} (1 − μ_{n,a})`), the joint penalized
9//! Hessian is block-diagonal in the `K` `P × P` per-response systems; the
10//! shared [`crate::penalized_vector_glm`] engine factors that
11//! block-diagonal Hessian in a single coupled damped-Newton loop, which is
12//! mathematically identical to `K` independent per-column solves.
13//!
14//! # Fit problem
15//!
16//! Minimise the penalized negative log-likelihood
17//!
18//! ```text
19//! F(β) = − Σ_n Σ_a w_n [ y_{n,a} log μ_{n,a} + (1 − y_{n,a}) log(1 − μ_{n,a}) ]
20//! + ½ Σ_a λ_a · β_aᵀ S β_a
21//! ```
22//!
23//! with `μ_{n,a} = σ(η_{n,a})`, `η_{n,a} = (X β_a)_n`. The per-column Newton
24//! step solves
25//!
26//! ```text
27//! (Xᵀ diag(w_n μ_{n,a}(1 − μ_{n,a})) X + λ_a S) δ_a = − [Xᵀ diag(w_n)(μ_{·,a} − y_{·,a}) + λ_a S β_a]
28//! ```
29//!
30//! followed by a backtracking line search on `F` (full step first, halve up
31//! to 8 times) so monotone descent is enforced even when the quadratic
32//! model overshoots near saturation. This is precisely the shared
33//! [`crate::penalized_vector_glm`] scaffold; this module supplies
34//! only the row-diagonal binomial Fisher block, residual, and log-likelihood
35//! via [`BinomialMultiLikelihood`].
36//!
37//! # Relation to the multi-class softmax driver
38//!
39//! [`crate::multinomial::fit_penalized_multinomial`] handles the
40//! coupled softmax Fisher block `H_{n,a,b} = w_n μ_{n,a} (δ_{ab} − μ_{n,b})`
41//! and is the right entry when the user wants a single normalized
42//! probability vector per row. This driver is the right entry when the
43//! user has `K` independent binary marginals sharing a smooth basis (e.g.
44//! multi-label classification, multi-trait penalised logistic regression
45//! on a Duchon latent design). Both families are thin Fisher-block adapters
46//! over the same `penalized_vector_glm` engine: the only difference is that
47//! the softmax block is dense across outputs while these binomial columns are
48//! row-diagonal.
49//!
50//! The function-boundary contract mirrors `fit_penalized_multinomial` so
51//! the two are interchangeable at the FFI layer: same input arity, same
52//! convergence semantics, same `(N, K)` fitted-probability output.
53
54use crate::penalized_vector_glm::{PenalizedVectorGlmInputs, fit_penalized_vector_glm};
55use crate::vector_response::VectorLikelihood;
56use crate::model_types::EstimationError;
57use ndarray::{Array1, Array2, Array3, ArrayView1, ArrayView2, ArrayView3};
58
59/// Inputs for [`fit_penalized_binomial_multi`].
60#[derive(Debug, Clone)]
61pub struct BinomialMultiFitInputs<'a> {
62 /// Design matrix `X ∈ ℝ^{N×P}` (one row per observation, shared across
63 /// all response columns).
64 pub design: ArrayView2<'a, f64>,
65 /// Multi-column binomial response `Y ∈ ℝ^{N×K}`. Each column is treated
66 /// as an independent binomial-logit response, so every entry must be a
67 /// binomial proportion in `[0, 1]` (hard `{0, 1}` Bernoulli labels and soft
68 /// proportions / probabilities alike). Entries outside `[0, 1]` are
69 /// rejected because the per-entry log-likelihood is then unbounded in `η`.
70 pub y: ArrayView2<'a, f64>,
71 /// Shared smoothing penalty `S ∈ ℝ^{P×P}` (symmetric, PSD).
72 pub penalty: ArrayView2<'a, f64>,
73 /// Per-response smoothing parameter `λ_a` (length `K`).
74 pub lambdas: ArrayView1<'a, f64>,
75 /// Optional per-row weights (length `N`); `None` ⇒ uniform 1.0.
76 pub row_weights: Option<ArrayView1<'a, f64>>,
77 /// Optional per-row Fisher-block override, shape `(N, K, K)`. The `K`
78 /// binomial-logit columns are fit independently, so only the per-column
79 /// diagonal `[n, a, a]` is consumed as the curvature `w_n μ_a(1 − μ_a)`;
80 /// off-diagonals must be zero (enforced at the FFI boundary) since a
81 /// non-zero cross term cannot be represented by the separable per-column
82 /// solve. The gradient/residual path stays analytic — this is a
83 /// curvature-only override (issue #349). Diagonal entries must be finite
84 /// and non-negative.
85 pub fisher_w_override: Option<ArrayView3<'a, f64>>,
86 /// Maximum Newton iterations per response column; recommend 50.
87 pub max_iter: usize,
88 /// Relative-step convergence tolerance; recommend 1e-7.
89 pub tol: f64,
90}
91
92/// Outputs of [`fit_penalized_binomial_multi`].
93#[derive(Debug, Clone)]
94pub struct BinomialMultiFitOutputs {
95 /// Coefficient matrix, shape `(P, K)` (column `a` is `β_a`).
96 pub coefficients: Array2<f64>,
97 /// Fitted probabilities `μ_{n,a} = σ((X β_a)_n)`, shape `(N, K)`.
98 pub fitted_probabilities: Array2<f64>,
99 /// Number of joint Newton iterations executed (including the final step
100 /// that satisfied the tolerance). The `K` columns share the design and
101 /// are fitted by a single coupled damped-Newton loop over the
102 /// block-diagonal penalized Hessian, so there is one iteration count for
103 /// the whole solve.
104 pub iterations: usize,
105 /// `true` if the relative-step test was satisfied before `max_iter`.
106 pub converged: bool,
107 /// Penalized negative log-likelihood at the returned `β̂`:
108 /// `−log L(β̂) + ½ Σ_a λ_a · β̂_aᵀ S β̂_a`.
109 pub penalized_neg_log_likelihood: f64,
110 /// Unpenalized deviance `−2 log L(β̂)` for diagnostic reporting.
111 pub deviance: f64,
112}
113
114/// Numerically stable logistic CDF used by the Newton driver. Mirrors the
115/// inline helper that previously lived in `crates/gam-pyffi/src/lib.rs`.
116#[inline]
117fn sigmoid_stable(eta: f64) -> f64 {
118 if eta >= 0.0 {
119 let e = (-eta).exp();
120 1.0 / (1.0 + e)
121 } else {
122 let e = eta.exp();
123 e / (1.0 + e)
124 }
125}
126
127/// Row-diagonal multi-output binomial-logit likelihood adapter for the shared
128/// [`crate::penalized_vector_glm`] engine.
129///
130/// The `K` response columns are mutually independent binomial-logit marginals
131/// sharing the design `X`, so the per-row Fisher block is **diagonal across
132/// outputs**: `H_{n,a,b} = δ_{ab} · w_n · μ_{n,a} (1 − μ_{n,a})`. The engine
133/// works in `η = X β` space with `μ_{n,a} = σ(η_{n,a})`; this adapter supplies
134/// the log-likelihood, the residual gradient `w_n (y_a − μ_a)`, and that
135/// row-diagonal block.
136struct BinomialMultiLikelihood {
137 /// Optional per-row weights (length N), or `None` for uniform 1.0.
138 row_weights: Option<Array1<f64>>,
139}
140
141impl BinomialMultiLikelihood {
142 #[inline]
143 fn row_weight(&self, n: usize) -> f64 {
144 self.row_weights.as_ref().map_or(1.0, |w| w[n])
145 }
146}
147
148impl VectorLikelihood for BinomialMultiLikelihood {
149 /// `Σ_n Σ_a w_n [ y_{n,a} log μ_{n,a} + (1 − y_{n,a}) log(1 − μ_{n,a}) ]`,
150 /// clamping `μ` to `[1e-12, 1 − 1e-12]` so the closed-form expression stays
151 /// finite when β drives a row deeply into saturation during a tentative
152 /// Newton step (the surrounding line search rejects such steps).
153 fn log_lik(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> f64 {
154 let (n, k) = eta.dim();
155 let mut acc = 0.0_f64;
156 for row in 0..n {
157 let w = self.row_weight(row);
158 for a in 0..k {
159 let mu = sigmoid_stable(eta[[row, a]]).clamp(1.0e-12, 1.0 - 1.0e-12);
160 let yv = y[[row, a]];
161 acc += w * (yv * mu.ln() + (1.0 - yv) * (1.0 - mu).ln());
162 }
163 }
164 acc
165 }
166
167 /// `∂ log L / ∂η_{n,a} = w_n (y_{n,a} − μ_{n,a})`.
168 fn grad_eta(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> Array2<f64> {
169 let (n, k) = eta.dim();
170 let mut out = Array2::<f64>::zeros((n, k));
171 for row in 0..n {
172 let w = self.row_weight(row);
173 for a in 0..k {
174 let mu = sigmoid_stable(eta[[row, a]]);
175 out[[row, a]] = w * (y[[row, a]] - mu);
176 }
177 }
178 out
179 }
180
181 /// Per-output diagonal curvature `w_n μ_{n,a} (1 − μ_{n,a})`. The Fisher
182 /// information of independent Bernoulli outputs is `y`-independent; `y` is
183 /// read only to assert the target shape matches `eta`, as in the sibling
184 /// [`VectorLikelihood`] implementations.
185 fn hess_diag(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> Array2<f64> {
186 assert_eq!(eta.dim(), y.dim(), "y must match eta shape (N, K)");
187 let (n, k) = eta.dim();
188 let mut out = Array2::<f64>::zeros((n, k));
189 for row in 0..n {
190 let w = self.row_weight(row);
191 for a in 0..k {
192 let mu = sigmoid_stable(eta[[row, a]]);
193 out[[row, a]] = w * mu * (1.0 - mu);
194 }
195 }
196 out
197 }
198
199 /// Row-diagonal Fisher block `H_{n,a,b} = δ_{ab} · w_n μ_{n,a}(1 − μ_{n,a})`.
200 /// The independent columns have no cross-output coupling, so the off-diagonal
201 /// entries are identically zero; lifting [`Self::hess_diag`] onto the per-row
202 /// diagonal (the [`VectorLikelihood`] default) is exact here.
203 fn hess_block(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> Array3<f64> {
204 let diag = self.hess_diag(eta, y);
205 let (n, k) = diag.dim();
206 let mut out = Array3::<f64>::zeros((n, k, k));
207 for row in 0..n {
208 for a in 0..k {
209 out[[row, a, a]] = diag[[row, a]];
210 }
211 }
212 out
213 }
214}
215
216/// Fit `K` independent penalized binomial-logit GLMs sharing the design `X`
217/// and penalty `S`. See the module docs for the optimization problem.
218pub fn fit_penalized_binomial_multi(
219 inputs: BinomialMultiFitInputs<'_>,
220) -> Result<BinomialMultiFitOutputs, EstimationError> {
221 let BinomialMultiFitInputs {
222 design,
223 y,
224 penalty,
225 lambdas,
226 row_weights,
227 fisher_w_override,
228 max_iter,
229 tol,
230 } = inputs;
231
232 // ──────────────────────── family-specific validation ───────────────────
233 // The engine re-validates the shared geometry (nonempty design, penalty
234 // shape, λ finiteness/non-negativity, override `(N, M, M)` shape, finite
235 // design), but the binomial family owns three preconditions the generic
236 // scaffold cannot know: the response must be a `[0, 1]` proportion, the
237 // optional row weights must be finite and non-negative, and the optional
238 // curvature override must be **row-diagonal** (independent columns carry no
239 // cross-output coupling, so a non-zero off-diagonal cannot be represented).
240 let n_obs = design.nrows();
241 let (y_rows, k) = y.dim();
242 if y_rows != n_obs {
243 crate::bail_invalid_estim!(
244 "fit_penalized_binomial_multi: y rows {y_rows} ≠ design rows {n_obs}"
245 );
246 }
247 if k == 0 {
248 crate::bail_invalid_estim!(
249 "fit_penalized_binomial_multi: y must have at least one column (got K=0)"
250 );
251 }
252 if lambdas.len() != k {
253 crate::bail_invalid_estim!(
254 "fit_penalized_binomial_multi: lambdas length {} ≠ K = {k}",
255 lambdas.len()
256 );
257 }
258 if let Some(fw) = fisher_w_override.as_ref() {
259 if fw.dim() != (n_obs, k, k) {
260 crate::bail_invalid_estim!(
261 "fit_penalized_binomial_multi: fisher_w_override shape {:?} ≠ (N, K, K) = ({n_obs}, {k}, {k})",
262 fw.dim()
263 );
264 }
265 // Independent binomial columns have a strictly row-diagonal Fisher
266 // block; a non-zero cross term `[n, a, b]` (a ≠ b) cannot be the
267 // curvature of a separable per-column objective, so reject it rather
268 // than silently couple the columns through the shared dense solve.
269 for ((n_idx, a, b), &v) in fw.indexed_iter() {
270 if a != b && v != 0.0 {
271 crate::bail_invalid_estim!(
272 "fit_penalized_binomial_multi: fisher_w_override[{n_idx},{a},{b}] must be zero \
273 (independent columns have a row-diagonal Fisher block); got {v}"
274 );
275 }
276 }
277 }
278 if let Some(w) = row_weights.as_ref() {
279 if w.len() != n_obs {
280 crate::bail_invalid_estim!(
281 "fit_penalized_binomial_multi: row_weights length {} ≠ N = {n_obs}",
282 w.len()
283 );
284 }
285 for (i, &v) in w.iter().enumerate() {
286 if !(v.is_finite() && v >= 0.0) {
287 crate::bail_invalid_estim!(
288 "fit_penalized_binomial_multi: row_weights[{i}] must be finite and ≥ 0 (got {v})"
289 );
290 }
291 }
292 }
293 for ((i, j), &v) in y.indexed_iter() {
294 // The per-entry objective y log μ + (1 − y) log(1 − μ) is the binomial
295 // (Bernoulli / proportion) log-likelihood only when 0 ≤ y ≤ 1. Outside
296 // that range it is unbounded above in η (e.g. y = 2 gives
297 // 2η − log(1 + e^η) → ∞), so a finite-but-invalid entry would make the
298 // stated likelihood not a binomial likelihood at all. Reject it here.
299 if !(v.is_finite() && (0.0..=1.0).contains(&v)) {
300 crate::bail_invalid_estim!(
301 "fit_penalized_binomial_multi: y[{i},{j}] must be a binomial proportion in [0,1] (got {v})"
302 );
303 }
304 }
305
306 // ─────────────────── shared penalized vector-GLM solve ─────────────────
307 let likelihood = BinomialMultiLikelihood {
308 row_weights: row_weights.map(|w| w.to_owned()),
309 };
310 let fit = fit_penalized_vector_glm(
311 PenalizedVectorGlmInputs {
312 design,
313 y,
314 penalty,
315 lambdas,
316 fisher_w_override,
317 max_iter,
318 tol,
319 // Independent-binomial columns ARE genuinely independent outputs, so
320 // the per-output Diagonal penalty is correct here (the #1587 Centered
321 // metric is softmax-specific — there is no shared reference class).
322 class_penalty_metric: crate::penalized_vector_glm::ClassPenaltyMetric::Diagonal,
323 },
324 &likelihood,
325 "fit_penalized_binomial_multi",
326 )?;
327
328 // η → μ = σ(η) is the binomial inverse link applied column-wise.
329 let fitted = fit.eta.mapv(sigmoid_stable);
330
331 Ok(BinomialMultiFitOutputs {
332 coefficients: fit.coefficients,
333 fitted_probabilities: fitted,
334 iterations: fit.iterations,
335 converged: fit.converged,
336 penalized_neg_log_likelihood: -fit.log_likelihood + fit.penalty_term,
337 deviance: -2.0 * fit.log_likelihood,
338 })
339}
340
341#[cfg(test)]
342mod tests {
343 use super::*;
344 use ndarray::Array3;
345
346 fn toy_inputs() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
347 let n = 12;
348 let p = 2;
349 let k = 2;
350 let design =
351 Array2::<f64>::from_shape_fn(
352 (n, p),
353 |(i, j)| {
354 if j == 0 { 1.0 } else { ((i + 1) as f64).sin() }
355 },
356 );
357 let y =
358 Array2::<f64>::from_shape_fn((n, k), |(i, a)| if (i + a) % 2 == 0 { 1.0 } else { 0.0 });
359 let penalty = Array2::<f64>::eye(p);
360 let lambdas = Array1::<f64>::from_elem(k, 0.5);
361 (design, y, penalty, lambdas)
362 }
363
364 #[test]
365 fn fisher_override_none_reproduces_analytic_bit_for_bit() {
366 // Issue #349: a None override must give exactly the analytic result.
367 let (design, y, penalty, lambdas) = toy_inputs();
368 let base = fit_penalized_binomial_multi(BinomialMultiFitInputs {
369 design: design.view(),
370 y: y.view(),
371 penalty: penalty.view(),
372 lambdas: lambdas.view(),
373 row_weights: None,
374 fisher_w_override: None,
375 max_iter: 50,
376 tol: 1.0e-9,
377 })
378 .expect("analytic fit must succeed");
379 // Explicit None again — identical result.
380 let again = fit_penalized_binomial_multi(BinomialMultiFitInputs {
381 design: design.view(),
382 y: y.view(),
383 penalty: penalty.view(),
384 lambdas: lambdas.view(),
385 row_weights: None,
386 fisher_w_override: None,
387 max_iter: 50,
388 tol: 1.0e-9,
389 })
390 .expect("analytic fit must succeed");
391 for (a, b) in base.coefficients.iter().zip(again.coefficients.iter()) {
392 assert_eq!(a, b, "None override must be deterministic");
393 }
394 }
395
396 #[test]
397 fn out_of_range_response_is_rejected() {
398 // Issue #452: a finite but invalid entry (y = 2) makes the per-entry
399 // binomial log-likelihood unbounded in η, so it must be rejected rather
400 // than silently fit. The same guard covers negative entries.
401 let (design, y, penalty, lambdas) = toy_inputs();
402 let mut bad = y.clone();
403 bad[[0, 0]] = 2.0;
404 let err = fit_penalized_binomial_multi(BinomialMultiFitInputs {
405 design: design.view(),
406 y: bad.view(),
407 penalty: penalty.view(),
408 lambdas: lambdas.view(),
409 row_weights: None,
410 fisher_w_override: None,
411 max_iter: 50,
412 tol: 1.0e-9,
413 })
414 .expect_err("out-of-range response must error");
415 assert!(format!("{err}").contains("binomial proportion in [0,1]"));
416
417 let mut neg = y.clone();
418 neg[[1, 1]] = -0.5;
419 let err = fit_penalized_binomial_multi(BinomialMultiFitInputs {
420 design: design.view(),
421 y: neg.view(),
422 penalty: penalty.view(),
423 lambdas: lambdas.view(),
424 row_weights: None,
425 fisher_w_override: None,
426 max_iter: 50,
427 tol: 1.0e-9,
428 })
429 .expect_err("negative response must error");
430 assert!(format!("{err}").contains("binomial proportion in [0,1]"));
431 }
432
433 #[test]
434 fn fisher_override_shape_mismatch_is_rejected() {
435 let (design, y, penalty, lambdas) = toy_inputs();
436 let n = design.nrows();
437 let k = y.ncols();
438 let bad = Array3::<f64>::zeros((n, k + 1, k + 1));
439 let err = fit_penalized_binomial_multi(BinomialMultiFitInputs {
440 design: design.view(),
441 y: y.view(),
442 penalty: penalty.view(),
443 lambdas: lambdas.view(),
444 row_weights: None,
445 fisher_w_override: Some(bad.view()),
446 max_iter: 50,
447 tol: 1.0e-9,
448 })
449 .expect_err("mismatched override shape must error");
450 assert!(format!("{err}").contains("fisher_w_override shape"));
451 }
452
453 #[test]
454 fn fisher_override_replaces_curvature_diagonal() {
455 // A scaled curvature override changes the Newton step from β = 0:
456 // with curvature scaled by α the first step is 1/α of the analytic
457 // step (gradient unchanged), so the fitted β must differ from analytic.
458 let (design, y, penalty, lambdas) = toy_inputs();
459 let n = design.nrows();
460 let k = y.ncols();
461 // Analytic diagonal at β = 0 is μ(1−μ) = 0.25 for every column.
462 let mut over = Array3::<f64>::zeros((n, k, k));
463 for row in 0..n {
464 for a in 0..k {
465 over[[row, a, a]] = 0.25 * 4.0; // 4× the analytic curvature
466 }
467 }
468 let scaled = fit_penalized_binomial_multi(BinomialMultiFitInputs {
469 design: design.view(),
470 y: y.view(),
471 penalty: penalty.view(),
472 lambdas: lambdas.view(),
473 row_weights: None,
474 fisher_w_override: Some(over.view()),
475 max_iter: 1,
476 tol: 1.0e-9,
477 })
478 .expect("override fit must succeed");
479 let analytic = fit_penalized_binomial_multi(BinomialMultiFitInputs {
480 design: design.view(),
481 y: y.view(),
482 penalty: penalty.view(),
483 lambdas: lambdas.view(),
484 row_weights: None,
485 fisher_w_override: None,
486 max_iter: 1,
487 tol: 1.0e-9,
488 })
489 .expect("analytic fit must succeed");
490 let differs = scaled
491 .coefficients
492 .iter()
493 .zip(analytic.coefficients.iter())
494 .any(|(a, b)| (a - b).abs() > 1.0e-6);
495 assert!(differs, "scaled curvature override must change the step");
496 }
497}