Skip to main content

gam_models/
binomial_multi.rs

1//! Penalized multi-output binomial-logit fitter at fixed λ.
2//!
3//! This is the row-diagonal sibling of [`crate::multinomial`]: the
4//! same shared design `X ∈ ℝ^{N×P}` and shared penalty `S ∈ ℝ^{P×P}` are
5//! reused across `K` independent binomial-logit response columns. Per-column
6//! smoothing parameters `λ_a` (length `K`) scale `S` independently for each
7//! response. Because the Fisher information has no cross-column coupling
8//! (`H_{n,a,b} = δ_{ab} · w_n · μ_{n,a} (1 − μ_{n,a})`), the joint penalized
9//! Hessian is block-diagonal in the `K` `P × P` per-response systems; the
10//! shared [`crate::penalized_vector_glm`] engine factors that
11//! block-diagonal Hessian in a single coupled damped-Newton loop, which is
12//! mathematically identical to `K` independent per-column solves.
13//!
14//! # Fit problem
15//!
16//! Minimise the penalized negative log-likelihood
17//!
18//! ```text
19//!   F(β) = − Σ_n Σ_a w_n [ y_{n,a} log μ_{n,a} + (1 − y_{n,a}) log(1 − μ_{n,a}) ]
20//!           + ½ Σ_a λ_a · β_aᵀ S β_a
21//! ```
22//!
23//! with `μ_{n,a} = σ(η_{n,a})`, `η_{n,a} = (X β_a)_n`. The per-column Newton
24//! step solves
25//!
26//! ```text
27//!   (Xᵀ diag(w_n μ_{n,a}(1 − μ_{n,a})) X + λ_a S) δ_a = − [Xᵀ diag(w_n)(μ_{·,a} − y_{·,a}) + λ_a S β_a]
28//! ```
29//!
30//! followed by a backtracking line search on `F` (full step first, halve up
31//! to 8 times) so monotone descent is enforced even when the quadratic
32//! model overshoots near saturation. This is precisely the shared
33//! [`crate::penalized_vector_glm`] scaffold; this module supplies
34//! only the row-diagonal binomial Fisher block, residual, and log-likelihood
35//! via [`BinomialMultiLikelihood`].
36//!
37//! # Relation to the multi-class softmax driver
38//!
39//! [`crate::multinomial::fit_penalized_multinomial`] handles the
40//! coupled softmax Fisher block `H_{n,a,b} = w_n μ_{n,a} (δ_{ab} − μ_{n,b})`
41//! and is the right entry when the user wants a single normalized
42//! probability vector per row. This driver is the right entry when the
43//! user has `K` independent binary marginals sharing a smooth basis (e.g.
44//! multi-label classification, multi-trait penalised logistic regression
45//! on a Duchon latent design). Both families are thin Fisher-block adapters
46//! over the same `penalized_vector_glm` engine: the only difference is that
47//! the softmax block is dense across outputs while these binomial columns are
48//! row-diagonal.
49//!
50//! The function-boundary contract mirrors `fit_penalized_multinomial` so
51//! the two are interchangeable at the FFI layer: same input arity, same
52//! convergence semantics, same `(N, K)` fitted-probability output.
53
54use crate::penalized_vector_glm::{PenalizedVectorGlmInputs, fit_penalized_vector_glm};
55use crate::vector_response::VectorLikelihood;
56use crate::model_types::EstimationError;
57use ndarray::{Array1, Array2, Array3, ArrayView1, ArrayView2, ArrayView3};
58
59/// Inputs for [`fit_penalized_binomial_multi`].
60#[derive(Debug, Clone)]
61pub struct BinomialMultiFitInputs<'a> {
62    /// Design matrix `X ∈ ℝ^{N×P}` (one row per observation, shared across
63    /// all response columns).
64    pub design: ArrayView2<'a, f64>,
65    /// Multi-column binomial response `Y ∈ ℝ^{N×K}`. Each column is treated
66    /// as an independent binomial-logit response, so every entry must be a
67    /// binomial proportion in `[0, 1]` (hard `{0, 1}` Bernoulli labels and soft
68    /// proportions / probabilities alike). Entries outside `[0, 1]` are
69    /// rejected because the per-entry log-likelihood is then unbounded in `η`.
70    pub y: ArrayView2<'a, f64>,
71    /// Shared smoothing penalty `S ∈ ℝ^{P×P}` (symmetric, PSD).
72    pub penalty: ArrayView2<'a, f64>,
73    /// Per-response smoothing parameter `λ_a` (length `K`).
74    pub lambdas: ArrayView1<'a, f64>,
75    /// Optional per-row weights (length `N`); `None` ⇒ uniform 1.0.
76    pub row_weights: Option<ArrayView1<'a, f64>>,
77    /// Optional per-row Fisher-block override, shape `(N, K, K)`. The `K`
78    /// binomial-logit columns are fit independently, so only the per-column
79    /// diagonal `[n, a, a]` is consumed as the curvature `w_n μ_a(1 − μ_a)`;
80    /// off-diagonals must be zero (enforced at the FFI boundary) since a
81    /// non-zero cross term cannot be represented by the separable per-column
82    /// solve. The gradient/residual path stays analytic — this is a
83    /// curvature-only override (issue #349). Diagonal entries must be finite
84    /// and non-negative.
85    pub fisher_w_override: Option<ArrayView3<'a, f64>>,
86    /// Maximum Newton iterations per response column; recommend 50.
87    pub max_iter: usize,
88    /// Relative-step convergence tolerance; recommend 1e-7.
89    pub tol: f64,
90}
91
92/// Outputs of [`fit_penalized_binomial_multi`].
93#[derive(Debug, Clone)]
94pub struct BinomialMultiFitOutputs {
95    /// Coefficient matrix, shape `(P, K)` (column `a` is `β_a`).
96    pub coefficients: Array2<f64>,
97    /// Fitted probabilities `μ_{n,a} = σ((X β_a)_n)`, shape `(N, K)`.
98    pub fitted_probabilities: Array2<f64>,
99    /// Number of joint Newton iterations executed (including the final step
100    /// that satisfied the tolerance). The `K` columns share the design and
101    /// are fitted by a single coupled damped-Newton loop over the
102    /// block-diagonal penalized Hessian, so there is one iteration count for
103    /// the whole solve.
104    pub iterations: usize,
105    /// `true` if the relative-step test was satisfied before `max_iter`.
106    pub converged: bool,
107    /// Penalized negative log-likelihood at the returned `β̂`:
108    /// `−log L(β̂) + ½ Σ_a λ_a · β̂_aᵀ S β̂_a`.
109    pub penalized_neg_log_likelihood: f64,
110    /// Unpenalized deviance `−2 log L(β̂)` for diagnostic reporting.
111    pub deviance: f64,
112}
113
114/// Numerically stable logistic CDF used by the Newton driver. Mirrors the
115/// inline helper that previously lived in `crates/gam-pyffi/src/lib.rs`.
116#[inline]
117fn sigmoid_stable(eta: f64) -> f64 {
118    if eta >= 0.0 {
119        let e = (-eta).exp();
120        1.0 / (1.0 + e)
121    } else {
122        let e = eta.exp();
123        e / (1.0 + e)
124    }
125}
126
127/// Row-diagonal multi-output binomial-logit likelihood adapter for the shared
128/// [`crate::penalized_vector_glm`] engine.
129///
130/// The `K` response columns are mutually independent binomial-logit marginals
131/// sharing the design `X`, so the per-row Fisher block is **diagonal across
132/// outputs**: `H_{n,a,b} = δ_{ab} · w_n · μ_{n,a} (1 − μ_{n,a})`. The engine
133/// works in `η = X β` space with `μ_{n,a} = σ(η_{n,a})`; this adapter supplies
134/// the log-likelihood, the residual gradient `w_n (y_a − μ_a)`, and that
135/// row-diagonal block.
136struct BinomialMultiLikelihood {
137    /// Optional per-row weights (length N), or `None` for uniform 1.0.
138    row_weights: Option<Array1<f64>>,
139}
140
141impl BinomialMultiLikelihood {
142    #[inline]
143    fn row_weight(&self, n: usize) -> f64 {
144        self.row_weights.as_ref().map_or(1.0, |w| w[n])
145    }
146}
147
148impl VectorLikelihood for BinomialMultiLikelihood {
149    /// `Σ_n Σ_a w_n [ y_{n,a} log μ_{n,a} + (1 − y_{n,a}) log(1 − μ_{n,a}) ]`,
150    /// clamping `μ` to `[1e-12, 1 − 1e-12]` so the closed-form expression stays
151    /// finite when β drives a row deeply into saturation during a tentative
152    /// Newton step (the surrounding line search rejects such steps).
153    fn log_lik(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> f64 {
154        let (n, k) = eta.dim();
155        let mut acc = 0.0_f64;
156        for row in 0..n {
157            let w = self.row_weight(row);
158            for a in 0..k {
159                let mu = sigmoid_stable(eta[[row, a]]).clamp(1.0e-12, 1.0 - 1.0e-12);
160                let yv = y[[row, a]];
161                acc += w * (yv * mu.ln() + (1.0 - yv) * (1.0 - mu).ln());
162            }
163        }
164        acc
165    }
166
167    /// `∂ log L / ∂η_{n,a} = w_n (y_{n,a} − μ_{n,a})`.
168    fn grad_eta(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> Array2<f64> {
169        let (n, k) = eta.dim();
170        let mut out = Array2::<f64>::zeros((n, k));
171        for row in 0..n {
172            let w = self.row_weight(row);
173            for a in 0..k {
174                let mu = sigmoid_stable(eta[[row, a]]);
175                out[[row, a]] = w * (y[[row, a]] - mu);
176            }
177        }
178        out
179    }
180
181    /// Per-output diagonal curvature `w_n μ_{n,a} (1 − μ_{n,a})`. The Fisher
182    /// information of independent Bernoulli outputs is `y`-independent; `y` is
183    /// read only to assert the target shape matches `eta`, as in the sibling
184    /// [`VectorLikelihood`] implementations.
185    fn hess_diag(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> Array2<f64> {
186        assert_eq!(eta.dim(), y.dim(), "y must match eta shape (N, K)");
187        let (n, k) = eta.dim();
188        let mut out = Array2::<f64>::zeros((n, k));
189        for row in 0..n {
190            let w = self.row_weight(row);
191            for a in 0..k {
192                let mu = sigmoid_stable(eta[[row, a]]);
193                out[[row, a]] = w * mu * (1.0 - mu);
194            }
195        }
196        out
197    }
198
199    /// Row-diagonal Fisher block `H_{n,a,b} = δ_{ab} · w_n μ_{n,a}(1 − μ_{n,a})`.
200    /// The independent columns have no cross-output coupling, so the off-diagonal
201    /// entries are identically zero; lifting [`Self::hess_diag`] onto the per-row
202    /// diagonal (the [`VectorLikelihood`] default) is exact here.
203    fn hess_block(&self, eta: ArrayView2<'_, f64>, y: ArrayView2<'_, f64>) -> Array3<f64> {
204        let diag = self.hess_diag(eta, y);
205        let (n, k) = diag.dim();
206        let mut out = Array3::<f64>::zeros((n, k, k));
207        for row in 0..n {
208            for a in 0..k {
209                out[[row, a, a]] = diag[[row, a]];
210            }
211        }
212        out
213    }
214}
215
216/// Fit `K` independent penalized binomial-logit GLMs sharing the design `X`
217/// and penalty `S`. See the module docs for the optimization problem.
218pub fn fit_penalized_binomial_multi(
219    inputs: BinomialMultiFitInputs<'_>,
220) -> Result<BinomialMultiFitOutputs, EstimationError> {
221    let BinomialMultiFitInputs {
222        design,
223        y,
224        penalty,
225        lambdas,
226        row_weights,
227        fisher_w_override,
228        max_iter,
229        tol,
230    } = inputs;
231
232    // ──────────────────────── family-specific validation ───────────────────
233    // The engine re-validates the shared geometry (nonempty design, penalty
234    // shape, λ finiteness/non-negativity, override `(N, M, M)` shape, finite
235    // design), but the binomial family owns three preconditions the generic
236    // scaffold cannot know: the response must be a `[0, 1]` proportion, the
237    // optional row weights must be finite and non-negative, and the optional
238    // curvature override must be **row-diagonal** (independent columns carry no
239    // cross-output coupling, so a non-zero off-diagonal cannot be represented).
240    let n_obs = design.nrows();
241    let (y_rows, k) = y.dim();
242    if y_rows != n_obs {
243        crate::bail_invalid_estim!(
244            "fit_penalized_binomial_multi: y rows {y_rows} ≠ design rows {n_obs}"
245        );
246    }
247    if k == 0 {
248        crate::bail_invalid_estim!(
249            "fit_penalized_binomial_multi: y must have at least one column (got K=0)"
250        );
251    }
252    if lambdas.len() != k {
253        crate::bail_invalid_estim!(
254            "fit_penalized_binomial_multi: lambdas length {} ≠ K = {k}",
255            lambdas.len()
256        );
257    }
258    if let Some(fw) = fisher_w_override.as_ref() {
259        if fw.dim() != (n_obs, k, k) {
260            crate::bail_invalid_estim!(
261                "fit_penalized_binomial_multi: fisher_w_override shape {:?} ≠ (N, K, K) = ({n_obs}, {k}, {k})",
262                fw.dim()
263            );
264        }
265        // Independent binomial columns have a strictly row-diagonal Fisher
266        // block; a non-zero cross term `[n, a, b]` (a ≠ b) cannot be the
267        // curvature of a separable per-column objective, so reject it rather
268        // than silently couple the columns through the shared dense solve.
269        for ((n_idx, a, b), &v) in fw.indexed_iter() {
270            if a != b && v != 0.0 {
271                crate::bail_invalid_estim!(
272                    "fit_penalized_binomial_multi: fisher_w_override[{n_idx},{a},{b}] must be zero \
273                     (independent columns have a row-diagonal Fisher block); got {v}"
274                );
275            }
276        }
277    }
278    if let Some(w) = row_weights.as_ref() {
279        if w.len() != n_obs {
280            crate::bail_invalid_estim!(
281                "fit_penalized_binomial_multi: row_weights length {} ≠ N = {n_obs}",
282                w.len()
283            );
284        }
285        for (i, &v) in w.iter().enumerate() {
286            if !(v.is_finite() && v >= 0.0) {
287                crate::bail_invalid_estim!(
288                    "fit_penalized_binomial_multi: row_weights[{i}] must be finite and ≥ 0 (got {v})"
289                );
290            }
291        }
292    }
293    for ((i, j), &v) in y.indexed_iter() {
294        // The per-entry objective y log μ + (1 − y) log(1 − μ) is the binomial
295        // (Bernoulli / proportion) log-likelihood only when 0 ≤ y ≤ 1. Outside
296        // that range it is unbounded above in η (e.g. y = 2 gives
297        // 2η − log(1 + e^η) → ∞), so a finite-but-invalid entry would make the
298        // stated likelihood not a binomial likelihood at all. Reject it here.
299        if !(v.is_finite() && (0.0..=1.0).contains(&v)) {
300            crate::bail_invalid_estim!(
301                "fit_penalized_binomial_multi: y[{i},{j}] must be a binomial proportion in [0,1] (got {v})"
302            );
303        }
304    }
305
306    // ─────────────────── shared penalized vector-GLM solve ─────────────────
307    let likelihood = BinomialMultiLikelihood {
308        row_weights: row_weights.map(|w| w.to_owned()),
309    };
310    let fit = fit_penalized_vector_glm(
311        PenalizedVectorGlmInputs {
312            design,
313            y,
314            penalty,
315            lambdas,
316            fisher_w_override,
317            max_iter,
318            tol,
319            // Independent-binomial columns ARE genuinely independent outputs, so
320            // the per-output Diagonal penalty is correct here (the #1587 Centered
321            // metric is softmax-specific — there is no shared reference class).
322            class_penalty_metric: crate::penalized_vector_glm::ClassPenaltyMetric::Diagonal,
323        },
324        &likelihood,
325        "fit_penalized_binomial_multi",
326    )?;
327
328    // η → μ = σ(η) is the binomial inverse link applied column-wise.
329    let fitted = fit.eta.mapv(sigmoid_stable);
330
331    Ok(BinomialMultiFitOutputs {
332        coefficients: fit.coefficients,
333        fitted_probabilities: fitted,
334        iterations: fit.iterations,
335        converged: fit.converged,
336        penalized_neg_log_likelihood: -fit.log_likelihood + fit.penalty_term,
337        deviance: -2.0 * fit.log_likelihood,
338    })
339}
340
341#[cfg(test)]
342mod tests {
343    use super::*;
344    use ndarray::Array3;
345
346    fn toy_inputs() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
347        let n = 12;
348        let p = 2;
349        let k = 2;
350        let design =
351            Array2::<f64>::from_shape_fn(
352                (n, p),
353                |(i, j)| {
354                    if j == 0 { 1.0 } else { ((i + 1) as f64).sin() }
355                },
356            );
357        let y =
358            Array2::<f64>::from_shape_fn((n, k), |(i, a)| if (i + a) % 2 == 0 { 1.0 } else { 0.0 });
359        let penalty = Array2::<f64>::eye(p);
360        let lambdas = Array1::<f64>::from_elem(k, 0.5);
361        (design, y, penalty, lambdas)
362    }
363
364    #[test]
365    fn fisher_override_none_reproduces_analytic_bit_for_bit() {
366        // Issue #349: a None override must give exactly the analytic result.
367        let (design, y, penalty, lambdas) = toy_inputs();
368        let base = fit_penalized_binomial_multi(BinomialMultiFitInputs {
369            design: design.view(),
370            y: y.view(),
371            penalty: penalty.view(),
372            lambdas: lambdas.view(),
373            row_weights: None,
374            fisher_w_override: None,
375            max_iter: 50,
376            tol: 1.0e-9,
377        })
378        .expect("analytic fit must succeed");
379        // Explicit None again — identical result.
380        let again = fit_penalized_binomial_multi(BinomialMultiFitInputs {
381            design: design.view(),
382            y: y.view(),
383            penalty: penalty.view(),
384            lambdas: lambdas.view(),
385            row_weights: None,
386            fisher_w_override: None,
387            max_iter: 50,
388            tol: 1.0e-9,
389        })
390        .expect("analytic fit must succeed");
391        for (a, b) in base.coefficients.iter().zip(again.coefficients.iter()) {
392            assert_eq!(a, b, "None override must be deterministic");
393        }
394    }
395
396    #[test]
397    fn out_of_range_response_is_rejected() {
398        // Issue #452: a finite but invalid entry (y = 2) makes the per-entry
399        // binomial log-likelihood unbounded in η, so it must be rejected rather
400        // than silently fit. The same guard covers negative entries.
401        let (design, y, penalty, lambdas) = toy_inputs();
402        let mut bad = y.clone();
403        bad[[0, 0]] = 2.0;
404        let err = fit_penalized_binomial_multi(BinomialMultiFitInputs {
405            design: design.view(),
406            y: bad.view(),
407            penalty: penalty.view(),
408            lambdas: lambdas.view(),
409            row_weights: None,
410            fisher_w_override: None,
411            max_iter: 50,
412            tol: 1.0e-9,
413        })
414        .expect_err("out-of-range response must error");
415        assert!(format!("{err}").contains("binomial proportion in [0,1]"));
416
417        let mut neg = y.clone();
418        neg[[1, 1]] = -0.5;
419        let err = fit_penalized_binomial_multi(BinomialMultiFitInputs {
420            design: design.view(),
421            y: neg.view(),
422            penalty: penalty.view(),
423            lambdas: lambdas.view(),
424            row_weights: None,
425            fisher_w_override: None,
426            max_iter: 50,
427            tol: 1.0e-9,
428        })
429        .expect_err("negative response must error");
430        assert!(format!("{err}").contains("binomial proportion in [0,1]"));
431    }
432
433    #[test]
434    fn fisher_override_shape_mismatch_is_rejected() {
435        let (design, y, penalty, lambdas) = toy_inputs();
436        let n = design.nrows();
437        let k = y.ncols();
438        let bad = Array3::<f64>::zeros((n, k + 1, k + 1));
439        let err = fit_penalized_binomial_multi(BinomialMultiFitInputs {
440            design: design.view(),
441            y: y.view(),
442            penalty: penalty.view(),
443            lambdas: lambdas.view(),
444            row_weights: None,
445            fisher_w_override: Some(bad.view()),
446            max_iter: 50,
447            tol: 1.0e-9,
448        })
449        .expect_err("mismatched override shape must error");
450        assert!(format!("{err}").contains("fisher_w_override shape"));
451    }
452
453    #[test]
454    fn fisher_override_replaces_curvature_diagonal() {
455        // A scaled curvature override changes the Newton step from β = 0:
456        // with curvature scaled by α the first step is 1/α of the analytic
457        // step (gradient unchanged), so the fitted β must differ from analytic.
458        let (design, y, penalty, lambdas) = toy_inputs();
459        let n = design.nrows();
460        let k = y.ncols();
461        // Analytic diagonal at β = 0 is μ(1−μ) = 0.25 for every column.
462        let mut over = Array3::<f64>::zeros((n, k, k));
463        for row in 0..n {
464            for a in 0..k {
465                over[[row, a, a]] = 0.25 * 4.0; // 4× the analytic curvature
466            }
467        }
468        let scaled = fit_penalized_binomial_multi(BinomialMultiFitInputs {
469            design: design.view(),
470            y: y.view(),
471            penalty: penalty.view(),
472            lambdas: lambdas.view(),
473            row_weights: None,
474            fisher_w_override: Some(over.view()),
475            max_iter: 1,
476            tol: 1.0e-9,
477        })
478        .expect("override fit must succeed");
479        let analytic = fit_penalized_binomial_multi(BinomialMultiFitInputs {
480            design: design.view(),
481            y: y.view(),
482            penalty: penalty.view(),
483            lambdas: lambdas.view(),
484            row_weights: None,
485            fisher_w_override: None,
486            max_iter: 1,
487            tol: 1.0e-9,
488        })
489        .expect("analytic fit must succeed");
490        let differs = scaled
491            .coefficients
492            .iter()
493            .zip(analytic.coefficients.iter())
494            .any(|(a, b)| (a - b).abs() > 1.0e-6);
495        assert!(differs, "scaled curvature override must change the step");
496    }
497}