Skip to main content

gam_models/gamlss/
dispersion_family.rs

1//! #913: dispersion-channel GAMLSS location-scale families.
2//!
3//! Extracted from `gamlss.rs` (issue #780); this module now owns the
4//! dispersion-channel joint-curvature corrections.
5
6use super::weighted_design_products::{mirror_upper_to_lower, xt_diag_x_design, xt_diag_y_design};
7// `Order2<2>::value()` is the JetScalar trait method; bring the trait into scope
8// so the dispersion row-NLL value reads (`-tower.value()`) resolve (E0599 fix).
9use super::{
10    BlockwiseTermFitResult, GamlssLambdaLayout, LOCATION_SCALE_N_OUTPUTS,
11    LocationScaleFamilyBuilder, build_location_scale_block, fit_location_scale_terms,
12    identity_penalty, solve_penalizedweighted_projection,
13};
14use crate::block_layout::block_count::validate_block_count;
15use crate::custom_family::{
16    BlockWorkingSet, BlockwiseFitOptions, CustomFamily, CustomFamilyBlockPsiDerivative,
17    FamilyEvaluation, ParameterBlockSpec, ParameterBlockState, PenaltyMatrix,
18};
19use crate::gamlss::GamlssError;
20use crate::model_types::UnifiedFitResult;
21use gam_linalg::matrix::LinearOperator;
22use gam_math::jet_scalar::JetScalar;
23use gam_terms::smooth::{
24    SpatialLengthScaleOptimizationOptions, TermCollectionDesign, TermCollectionSpec,
25};
26use ndarray::{Array1, Array2, s};
27use statrs::function::gamma::ln_gamma;
28
29// ============================================================================
30// #913: dispersion-channel GAMLSS location-scale families.
31//
32// `noise_formula` (a second linear predictor on the dispersion channel) was
33// wired only for Gaussian/Binomial location-scale and the survival families.
34// The genuine-dispersion mean families — NegativeBinomial, Gamma, Beta and
35// Tweedie — were mean-only with a single scalar dispersion. This module adds a
36// SINGLE generic two-block family that routes all four through the existing
37// blockwise REML engine and the shared `LocationScaleFamilyBuilder` /
38// `fit_location_scale_terms` plumbing, so the κ-coordinate assembly, warm
39// start, shrinkage-penalised scale block and result extraction are reused
40// verbatim. A family is added by supplying only its per-row log-likelihood and
41// the (mean, log-precision) working sets — everything else is shared.
42//
43// Block layout: block 0 = mean predictor (η_μ, log link for NB/Gamma/Tweedie,
44// logit for Beta); block 1 = log-precision predictor (η_d). The dispersion
45// channel models log(precision) uniformly — `θ` for NegativeBinomial, the
46// shape `ν` for Gamma, `φ` for Beta, and `1/φ` for Tweedie — so a larger η_d
47// always means *less* dispersion, matching the Gaussian/Binomial convention
48// where η_logσ smaller ⇒ tighter. With no `noise_formula` the log-precision
49// block is a single intercept and the fit reduces to the scalar-dispersion
50// model.
51//
52// NB2 with `(μ, θ)` and the exponential-dispersion members here with
53// `(μ, φ)` are Fisher-orthogonal in their standard mean/dispersion
54// parameterizations: Gamma uses shape `ν = 1/φ`, and Tweedie models
55// `log(1/φ)`, so those precision-channel transforms preserve zero expected
56// mean/dispersion cross information. Beta is the exception in this module's
57// mean/precision parameterization. For `Beta(μφ, (1−μ)φ)`,
58//
59//   I_{μ,φ} = φ · (μ ψ'(μφ) − (1−μ) ψ'((1−μ)φ)),
60//
61// so in predictor coordinates `(η_μ = logit μ, η_φ = log φ)` the Fisher cross
62// block is
63//
64//   I_{η_μ,η_φ} = μ(1−μ) φ² · (μ ψ'(μφ) − (1−μ) ψ'((1−μ)φ)),
65//
66// which is generically nonzero. Block-cyclic Fisher-scoring IRLS is still a
67// valid block coordinate solve for the point estimate, but joint-curvature
68// consumers (`log|H|`, coefficient covariance, posterior draws) must receive
69// Beta's off-diagonal coefficient block. Smoothing-parameter selection still
70// runs through the engine's first-order (gradient-only) outer path: the family
71// declines the dense outer Hessian capability because its working weights
72// couple the two blocks (`W_μ` depends on the precision and vice-versa), which
73// the block-local diagonal-drift hook cannot represent exactly.
74// ============================================================================
75
76/// The genuine-dispersion mean family whose precision (overdispersion) channel
77/// can carry a second `noise_formula` linear predictor (issue #913).
78#[derive(Clone, Copy, Debug, PartialEq)]
79pub enum DispersionFamilyKind {
80    /// NB2: `Var = μ + μ²/θ`; the precision channel models `log θ`.
81    NegativeBinomial,
82    /// Gamma with `Var = μ²/ν`; the precision channel models `log ν` (shape).
83    Gamma,
84    /// Beta(μφ, (1−μ)φ) with a logit mean link; the precision channel models
85    /// `log φ`.
86    Beta,
87    /// Tweedie compound Poisson–Gamma with `Var = φ μ^p`, fixed power `p`; the
88    /// precision channel models `log(1/φ)`. The per-row density uses the
89    /// saddlepoint (Nelder–Pregibon) approximation for `y > 0` and the exact
90    /// point mass at `y = 0`; this is the standard tractable Tweedie ML
91    /// surface (an exact-series φ-derivative is the remaining hard sub-item of
92    /// #913).
93    Tweedie { p: f64 },
94}
95
96impl DispersionFamilyKind {
97    pub const fn family_tag(self) -> &'static str {
98        match self {
99            DispersionFamilyKind::NegativeBinomial => FAMILY_NEGBIN_LOCATION_SCALE,
100            DispersionFamilyKind::Gamma => FAMILY_GAMMA_LOCATION_SCALE,
101            DispersionFamilyKind::Beta => FAMILY_BETA_LOCATION_SCALE,
102            DispersionFamilyKind::Tweedie { .. } => FAMILY_TWEEDIE_LOCATION_SCALE,
103        }
104    }
105
106    /// The mean link is logit for Beta (a probability mean) and log otherwise.
107    pub(crate) const fn mean_is_logit(self) -> bool {
108        matches!(self, DispersionFamilyKind::Beta)
109    }
110
111    /// The mean inverse link this dispersion family fits on: log for
112    /// NegativeBinomial / Gamma / Tweedie, logit for Beta. Single source of
113    /// truth shared by the CLI and FFI save paths so the persisted
114    /// `base_link` never diverges from the fitted channel.
115    pub fn base_link(self) -> gam_problem::InverseLink {
116        use gam_problem::{InverseLink, StandardLink};
117        if self.mean_is_logit() {
118            InverseLink::Standard(StandardLink::Logit)
119        } else {
120            InverseLink::Standard(StandardLink::Log)
121        }
122    }
123
124    /// The family's canonical [`LikelihoodSpec`] (mean response × mean link).
125    /// The overdispersion parameter is estimated by the log-precision channel,
126    /// so the response-family placeholder parameters (`phi`, `theta`) mirror
127    /// the [`resolve_family`](crate::fit_orchestration::materialize::resolve_family) defaults
128    /// and are not consumed as fixed values at predict time. This is the single
129    /// source of truth for the persisted location-scale likelihood so the CLI
130    /// and FFI save paths cannot diverge.
131    pub fn likelihood_spec(self) -> gam_problem::LikelihoodSpec {
132        use gam_problem::{InverseLink, LikelihoodSpec, ResponseFamily, StandardLink};
133        let response = match self {
134            DispersionFamilyKind::NegativeBinomial => ResponseFamily::NegativeBinomial {
135                theta: 1.0,
136                theta_fixed: false,
137            },
138            DispersionFamilyKind::Gamma => ResponseFamily::Gamma,
139            DispersionFamilyKind::Beta => ResponseFamily::Beta { phi: 1.0 },
140            DispersionFamilyKind::Tweedie { p } => ResponseFamily::Tweedie { p },
141        };
142        let link = if self.mean_is_logit() {
143            InverseLink::Standard(StandardLink::Logit)
144        } else {
145            InverseLink::Standard(StandardLink::Log)
146        };
147        LikelihoodSpec::new(response, link)
148    }
149}
150
151pub const FAMILY_NEGBIN_LOCATION_SCALE: &str = "negbin-location-scale";
152pub const FAMILY_GAMMA_LOCATION_SCALE: &str = "gamma-location-scale";
153pub const FAMILY_BETA_LOCATION_SCALE: &str = "beta-location-scale";
154pub const FAMILY_TWEEDIE_LOCATION_SCALE: &str = "tweedie-location-scale";
155
156/// `η` magnitude clamp shared by both channels (mirrors PIRLS `ETA_CLAMP`):
157/// keeps `exp(η)` and the logit jet away from overflow while staying in the
158/// smooth interior of every link.
159pub(super) const DISPERSION_ETA_CLAMP: f64 = 30.0;
160/// Floor for a per-row IRLS working weight / curvature so the block normal
161/// equations stay positive-definite. The working *response* always carries the
162/// exact score, so the stationary point (penalised score = 0) is independent
163/// of this floor; it only conditions the inner solve.
164pub(super) const DISPERSION_MIN_CURVATURE: f64 = 1e-12;
165
166/// Per-row working quantities for both channels at the current `(η_μ, η_d)`.
167pub(super) struct DispersionRowKernel {
168    pub(super) loglik: f64,
169    pub(super) mean_weight: f64,
170    pub(super) mean_response: f64,
171    pub(super) disp_weight: f64,
172    pub(super) disp_response: f64,
173}
174
175#[cfg(test)]
176mod test_support {
177    use super::*;
178
179    /// Test-oracle NB2 row NLL over a generic [`JetScalar<2>`], seeded on the
180    /// natural parameters `(μ, θ)`.
181    #[inline]
182    pub(super) fn dispersion_nb_nll_generic<S: gam_math::jet_scalar::JetScalar<2>>(
183        yi: f64,
184        mu_value: f64,
185        theta_value: f64,
186        wi: f64,
187    ) -> S {
188        let mu = S::variable(mu_value, 0);
189        let theta = S::variable(theta_value, 1);
190        let tpm = theta.add(&mu);
191        // (theta + yi).ln_gamma() - theta.ln_gamma() - ln_gamma(yi+1)
192        //   + theta*theta.ln() - theta*tpm.ln() + mu.ln()*yi - tpm.ln()*yi
193        let loglik = theta
194            .add(&S::constant(yi))
195            .ln_gamma()
196            .sub(&theta.ln_gamma())
197            .sub(&S::constant(ln_gamma(yi + 1.0)))
198            .add(&theta.mul(&theta.ln()))
199            .sub(&theta.mul(&tpm.ln()))
200            .add(&mu.ln().scale(yi))
201            .sub(&tpm.ln().scale(yi));
202        loglik.scale(-wi)
203    }
204
205    /// Test-oracle Gamma row NLL over a generic [`JetScalar<2>`], seeded on
206    /// `(μ, ν)`.
207    #[inline]
208    pub(super) fn dispersion_gamma_nll_generic<S: gam_math::jet_scalar::JetScalar<2>>(
209        yi: f64,
210        y_pos: f64,
211        mu_value: f64,
212        nu_value: f64,
213        wi: f64,
214    ) -> S {
215        let mu = S::variable(mu_value, 0);
216        let nu = S::variable(nu_value, 1);
217        // nu*nu.ln() - nu*mu.ln() - nu.ln_gamma() + (nu-1)*y_pos.ln() - nu*(mu.recip()*yi)
218        let loglik = nu
219            .mul(&nu.ln())
220            .sub(&nu.mul(&mu.ln()))
221            .sub(&nu.ln_gamma())
222            .add(&nu.sub(&S::constant(1.0)).scale(y_pos.ln()))
223            .sub(&nu.mul(&mu.recip().scale(yi)));
224        loglik.scale(-wi)
225    }
226
227    /// Test-oracle Beta row NLL over a generic [`JetScalar<2>`], seeded on
228    /// `(μ, φ)`.
229    #[inline]
230    pub(super) fn dispersion_beta_nll_generic<S: gam_math::jet_scalar::JetScalar<2>>(
231        yi: f64,
232        mu_value: f64,
233        phi_value: f64,
234        wi: f64,
235    ) -> S {
236        let mu = S::variable(mu_value, 0);
237        let phi = S::variable(phi_value, 1);
238        let one_minus_mu = S::constant(1.0).sub(&mu);
239        let yc = yi.clamp(1e-12, 1.0 - 1e-12);
240        let a = mu.mul(&phi);
241        let b = one_minus_mu.mul(&phi);
242        // phi.ln_gamma() - a.ln_gamma() - b.ln_gamma()
243        //   + (a-1)*yc.ln() + (b-1)*(1-yc).ln()
244        let loglik = phi
245            .ln_gamma()
246            .sub(&a.ln_gamma())
247            .sub(&b.ln_gamma())
248            .add(&a.sub(&S::constant(1.0)).scale(yc.ln()))
249            .add(&b.sub(&S::constant(1.0)).scale((1.0 - yc).ln()));
250        loglik.scale(-wi)
251    }
252
253    /// #1591 jet-prune oracle: full `Order2<2>` (value/grad/Hessian) NB2 row NLL.
254    ///
255    /// Production no longer consumes the mean (`μ`-axis) derivative channels of
256    /// this tower — the NB mean block is Fisher-orthogonal and hand-written
257    /// exactly in [`dispersion_row_kernel`] — so the hot path uses the pruned
258    /// single-axis [`dispersion_nb_disp_order2`] instead. This `K=2` form
259    /// survives only as the dense-`Tower4<2>` oracle pin
260    /// (`order2_matches_dense_tower_all_channels`).
261    #[inline]
262    pub(super) fn dispersion_nb_nll_order2(
263        yi: f64,
264        mu_value: f64,
265        theta_value: f64,
266        wi: f64,
267    ) -> gam_math::jet_scalar::Order2<2> {
268        type O2 = gam_math::jet_scalar::Order2<2>;
269
270        let mu = O2::variable(mu_value, 0);
271        let theta = O2::variable(theta_value, 1);
272        let tpm = theta.add(&mu);
273        let theta_plus_y = theta.add(&O2::constant(yi));
274        let loglik = order2_ln_gamma(&theta_plus_y)
275            .sub(&order2_ln_gamma(&theta))
276            .sub(&O2::constant(ln_gamma(yi + 1.0)))
277            .add(&theta.mul(&theta.ln()))
278            .sub(&theta.mul(&tpm.ln()))
279            .add(&mu.ln().scale(yi))
280            .sub(&tpm.ln().scale(yi));
281        loglik.scale(-wi)
282    }
283
284    /// #1591 jet-prune oracle: full `Order2<2>` Gamma row NLL. As with NB, the
285    /// mean axis is unused in production (hand-written, Fisher-orthogonal); the
286    /// hot path uses the single-axis [`dispersion_gamma_disp_order2`]. Kept only
287    /// as the dense-tower oracle pin.
288    #[inline]
289    pub(super) fn dispersion_gamma_nll_order2(
290        yi: f64,
291        y_pos: f64,
292        mu_value: f64,
293        nu_value: f64,
294        wi: f64,
295    ) -> gam_math::jet_scalar::Order2<2> {
296        type O2 = gam_math::jet_scalar::Order2<2>;
297
298        let mu = O2::variable(mu_value, 0);
299        let nu = O2::variable(nu_value, 1);
300        let loglik = nu
301            .mul(&nu.ln())
302            .sub(&nu.mul(&mu.ln()))
303            .sub(&order2_ln_gamma(&nu))
304            .add(&nu.sub(&O2::constant(1.0)).scale(y_pos.ln()))
305            .sub(&nu.mul(&mu.recip().scale(yi)));
306        loglik.scale(-wi)
307    }
308}
309
310/// Production `Order2<2>` Beta row NLL (value/grad/Hessian hot path; the cross
311/// channel `h()[0][1]` feeds the Beta observed cross weight).
312#[inline]
313pub(crate) fn dispersion_beta_nll_order2(
314    yi: f64,
315    mu_value: f64,
316    phi_value: f64,
317    wi: f64,
318) -> gam_math::jet_scalar::Order2<2> {
319    type O2 = gam_math::jet_scalar::Order2<2>;
320
321    let mu = O2::variable(mu_value, 0);
322    let phi = O2::variable(phi_value, 1);
323    let one_minus_mu = O2::constant(1.0).sub(&mu);
324    let yc = yi.clamp(1e-12, 1.0 - 1e-12);
325    let a = mu.mul(&phi);
326    let b = one_minus_mu.mul(&phi);
327    let loglik = order2_ln_gamma(&phi)
328        .sub(&order2_ln_gamma(&a))
329        .sub(&order2_ln_gamma(&b))
330        .add(&a.sub(&O2::constant(1.0)).scale(yc.ln()))
331        .add(&b.sub(&O2::constant(1.0)).scale((1.0 - yc).ln()));
332    loglik.scale(-wi)
333}
334
335#[inline]
336fn order2_ln_gamma<const K: usize>(
337    x: &gam_math::jet_scalar::Order2<K>,
338) -> gam_math::jet_scalar::Order2<K> {
339    gam_math::jet_scalar::Order2(
340        x.0.compose_unary(gam_math::jet_tower::ln_gamma_derivative_stack_order2(x.0.v)),
341    )
342}
343
344// ============================================================================
345// #1591 jet-prune: single-axis (`K=1`) dispersion-channel towers.
346//
347// For NegativeBinomial / Gamma / Tweedie the production row kernel consumes ONLY
348// the dispersion-axis derivatives (`g[disp]`, `h[disp][disp]`) and the value;
349// the mean block is Fisher-orthogonal and assembled in closed form. Seeding the
350// mean as a CONSTANT and the dispersion parameter as the SOLE jet variable
351// therefore yields a tower whose `(value, g[0], h[0][0])` are `to_bits`-
352// identical to the consumed `(value, g[1], h[1][1])` of the old `Order2<2>`
353// tower — the mean seed only ever populated the now-discarded `g[mean]` /
354// `h[mean][·]` channels (Leibniz/Faà-di-Bruno never read the dispersion-axis
355// channels off the mean seed). Collapsing `K=2 → K=1` quarters the Hessian
356// tensor (1 entry vs 4) and halves the gradient, with no change to any consumed
357// float bit. The `ln_gamma` derivative stacks are unchanged (the irreducible
358// transcendental cost), so this trims the rational composition, not the special
359// functions.
360// ============================================================================
361
362/// Pruned single-axis NB2 dispersion tower: `θ` is the sole jet variable
363/// (axis 0), `μ` a constant. `value`/`g[0]`/`h[0][0]` reproduce the consumed
364/// `value`/`g[1]`/`h[1][1]` of `dispersion_nb_nll_order2` bit-for-bit.
365#[inline]
366pub(crate) fn dispersion_nb_disp_order2(
367    yi: f64,
368    mu_value: f64,
369    theta_value: f64,
370    wi: f64,
371) -> gam_math::jet_scalar::Order2<1> {
372    type O1 = gam_math::jet_scalar::Order2<1>;
373
374    let mu = O1::constant(mu_value);
375    let theta = O1::variable(theta_value, 0);
376    let tpm = theta.add(&mu);
377    let theta_plus_y = theta.add(&O1::constant(yi));
378    let loglik = order2_ln_gamma(&theta_plus_y)
379        .sub(&order2_ln_gamma(&theta))
380        .sub(&O1::constant(ln_gamma(yi + 1.0)))
381        .add(&theta.mul(&theta.ln()))
382        .sub(&theta.mul(&tpm.ln()))
383        .add(&mu.ln().scale(yi))
384        .sub(&tpm.ln().scale(yi));
385    loglik.scale(-wi)
386}
387
388/// Pruned single-axis Gamma dispersion tower: `ν` is the sole jet variable
389/// (axis 0), `μ` a constant. Consumed channels match
390/// `dispersion_gamma_nll_order2` index-1 bit-for-bit.
391#[inline]
392pub(crate) fn dispersion_gamma_disp_order2(
393    yi: f64,
394    y_pos: f64,
395    mu_value: f64,
396    nu_value: f64,
397    wi: f64,
398) -> gam_math::jet_scalar::Order2<1> {
399    type O1 = gam_math::jet_scalar::Order2<1>;
400
401    let mu = O1::constant(mu_value);
402    let nu = O1::variable(nu_value, 0);
403    let loglik = nu
404        .mul(&nu.ln())
405        .sub(&nu.mul(&mu.ln()))
406        .sub(&order2_ln_gamma(&nu))
407        .add(&nu.sub(&O1::constant(1.0)).scale(y_pos.ln()))
408        .sub(&nu.mul(&mu.recip().scale(yi)));
409    loglik.scale(-wi)
410}
411
412/// Pruned single-axis Tweedie dispersion tower seeded on the predictor `η_d`
413/// (axis 0), with `η_μ` a constant (so `μ = exp(η_μ)` carries no jet). The
414/// `φ = exp(−η_d)` chain and its nonlinear `∂²φ/∂η_d²` curvature are carried
415/// exactly as in `dispersion_tweedie_nll_generic`; `value`/`g[0]`/`h[0][0]`
416/// match that program's `value`/`g[1]`/`h[1][1]` bit-for-bit.
417#[inline]
418pub(crate) fn dispersion_tweedie_disp_order2(
419    yi: f64,
420    eta_mu: f64,
421    eta_d: f64,
422    p: f64,
423    wi: f64,
424) -> gam_math::jet_scalar::Order2<1> {
425    type O1 = gam_math::jet_scalar::Order2<1>;
426
427    let one_minus_p = 1.0 - p;
428    let two_minus_p = 2.0 - p;
429    let mu = O1::constant(eta_mu).exp();
430    let phi = O1::variable(eta_d, 0).scale(-1.0).exp();
431    if yi > 0.0 {
432        let dev = mu
433            .powf(two_minus_p)
434            .scale(1.0 / two_minus_p)
435            .sub(&mu.powf(one_minus_p).scale(yi / one_minus_p))
436            .add(&O1::constant(
437                yi.powf(two_minus_p) / (one_minus_p * two_minus_p),
438            ))
439            .scale(2.0);
440        let loglik = dev
441            .mul(&phi.recip().scale(-0.5))
442            .sub(&phi.scale(2.0 * std::f64::consts::PI).ln().scale(0.5))
443            .sub(&O1::constant(0.5 * p * yi.ln()));
444        loglik.scale(-wi)
445    } else {
446        let c = mu.powf(two_minus_p).scale(1.0 / two_minus_p);
447        let loglik = c.mul(&phi.recip()).scale(-1.0);
448        loglik.scale(-wi)
449    }
450}
451
452// ============================================================================
453// #1591 jet-prune: value-only (`K=0`) row negative-log-likelihood.
454//
455// `log_likelihood_only` reads ONLY `row.loglik = -tower.value()`; the full row
456// kernel it used to call evaluated every dispersion tower's gradient AND Hessian
457// — including the digamma/trigamma derivative stacks — purely to discard them.
458// These functions evaluate the SAME value-channel program in plain `f64`, so
459// they are `to_bits`-identical to `-tower.value()` (the jet value channel is the
460// naive scalar evaluation: `mul.v = a.v*b.v`, `compose.v = stack[0]`), while
461// touching only `ln_gamma` (stack slot 0) and never the digamma/trigamma slots.
462// On a per-row loglik that is the dominant transcendental saving.
463// ============================================================================
464
465/// NB2 row NLL value, plain `f64`, bit-identical to
466/// `-dispersion_nb_disp_order2(..).value()`.
467#[inline]
468fn dispersion_nb_neg_loglik(yi: f64, mu: f64, theta: f64, wi: f64) -> f64 {
469    let tpm = theta + mu;
470    let s = ln_gamma(theta + yi) - ln_gamma(theta) - ln_gamma(yi + 1.0) + theta * theta.ln()
471        - theta * tpm.ln()
472        + mu.ln() * yi
473        - tpm.ln() * yi;
474    -(s * -wi)
475}
476
477/// Gamma row NLL value, plain `f64`, bit-identical to
478/// `-dispersion_gamma_disp_order2(..).value()`.
479#[inline]
480fn dispersion_gamma_neg_loglik(yi: f64, y_pos: f64, mu: f64, nu: f64, wi: f64) -> f64 {
481    // NB: the jet forms `μ.recip().scale(yi)` = `(1/μ)·yᵢ` (reciprocal then
482    // multiply), NOT `yᵢ/μ` (single divide) — these differ in the last bit, so
483    // the value path must reproduce the reciprocal-then-multiply exactly.
484    let s = nu * nu.ln() - nu * mu.ln() - ln_gamma(nu) + (nu - 1.0) * y_pos.ln()
485        - nu * ((1.0 / mu) * yi);
486    -(s * -wi)
487}
488
489/// Beta row NLL value, plain `f64`, bit-identical to
490/// `-dispersion_beta_nll_order2(..).value()`.
491#[inline]
492fn dispersion_beta_neg_loglik(yi: f64, mu: f64, phi: f64, wi: f64) -> f64 {
493    let one_minus_mu = 1.0 - mu;
494    let yc = yi.clamp(1e-12, 1.0 - 1e-12);
495    let a = mu * phi;
496    let b = one_minus_mu * phi;
497    let s = ln_gamma(phi) - ln_gamma(a) - ln_gamma(b)
498        + (a - 1.0) * yc.ln()
499        + (b - 1.0) * (1.0 - yc).ln();
500    -(s * -wi)
501}
502
503/// Tweedie row NLL value, plain `f64`, bit-identical to
504/// `-dispersion_tweedie_disp_order2(..).value()` (both density branches).
505#[inline]
506fn dispersion_tweedie_neg_loglik(yi: f64, eta_mu: f64, eta_d: f64, p: f64, wi: f64) -> f64 {
507    let one_minus_p = 1.0 - p;
508    let two_minus_p = 2.0 - p;
509    let mu = eta_mu.exp();
510    let phi = (-eta_d).exp();
511    let s = if yi > 0.0 {
512        let dev = (mu.powf(two_minus_p) * (1.0 / two_minus_p)
513            - mu.powf(one_minus_p) * (yi / one_minus_p)
514            + yi.powf(two_minus_p) / (one_minus_p * two_minus_p))
515            * 2.0;
516        dev * ((1.0 / phi) * -0.5)
517            - (phi * (2.0 * std::f64::consts::PI)).ln() * 0.5
518            - 0.5 * p * yi.ln()
519    } else {
520        let c = mu.powf(two_minus_p) * (1.0 / two_minus_p);
521        (c * (1.0 / phi)) * -1.0
522    };
523    -(s * -wi)
524}
525
526/// Value-only row negative log-likelihood for one observation — the pruned hot
527/// path for [`CustomFamily::log_likelihood_only`]. Mirrors the link/clamp
528/// preamble of [`dispersion_row_kernel`] exactly, then evaluates ONLY the value
529/// channel (no gradient/Hessian, no digamma/trigamma). Returns `row.loglik`
530/// `to_bits`-identically.
531#[inline]
532pub(crate) fn dispersion_row_loglik(
533    kind: DispersionFamilyKind,
534    yi: f64,
535    eta_mu: f64,
536    eta_d: f64,
537    prior_weight: f64,
538) -> f64 {
539    let wi = prior_weight.max(0.0);
540    let em = eta_mu.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
541    let ed = eta_d.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
542    match kind {
543        DispersionFamilyKind::NegativeBinomial => {
544            let mu = em.exp().max(1e-300);
545            let theta = ed.exp().max(1e-12);
546            dispersion_nb_neg_loglik(yi, mu, theta, wi)
547        }
548        DispersionFamilyKind::Gamma => {
549            let mu = em.exp().max(1e-300);
550            let nu = ed.exp().max(1e-12);
551            let y_pos = yi.max(1e-300);
552            dispersion_gamma_neg_loglik(yi, y_pos, mu, nu, wi)
553        }
554        DispersionFamilyKind::Beta => {
555            let mu = (1.0 / (1.0 + (-em).exp())).clamp(1e-12, 1.0 - 1e-12);
556            let phi = ed.exp().max(1e-12);
557            dispersion_beta_neg_loglik(yi, mu, phi, wi)
558        }
559        DispersionFamilyKind::Tweedie { p } => dispersion_tweedie_neg_loglik(yi, em, ed, p, wi),
560    }
561}
562
563#[inline]
564pub(crate) fn beta_observed_cross_weight_eta(yi: f64, mu: f64, phi: f64, wi: f64) -> f64 {
565    let q = (mu * (1.0 - mu)).max(1e-12);
566    let tower = dispersion_beta_nll_order2(yi, mu, phi, wi);
567    q * phi * tower.h()[0][1]
568}
569
570#[inline]
571pub(crate) fn dispersion_row_cross_weight(
572    kind: DispersionFamilyKind,
573    yi: f64,
574    eta_mu: f64,
575    eta_d: f64,
576    prior_weight: f64,
577) -> f64 {
578    let wi = prior_weight.max(0.0);
579    if wi == 0.0 {
580        return 0.0;
581    }
582    let em = eta_mu.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
583    let ed = eta_d.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
584    match kind {
585        DispersionFamilyKind::Beta => {
586            let mu = (1.0 / (1.0 + (-em).exp())).clamp(1e-12, 1.0 - 1e-12);
587            let phi = ed.exp().max(1e-12);
588            beta_observed_cross_weight_eta(yi, mu, phi, wi)
589        }
590        DispersionFamilyKind::NegativeBinomial
591        | DispersionFamilyKind::Gamma
592        | DispersionFamilyKind::Tweedie { .. } => 0.0,
593    }
594}
595
596#[inline]
597pub(crate) fn tower_score_info<const K: usize>(
598    tower: &gam_math::jet_scalar::Order2<K>,
599    idx: usize,
600    wi: f64,
601) -> (f64, f64) {
602    if wi == 0.0 {
603        (0.0, 0.0)
604    } else {
605        (-tower.g()[idx] / wi, tower.h()[idx][idx] / wi)
606    }
607}
608
609/// Evaluate the row log-likelihood and the (mean, log-precision) Fisher-scoring
610/// working sets for one observation. `eta_mu`/`eta_d` already include any
611/// per-channel offset (they are the block predictors). `prior_weight` is the
612/// observation's prior weight.
613pub(super) fn dispersion_row_kernel(
614    kind: DispersionFamilyKind,
615    yi: f64,
616    eta_mu: f64,
617    eta_d: f64,
618    prior_weight: f64,
619) -> DispersionRowKernel {
620    let wi = prior_weight.max(0.0);
621    let em = eta_mu.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
622    let ed = eta_d.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
623    match kind {
624        DispersionFamilyKind::NegativeBinomial => {
625            let mu = em.exp().max(1e-300);
626            let theta = ed.exp().max(1e-12); // precision (size)
627            let tpm = theta + mu;
628            let tower = dispersion_nb_disp_order2(yi, mu, theta, wi);
629            let (s_theta, info_theta_raw) = tower_score_info(&tower, 0, wi);
630            let loglik = -tower.value();
631            let info_mu = if wi == 0.0 {
632                DISPERSION_MIN_CURVATURE
633            } else {
634                (theta / (mu * tpm)).max(DISPERSION_MIN_CURVATURE)
635            };
636            let score_mu = theta * (yi - mu) / (mu * tpm);
637            let mean_weight = wi * mu * mu * info_mu;
638            let mean_response = em + score_mu / (mu * info_mu);
639            let info_theta = info_theta_raw;
640            let info_pos = info_theta.max(DISPERSION_MIN_CURVATURE);
641            let disp_weight = wi * theta * theta * info_pos;
642            let disp_response = ed + s_theta / (theta * info_pos);
643            DispersionRowKernel {
644                loglik,
645                mean_weight,
646                mean_response,
647                disp_weight,
648                disp_response,
649            }
650        }
651        DispersionFamilyKind::Gamma => {
652            let mu = em.exp().max(1e-300);
653            let nu = ed.exp().max(1e-12); // precision = shape ν
654            let y_pos = yi.max(1e-300);
655            let tower = dispersion_gamma_disp_order2(yi, y_pos, mu, nu, wi);
656            let (s_nu, info_nu_raw) = tower_score_info(&tower, 0, wi);
657            let loglik = -tower.value();
658            let info_mu = if wi == 0.0 {
659                DISPERSION_MIN_CURVATURE
660            } else {
661                (nu / (mu * mu)).max(DISPERSION_MIN_CURVATURE)
662            };
663            let score_mu = nu * (yi - mu) / (mu * mu);
664            let mean_weight = wi * mu * mu * info_mu;
665            let mean_response = em + score_mu / (mu * info_mu);
666            let info_nu = info_nu_raw.max(DISPERSION_MIN_CURVATURE);
667            let disp_weight = wi * nu * nu * info_nu;
668            let disp_response = ed + s_nu / (nu * info_nu);
669            DispersionRowKernel {
670                loglik,
671                mean_weight,
672                mean_response,
673                disp_weight,
674                disp_response,
675            }
676        }
677        DispersionFamilyKind::Beta => {
678            // logit mean link.
679            let mu = (1.0 / (1.0 + (-em).exp())).clamp(1e-12, 1.0 - 1e-12);
680            let phi = ed.exp().max(1e-12); // precision
681            let q = (mu * (1.0 - mu)).max(1e-12); // dμ/dη
682            let tower = dispersion_beta_nll_order2(yi, mu, phi, wi);
683            let (score_mu, info_mu_raw) = tower_score_info(&tower, 0, wi);
684            let (s_phi, info_phi_raw) = tower_score_info(&tower, 1, wi);
685            let loglik = -tower.value();
686            let info_mu = info_mu_raw.max(DISPERSION_MIN_CURVATURE);
687            let mean_weight = wi * q * q * info_mu;
688            let mean_response = em + score_mu / (q * info_mu);
689            let info_phi = info_phi_raw.max(DISPERSION_MIN_CURVATURE);
690            let disp_weight = wi * phi * phi * info_phi;
691            let disp_response = ed + s_phi / (phi * info_phi);
692            DispersionRowKernel {
693                loglik,
694                mean_weight,
695                mean_response,
696                disp_weight,
697                disp_response,
698            }
699        }
700        DispersionFamilyKind::Tweedie { p } => {
701            let mu = em.exp().max(1e-300);
702            // Precision channel models log(1/φ) ⇒ φ = exp(−η_d).
703            let phi = (-ed).exp().max(1e-12);
704            let two_minus_p = 2.0 - p;
705            // Mean channel: the quasi-score `(y−μ)/μ` and Fisher weight
706            // `μ^{2−p}/φ` are simple closed forms (and the mean block is
707            // Fisher-orthogonal to the dispersion block in this
708            // parameterization), so they stay hand-written exactly as the
709            // NB/Gamma mean arms do.
710            let mean_weight = wi * mu.powf(two_minus_p) / phi;
711            let mean_response = em + (yi - mu) / mu;
712            // Dispersion channel: the η_d-space score and OBSERVED information
713            // come straight off the single-expression tower seeded on `η_d`
714            // (#932), so the saddlepoint/point-mass branch split, the
715            // `φ = exp(−η_d)` chain and its nonlinear `∂²φ/∂η_d²` curvature
716            // correction are all mechanically carried — no per-branch
717            // `s_phi`/`s_eta`/`curvature_eta` hand calculus. #1591: only the
718            // η_d axis is consumed, so the tower is the pruned single-axis
719            // `Order2<1>` (`η_μ` enters as a constant).
720            let tower = dispersion_tweedie_disp_order2(yi, em, ed, p, wi);
721            let loglik = -tower.value();
722            // η_d-space score and observed information off the tower, via the
723            // same helper the NB/Gamma/Beta arms use (returns `(0, 0)` when the
724            // prior weight is zero, so the row stays excluded below).
725            let (s_eta, info_eta_raw) = tower_score_info(&tower, 0, wi);
726            let curvature_eta = if wi == 0.0 {
727                DISPERSION_MIN_CURVATURE
728            } else {
729                info_eta_raw.max(DISPERSION_MIN_CURVATURE)
730            };
731            // The working response divides by this per-row curvature so the
732            // prior weight cancels (and a zero-prior-weight row stays excluded
733            // via `disp_weight = 0`).
734            let disp_weight = wi * curvature_eta;
735            let disp_response = ed + s_eta / curvature_eta;
736            DispersionRowKernel {
737                loglik,
738                mean_weight,
739                mean_response,
740                disp_weight,
741                disp_response,
742            }
743        }
744    }
745}
746
747/// Two-block GAMLSS family for the genuine-dispersion mean families (#913).
748#[derive(Clone)]
749pub(crate) struct DispersionGlmLocationScaleFamily {
750    pub(crate) kind: DispersionFamilyKind,
751    pub(crate) y: Array1<f64>,
752    pub(crate) weights: Array1<f64>,
753}
754
755impl DispersionGlmLocationScaleFamily {
756    pub(crate) const BLOCK_MEAN: usize = 0;
757    pub(crate) const BLOCK_DISP: usize = 1;
758}
759
760impl CustomFamily for DispersionGlmLocationScaleFamily {
761    // Preserve the pre-gam#1395 behavior: the trait default flipped to OFF (the
762    // flat-prior exact-Newton objective carries no Jeffreys term), so families
763    // that historically armed the term by default opt back in explicitly.
764    fn joint_jeffreys_term_required(&self) -> bool {
765        true
766    }
767
768    fn evaluate(&self, block_states: &[ParameterBlockState]) -> Result<FamilyEvaluation, String> {
769        validate_block_count::<GamlssError>(self.kind.family_tag(), 2, block_states.len())?;
770        let eta_mu = &block_states[Self::BLOCK_MEAN].eta;
771        let eta_d = &block_states[Self::BLOCK_DISP].eta;
772        let n = self.y.len();
773        if eta_mu.len() != n || eta_d.len() != n || self.weights.len() != n {
774            return Err(format!(
775                "{} row-count mismatch: y={n}, eta_mu={}, eta_d={}, weights={}",
776                self.kind.family_tag(),
777                eta_mu.len(),
778                eta_d.len(),
779                self.weights.len()
780            ));
781        }
782        let mut log_likelihood = 0.0;
783        let mut mean_weights = Array1::<f64>::zeros(n);
784        let mut mean_response = Array1::<f64>::zeros(n);
785        let mut disp_weights = Array1::<f64>::zeros(n);
786        let mut disp_response = Array1::<f64>::zeros(n);
787        for i in 0..n {
788            let row =
789                dispersion_row_kernel(self.kind, self.y[i], eta_mu[i], eta_d[i], self.weights[i]);
790            if row.loglik.is_finite() {
791                log_likelihood += row.loglik;
792            }
793            mean_weights[i] = row.mean_weight.max(0.0);
794            mean_response[i] = row.mean_response;
795            disp_weights[i] = row.disp_weight.max(0.0);
796            disp_response[i] = row.disp_response;
797        }
798        Ok(FamilyEvaluation {
799            log_likelihood,
800            blockworking_sets: vec![
801                BlockWorkingSet::diagonal_checked(mean_response, mean_weights)?,
802                BlockWorkingSet::diagonal_checked(disp_response, disp_weights)?,
803            ],
804        })
805    }
806
807    fn log_likelihood_only(&self, block_states: &[ParameterBlockState]) -> Result<f64, String> {
808        validate_block_count::<GamlssError>(self.kind.family_tag(), 2, block_states.len())?;
809        let eta_mu = &block_states[Self::BLOCK_MEAN].eta;
810        let eta_d = &block_states[Self::BLOCK_DISP].eta;
811        let mut ll = 0.0;
812        for i in 0..self.y.len() {
813            // #1591 prune: the objective needs only the row log-likelihood, so
814            // evaluate the value channel alone (`to_bits`-identical to
815            // `dispersion_row_kernel(..).loglik`) and skip every gradient,
816            // Hessian and digamma/trigamma derivative-stack evaluation.
817            let loglik =
818                dispersion_row_loglik(self.kind, self.y[i], eta_mu[i], eta_d[i], self.weights[i]);
819            if loglik.is_finite() {
820                ll += loglik;
821            }
822        }
823        Ok(ll)
824    }
825
826    fn coefficient_hessian_cost(&self, specs: &[ParameterBlockSpec]) -> u64 {
827        crate::location_scale_engine::location_scale_coefficient_hessian_cost(
828            self.y.len() as u64,
829            specs,
830        )
831    }
832
833    /// Exact joint coefficient-space Hessian `H_L = -∇²log L` in flattened
834    /// `[mean | log-precision]` block order.
835    ///
836    /// All four members assemble the same `Xᵀ diag(W) X` blocks; the cross
837    /// block is the per-row mixed weight `dispersion_row_cross_weight`. Beta
838    /// carries a genuinely nonzero (η_μ, η_φ) cross weight; the Fisher-
839    /// orthogonal members (NegativeBinomial / Gamma / Tweedie) report a zero
840    /// cross weight, so this returns their exact *block-diagonal* joint
841    /// Hessian. Returning that block-diagonal `H_L` — rather than `None` —
842    /// is what lets the multi-block outer-REML path (`build_joint_hessian_
843    /// closures` → `joint_outer_evaluate`) and the joint posterior covariance
844    /// (`compute_joint_covariance`) run for these families instead of failing
845    /// the "multi-block families must provide a joint outer path" gate and
846    /// silently escalating to a degraded ρ-seed fit with no covariance/EDF
847    /// (gam#1119). The orthogonal members additionally declare
848    /// `likelihood_blocks_uncoupled() = true` so the directional-derivative
849    /// and Jeffreys dispatch route through the block-diagonal-exact fallback
850    /// rather than rejecting the structurally-uncoupled Hessian.
851    fn exact_newton_joint_hessian_with_specs(
852        &self,
853        block_states: &[ParameterBlockState],
854        specs: &[ParameterBlockSpec],
855    ) -> Result<Option<Array2<f64>>, String> {
856        validate_block_count::<GamlssError>(self.kind.family_tag(), 2, block_states.len())?;
857        if specs.len() != 2 {
858            return Err(format!(
859                "{} exact joint Hessian expects 2 specs, got {}",
860                self.kind.family_tag(),
861                specs.len()
862            ));
863        }
864        let eta_mu = &block_states[Self::BLOCK_MEAN].eta;
865        let eta_d = &block_states[Self::BLOCK_DISP].eta;
866        let n = self.y.len();
867        if eta_mu.len() != n || eta_d.len() != n || self.weights.len() != n {
868            return Err(format!(
869                "{} exact joint Hessian row-count mismatch: y={n}, eta_mu={}, eta_d={}, weights={}",
870                self.kind.family_tag(),
871                eta_mu.len(),
872                eta_d.len(),
873                self.weights.len()
874            ));
875        }
876
877        let eval = self.evaluate(block_states)?;
878        let BlockWorkingSet::Diagonal {
879            working_weights: mean_weights,
880            ..
881        } = &eval.blockworking_sets[Self::BLOCK_MEAN]
882        else {
883            return Err(format!(
884                "{} dispersion mean block did not return diagonal weights",
885                self.kind.family_tag()
886            ));
887        };
888        let BlockWorkingSet::Diagonal {
889            working_weights: disp_weights,
890            ..
891        } = &eval.blockworking_sets[Self::BLOCK_DISP]
892        else {
893            return Err(format!(
894                "{} dispersion precision block did not return diagonal weights",
895                self.kind.family_tag()
896            ));
897        };
898
899        let cross_weights = Array1::from_shape_fn(n, |i| {
900            dispersion_row_cross_weight(self.kind, self.y[i], eta_mu[i], eta_d[i], self.weights[i])
901        });
902        let mean_spec = &specs[Self::BLOCK_MEAN];
903        let disp_spec = &specs[Self::BLOCK_DISP];
904        if mean_spec.design.nrows() != n || disp_spec.design.nrows() != n {
905            return Err(format!(
906                "{} exact joint Hessian design row mismatch: y={n}, mean rows={}, precision rows={}",
907                self.kind.family_tag(),
908                mean_spec.design.nrows(),
909                disp_spec.design.nrows()
910            ));
911        }
912        let p_mean = mean_spec.design.ncols();
913        let p_disp = disp_spec.design.ncols();
914        if block_states[Self::BLOCK_MEAN].beta.len() != p_mean
915            || block_states[Self::BLOCK_DISP].beta.len() != p_disp
916        {
917            return Err(format!(
918                "{} exact joint Hessian beta/design mismatch: mean beta {} vs cols {}, precision beta {} vs cols {}",
919                self.kind.family_tag(),
920                block_states[Self::BLOCK_MEAN].beta.len(),
921                p_mean,
922                block_states[Self::BLOCK_DISP].beta.len(),
923                p_disp
924            ));
925        }
926
927        let h_mean = xt_diag_x_design(&mean_spec.design, mean_weights)?;
928        let h_cross = xt_diag_y_design(&mean_spec.design, &cross_weights, &disp_spec.design)?;
929        let h_disp = xt_diag_x_design(&disp_spec.design, disp_weights)?;
930        let total = p_mean + p_disp;
931        let mut h = Array2::<f64>::zeros((total, total));
932        h.slice_mut(s![0..p_mean, 0..p_mean]).assign(&h_mean);
933        h.slice_mut(s![0..p_mean, p_mean..total]).assign(&h_cross);
934        h.slice_mut(s![p_mean..total, p_mean..total])
935            .assign(&h_disp);
936        mirror_upper_to_lower(&mut h);
937        Ok(Some(h))
938    }
939
940    /// Whether the joint likelihood Hessian is block-diagonal in the
941    /// `[mean | log-precision]` coefficient vector.
942    ///
943    /// `Beta(μφ, (1−μ)φ)` carries a genuinely nonzero `(η_μ, η_φ)` Fisher
944    /// cross block (see the module header), so its blocks are coupled. The
945    /// remaining members are Fisher-orthogonal in their mean/precision
946    /// parameterizations — NB2 `(μ, θ)`, Gamma shape `ν = 1/φ`, Tweedie
947    /// `log(1/φ)` — so `∂²L/∂β_μ∂β_d = 0` and the joint Hessian is exactly
948    /// block-diagonal. Declaring that here lets the trait's directional-
949    /// derivative / Jeffreys dispatch accept the block-diagonal joint Hessian
950    /// via the working-set-exact fallback instead of rejecting it as an
951    /// untrusted structurally-uncoupled override (which would strand the
952    /// outer-REML gradient with a "dH unavailable" error, gam#1119).
953    fn likelihood_blocks_uncoupled(&self) -> bool {
954        !matches!(self.kind, DispersionFamilyKind::Beta)
955    }
956
957    /// The mean and precision working weights couple across both blocks, which
958    /// the block-local diagonal drift hook cannot represent, so decline the
959    /// dense outer Hessian capability whenever the actual two-block (or
960    /// larger) geometry is in play; a degenerate single-block probe — there
961    /// is no cross-block coupling to reject — keeps the trait default's
962    /// availability verdict.
963    ///
964    /// The override still validates the block-spec slice it is handed (the
965    /// same consistency check the trait default's assertion bottoms out in)
966    /// so a malformed probe is reported here rather than downstream.
967    fn outer_hyper_hessian_dense_available(&self, specs: &[ParameterBlockSpec]) -> bool {
968        assert!(
969            crate::custom_family::validate_blockspec_consistency(specs).is_ok(),
970            "DispersionGlmLocationScale outer hyper-Hessian dense availability: \
971             inconsistent parameter block specs"
972        );
973        specs.len() < 2
974    }
975}
976
977/// Term spec consumed by [`fit_dispersion_glm_location_scale_terms`]; mirrors
978/// [`GaussianLocationScaleTermSpec`](super::GaussianLocationScaleTermSpec) with
979/// the dispersion channel in place of the Gaussian log-σ channel.
980pub struct DispersionGlmLocationScaleTermSpec {
981    pub kind: DispersionFamilyKind,
982    pub y: Array1<f64>,
983    pub weights: Array1<f64>,
984    pub meanspec: TermCollectionSpec,
985    pub log_dispspec: TermCollectionSpec,
986    pub mean_offset: Array1<f64>,
987    pub log_disp_offset: Array1<f64>,
988}
989
990pub(crate) struct DispersionGlmLocationScaleTermBuilder {
991    pub(crate) kind: DispersionFamilyKind,
992    pub(crate) y: Array1<f64>,
993    pub(crate) weights: Array1<f64>,
994    pub(crate) meanspec: TermCollectionSpec,
995    pub(crate) noisespec: TermCollectionSpec,
996    pub(crate) mean_offset: Array1<f64>,
997    pub(crate) noise_offset: Array1<f64>,
998}
999
1000/// Warm start for a dispersion location-scale fit: project a link-transformed
1001/// response onto the mean block and seed the log-precision block at a constant
1002/// (precision ≈ 1) baseline. The block-cyclic IRLS then refines both jointly.
1003pub(crate) fn dispersion_location_scale_warm_start(
1004    kind: DispersionFamilyKind,
1005    y: &Array1<f64>,
1006    weights: &Array1<f64>,
1007    mean_block: &ParameterBlockSpec,
1008    disp_block: &ParameterBlockSpec,
1009    mean_beta_hint: Option<&Array1<f64>>,
1010    disp_beta_hint: Option<&Array1<f64>>,
1011) -> Result<(Array1<f64>, Array1<f64>), String> {
1012    let ridge_floor = 1e-10;
1013    let mean_beta = if let Some(beta) = mean_beta_hint {
1014        beta.clone()
1015    } else {
1016        let target = Array1::from_shape_fn(y.len(), |i| {
1017            if kind.mean_is_logit() {
1018                let yi = y[i].clamp(1e-3, 1.0 - 1e-3);
1019                (yi / (1.0 - yi)).ln()
1020            } else {
1021                // log mean link; the +0.1 keeps zero counts finite.
1022                (y[i].max(0.0) + 0.1).ln()
1023            }
1024        });
1025        solve_penalizedweighted_projection(
1026            &mean_block.design,
1027            &mean_block.offset,
1028            &target,
1029            weights,
1030            &mean_block.penalties,
1031            &mean_block.initial_log_lambdas,
1032            ridge_floor,
1033        )?
1034    };
1035    let disp_beta = if let Some(beta) = disp_beta_hint {
1036        beta.clone()
1037    } else {
1038        // Seed the precision block from a smoothed method-of-moments surface
1039        // rather than the old flat η_d=0 constant.  A single observation cannot
1040        // identify its own variance, but for the Fisher-orthogonal dispersion
1041        // members the residual-squared moment contains the correct first-order
1042        // signal:
1043        //
1044        //   Gamma:   Var(Y)=μ²/ν              ⇒ log ν     ≈ log(μ²/e²)
1045        //   NB2:     Var(Y)=μ+μ²/θ            ⇒ log θ     ≈ log(μ²/(e²-μ))
1046        //   Tweedie: Var(Y)=φ μ^p, η_d=log1/φ ⇒ η_d       ≈ log(μ^p/e²)
1047        //
1048        // The targets are deliberately conservative (finite residual floor,
1049        // precision cap, and no fixture-specific constants): they only give the
1050        // block-cyclic likelihood solve a correctly-signed non-flat starting
1051        // surface, while the final estimate is still the penalized joint MLE.
1052        let mean_eta = mean_block.design.apply(&mean_beta) + &mean_block.offset;
1053        let target = Array1::from_shape_fn(y.len(), |i| {
1054            dispersion_moment_log_precision_seed(kind, y[i], mean_eta[i])
1055        });
1056        solve_penalizedweighted_projection(
1057            &disp_block.design,
1058            &disp_block.offset,
1059            &target,
1060            weights,
1061            &disp_block.penalties,
1062            &disp_block.initial_log_lambdas,
1063            ridge_floor,
1064        )?
1065    };
1066    Ok((mean_beta, disp_beta))
1067}
1068
1069#[inline]
1070fn dispersion_moment_log_precision_seed(kind: DispersionFamilyKind, yi: f64, eta_mu: f64) -> f64 {
1071    const LOG_PRECISION_FLOOR: f64 = -10.0;
1072    const LOG_PRECISION_CEILING: f64 = 10.0;
1073    let em = eta_mu.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
1074    let raw = match kind {
1075        DispersionFamilyKind::Beta => {
1076            // Beta's mean and precision scores are not Fisher-orthogonal in
1077            // the (logit μ, log φ) parameterization.  Per-row residual moments
1078            // therefore make a poor block-cyclic seed: an outlying y near 0/1
1079            // can imply a near-zero φ and pull the coupled mean block onto the
1080            // boundary before the joint likelihood has had a chance to settle.
1081            // Keep the neutral precision seed for this one coupled member; the
1082            // exact Beta cross-Hessian below still drives the joint solve and
1083            // covariance with the coherent two-block likelihood geometry.
1084            0.0
1085        }
1086        DispersionFamilyKind::Gamma => {
1087            let mu = em.exp().max(1e-12);
1088            let e2 = (yi - mu).powi(2).max(1e-8 * mu * mu);
1089            (mu * mu / e2).max(1e-6).ln()
1090        }
1091        DispersionFamilyKind::NegativeBinomial => {
1092            let mu = em.exp().max(1e-12);
1093            let e2 = (yi - mu).powi(2);
1094            let excess = (e2 - mu).max(1e-6 * (mu + mu * mu));
1095            (mu * mu / excess).max(1e-6).ln()
1096        }
1097        DispersionFamilyKind::Tweedie { p } => {
1098            let mu = em.exp().max(1e-12);
1099            let e2 = (yi - mu).powi(2).max(1e-8 * mu.powf(p));
1100            (mu.powf(p) / e2).max(1e-6).ln()
1101        }
1102    };
1103    raw.clamp(LOG_PRECISION_FLOOR, LOG_PRECISION_CEILING)
1104}
1105
1106impl LocationScaleFamilyBuilder for DispersionGlmLocationScaleTermBuilder {
1107    type Family = DispersionGlmLocationScaleFamily;
1108
1109    fn meanspec(&self) -> &TermCollectionSpec {
1110        &self.meanspec
1111    }
1112
1113    fn noisespec(&self) -> &TermCollectionSpec {
1114        &self.noisespec
1115    }
1116
1117    fn noise_penalty_count(&self, noise_design: &TermCollectionDesign) -> usize {
1118        // Mirror the Gaussian/Binomial scale block: a full-span shrinkage
1119        // penalty pins the log-precision nullspace so REML does not optimise
1120        // the dispersion smoothing on a flat surface.
1121        noise_design.penalties.len() + 1
1122    }
1123
1124    fn build_blocks(
1125        &self,
1126        theta: &Array1<f64>,
1127        mean_design: &TermCollectionDesign,
1128        noise_design: &TermCollectionDesign,
1129        mean_beta_hint: Option<Array1<f64>>,
1130        noise_beta_hint: Option<Array1<f64>>,
1131    ) -> Result<Vec<ParameterBlockSpec>, String> {
1132        let layout = GamlssLambdaLayout::two_block(
1133            mean_design.penalties.len(),
1134            self.noise_penalty_count(noise_design),
1135        );
1136        layout.validate_theta_len(theta.len(), "dispersion location-scale")?;
1137
1138        let mut meanspec = build_location_scale_block(
1139            "mu",
1140            mean_design.design.clone(),
1141            self.mean_offset.clone(),
1142            mean_design.penalties_as_penalty_matrix(),
1143            mean_design.nullspace_dims.clone(),
1144            layout.mean_from(theta),
1145            mean_beta_hint,
1146            0,
1147            LOCATION_SCALE_N_OUTPUTS,
1148            "DispersionLocationScale::build_blocks: mu",
1149        )?;
1150
1151        let p_disp = noise_design.design.ncols();
1152        let mut disp_penalties = noise_design.penalties_as_penalty_matrix();
1153        disp_penalties.push(PenaltyMatrix::Dense(identity_penalty(p_disp)));
1154        let mut disp_nullspace = noise_design.nullspace_dims.clone();
1155        disp_nullspace.push(0);
1156        let mut dispspec = build_location_scale_block(
1157            "log_precision",
1158            noise_design.design.clone(),
1159            self.noise_offset.clone(),
1160            disp_penalties,
1161            disp_nullspace,
1162            layout.noise_from(theta),
1163            noise_beta_hint,
1164            1,
1165            LOCATION_SCALE_N_OUTPUTS,
1166            "DispersionLocationScale::build_blocks: log_precision",
1167        )?;
1168
1169        if meanspec.initial_beta.is_none() || dispspec.initial_beta.is_none() {
1170            let (mean_beta0, disp_beta0) = dispersion_location_scale_warm_start(
1171                self.kind,
1172                &self.y,
1173                &self.weights,
1174                &meanspec,
1175                &dispspec,
1176                meanspec.initial_beta.as_ref(),
1177                dispspec.initial_beta.as_ref(),
1178            )?;
1179            if meanspec.initial_beta.is_none() {
1180                meanspec.initial_beta = Some(mean_beta0);
1181            }
1182            if dispspec.initial_beta.is_none() {
1183                dispspec.initial_beta = Some(disp_beta0);
1184            }
1185        }
1186
1187        Ok(vec![meanspec, dispspec])
1188    }
1189
1190    fn build_family(
1191        &self,
1192        mean_design: &TermCollectionDesign,
1193        noise_design: &TermCollectionDesign,
1194    ) -> Self::Family {
1195        // The family stores y/weights/kind directly and does not need the
1196        // designs at construction time, but the row geometry of the offered
1197        // designs is the only cross-check that ties this family back to the
1198        // builder's data — assert it before handing the family to the engine
1199        // so a misaligned design surfaces here rather than downstream in the
1200        // inner solver.
1201        assert_eq!(
1202            mean_design.design.nrows(),
1203            self.y.len(),
1204            "DispersionGlmLocationScale::build_family: mean design row count must match y"
1205        );
1206        assert_eq!(
1207            noise_design.design.nrows(),
1208            self.y.len(),
1209            "DispersionGlmLocationScale::build_family: noise design row count must match y"
1210        );
1211        DispersionGlmLocationScaleFamily {
1212            kind: self.kind,
1213            y: self.y.clone(),
1214            weights: self.weights.clone(),
1215        }
1216    }
1217
1218    fn extract_primary_betas(
1219        &self,
1220        fit: &UnifiedFitResult,
1221    ) -> Result<(Array1<f64>, Array1<f64>), String> {
1222        let mean_beta = fit
1223            .block_states
1224            .get(DispersionGlmLocationScaleFamily::BLOCK_MEAN)
1225            .ok_or_else(|| "missing dispersion mean block state".to_string())?
1226            .beta
1227            .clone();
1228        let disp_beta = fit
1229            .block_states
1230            .get(DispersionGlmLocationScaleFamily::BLOCK_DISP)
1231            .ok_or_else(|| "missing dispersion log-precision block state".to_string())?
1232            .beta
1233            .clone();
1234        Ok((mean_beta, disp_beta))
1235    }
1236
1237    fn build_psiderivative_blocks(
1238        &self,
1239        data: ndarray::ArrayView2<'_, f64>,
1240        meanspec: &TermCollectionSpec,
1241        noisespec: &TermCollectionSpec,
1242        mean_design: &TermCollectionDesign,
1243        noise_design: &TermCollectionDesign,
1244    ) -> Result<Vec<Vec<CustomFamilyBlockPsiDerivative>>, String> {
1245        // The dispersion location-scale families have no closed-form analytic
1246        // spatial psi derivatives, and `fit_dispersion_glm_location_scale_terms`
1247        // disables the κ/ψ joint optimizer before the engine ever asks. If we
1248        // do get called (for example by a future caller that forgets the
1249        // disable), return a real diagnostic rather than a sentinel — emit the
1250        // exact data and design shape that was passed in so the bug is
1251        // diagnosable from the error string alone.
1252        Err(format!(
1253            "dispersion location-scale ({:?}) does not implement analytic spatial \
1254             psi derivatives; the κ/ψ joint optimizer must be disabled before \
1255             this builder is consulted. Called with data {n_rows}×{n_cols}, mean \
1256             spec (linear={mean_lin}, random={mean_re}, smooth={mean_sm}), noise \
1257             spec (linear={noise_lin}, random={noise_re}, smooth={noise_sm}), \
1258             mean design cols={mean_p}, noise design cols={noise_p}",
1259            self.kind,
1260            n_rows = data.nrows(),
1261            n_cols = data.ncols(),
1262            mean_lin = meanspec.linear_terms.len(),
1263            mean_re = meanspec.random_effect_terms.len(),
1264            mean_sm = meanspec.smooth_terms.len(),
1265            noise_lin = noisespec.linear_terms.len(),
1266            noise_re = noisespec.random_effect_terms.len(),
1267            noise_sm = noisespec.smooth_terms.len(),
1268            mean_p = mean_design.design.ncols(),
1269            noise_p = noise_design.design.ncols(),
1270        ))
1271    }
1272}
1273
1274/// Fit a dispersion-channel GAMLSS location-scale model (#913). All four
1275/// genuine-dispersion mean families share this single entry; the per-family
1276/// likelihood lives in [`dispersion_row_kernel`].
1277pub fn fit_dispersion_glm_location_scale_terms(
1278    data: ndarray::ArrayView2<'_, f64>,
1279    spec: DispersionGlmLocationScaleTermSpec,
1280    options: &BlockwiseFitOptions,
1281    kappa_options: &SpatialLengthScaleOptimizationOptions,
1282) -> Result<BlockwiseTermFitResult, String> {
1283    if let DispersionFamilyKind::Tweedie { p } = spec.kind {
1284        if !(p.is_finite() && p > 1.0 && p < 2.0) {
1285            return Err(format!(
1286                "Tweedie location-scale requires a variance power strictly in (1, 2); got p={p}"
1287            ));
1288        }
1289    }
1290    // The κ/ψ anisotropic-kernel joint optimizer needs analytic psi
1291    // derivatives this family does not provide; disable it so the engine runs
1292    // the full ρ REML directly via `fit_custom_family` (1-D and tensor smooth
1293    // penalties λ are still REML-selected).
1294    let mut kappa = kappa_options.clone();
1295    kappa.enabled = false;
1296    // A dispersion location-scale model is an inherently *predictable* model:
1297    // posterior-mean prediction (the response-scale predict path the CLI/FFI
1298    // drive) needs the joint `(β_μ, β_d)` posterior covariance, and so does the
1299    // reported total EDF / coefficient SEs. The block-diagonal joint Hessian is
1300    // always assembled here (`exact_newton_joint_hessian_with_specs` →
1301    // `compute_joint_covariance`, which for this family's `RidgedQuadraticReml`
1302    // outer objective uses the never-erroring SPD-retry → positive-part
1303    // pseudo-inverse), so we can — and must — request the covariance
1304    // unconditionally rather than leaving `covariance_conditional = None`
1305    // whenever the outer optimizer happens to *converge* (the only family-
1306    // independent reason NB sometimes populated covariance was that it escalated
1307    // into the never-fail posterior-sampling rung, while a cleanly-converged
1308    // Gamma/Tweedie fit took the `!options.compute_covariance ⇒ None` early
1309    // return and stranded its covariance/EDF — gam#1119). Forcing the flag here
1310    // makes all four genuine-dispersion mean families assemble the joint
1311    // covariance + EDF deterministically, exactly as a predictable model
1312    // requires.
1313    let mut options = options.clone();
1314    options.compute_covariance = true;
1315    fit_location_scale_terms(
1316        data,
1317        DispersionGlmLocationScaleTermBuilder {
1318            kind: spec.kind,
1319            y: spec.y,
1320            weights: spec.weights,
1321            meanspec: spec.meanspec,
1322            noisespec: spec.log_dispspec,
1323            mean_offset: spec.mean_offset,
1324            noise_offset: spec.log_disp_offset,
1325        },
1326        &options,
1327        &kappa,
1328    )
1329}
1330
1331#[cfg(test)]
1332mod tests {
1333    use super::*;
1334    use super::test_support::{dispersion_gamma_nll_order2, dispersion_nb_nll_order2};
1335    use crate::gamlss::test_support::dispersion_tweedie_nll_generic;
1336
1337    pub(crate) fn beta_fisher_cross_info_mu_phi(mu: f64, phi: f64) -> f64 {
1338        let a = mu * phi;
1339        let b = (1.0 - mu) * phi;
1340        phi * (mu * gam_math::jet_tower::trigamma_derivative_stack(a)[0]
1341            - (1.0 - mu) * gam_math::jet_tower::trigamma_derivative_stack(b)[0])
1342    }
1343
1344    pub(crate) fn assert_close(label: &str, got: f64, want: f64, tol: f64) {
1345        assert!(
1346            (got - want).abs() <= tol,
1347            "{label}: got {got:.12e}, want {want:.12e}, |diff|={:.3e}",
1348            (got - want).abs()
1349        );
1350    }
1351
1352    #[test]
1353    pub(crate) fn beta_tower_mixed_channel_matches_cross_information_formula() {
1354        let mu = 0.1;
1355        let phi = 10.0;
1356        let a = mu * phi;
1357        let b = (1.0 - mu) * phi;
1358        let digamma_a = gam_math::jet_tower::digamma_derivative_stack(a)[0];
1359        let digamma_b = gam_math::jet_tower::digamma_derivative_stack(b)[0];
1360        let score_neutral_y = 1.0 / (1.0 + (-(digamma_a - digamma_b)).exp());
1361
1362        let tower = dispersion_beta_nll_order2(score_neutral_y, mu, phi, 1.0);
1363        let trigamma_a = std::f64::consts::PI * std::f64::consts::PI / 6.0;
1364        let trigamma_b = gam_math::jet_tower::trigamma_derivative_stack(b)[0];
1365        let analytic = phi * (mu * trigamma_a - (1.0 - mu) * trigamma_b);
1366        let helper = beta_fisher_cross_info_mu_phi(mu, phi);
1367
1368        assert!(
1369            analytic > 0.58,
1370            "audit example should have visibly nonzero cross information, got {analytic}"
1371        );
1372        assert_close("helper cross information", helper, analytic, 1e-12);
1373        assert_close("tower mixed channel", tower.h()[0][1], analytic, 1e-8);
1374
1375        let q = mu * (1.0 - mu);
1376        let eta_cross = beta_observed_cross_weight_eta(score_neutral_y, mu, phi, 1.0);
1377        assert_close(
1378            "eta-scale cross weight",
1379            eta_cross,
1380            q * phi * analytic,
1381            1e-8,
1382        );
1383    }
1384
1385    /// #932 oracle: the production `Order2<2>` evaluation of each dispersion
1386    /// row NLL must reproduce, channel-for-channel (value/grad/Hessian), the
1387    /// dense `Tower4<2>` evaluation of the same row expression.
1388    #[test]
1389    pub(crate) fn order2_matches_dense_tower_all_channels() {
1390        use gam_math::jet_scalar::{JetScalar, Order2};
1391        use gam_math::jet_tower::Tower4;
1392
1393        fn check_o2_vs_tower4(label: &str, o2: Order2<2>, t4: Tower4<2>) {
1394            let band = |a: f64, b: f64| 1e-9 + 1e-9 * a.abs().max(b.abs());
1395            assert!(
1396                (o2.value() - t4.v).abs() <= band(o2.value(), t4.v),
1397                "{label} value: {} vs {}",
1398                o2.value(),
1399                t4.v
1400            );
1401            for a in 0..2 {
1402                assert!(
1403                    (o2.g()[a] - t4.g[a]).abs() <= band(o2.g()[a], t4.g[a]),
1404                    "{label} grad[{a}]: {} vs {}",
1405                    o2.g()[a],
1406                    t4.g[a]
1407                );
1408                for b in 0..2 {
1409                    assert!(
1410                        (o2.h()[a][b] - t4.h[a][b]).abs() <= band(o2.h()[a][b], t4.h[a][b]),
1411                        "{label} hess[{a}][{b}]: {} vs {}",
1412                        o2.h()[a][b],
1413                        t4.h[a][b]
1414                    );
1415                }
1416            }
1417        }
1418
1419        let wi = 1.7_f64;
1420        // NB2: (μ, θ).
1421        for &(yi, mu, theta) in &[(0.0, 1.2, 3.0), (4.0, 2.5, 0.7), (10.0, 0.6, 5.0)] {
1422            check_o2_vs_tower4(
1423                "nb",
1424                dispersion_nb_nll_order2(yi, mu, theta, wi),
1425                test_support::dispersion_nb_nll_generic::<Tower4<2>>(yi, mu, theta, wi),
1426            );
1427        }
1428        // Gamma: (μ, ν).
1429        for &(yi, mu, nu) in &[
1430            (0.5_f64, 1.1_f64, 2.0_f64),
1431            (3.0, 4.0, 0.9),
1432            (1.0, 0.3, 6.0),
1433        ] {
1434            let y_pos = yi.max(1e-300);
1435            check_o2_vs_tower4(
1436                "gamma",
1437                dispersion_gamma_nll_order2(yi, y_pos, mu, nu, wi),
1438                test_support::dispersion_gamma_nll_generic::<Tower4<2>>(yi, y_pos, mu, nu, wi),
1439            );
1440        }
1441        // Beta: (μ, φ).
1442        for &(yi, mu, phi) in &[(0.3, 0.4, 5.0), (0.9, 0.6, 12.0), (0.01, 0.2, 3.0)] {
1443            check_o2_vs_tower4(
1444                "beta",
1445                dispersion_beta_nll_order2(yi, mu, phi, wi),
1446                test_support::dispersion_beta_nll_generic::<Tower4<2>>(yi, mu, phi, wi),
1447            );
1448        }
1449        // Tweedie: (η_μ, η_d), both density branches.
1450        for &(yi, eta_mu, eta_d, p) in &[
1451            (0.0, 0.4, -0.3, 1.5),
1452            (2.5, -0.2, 0.5, 1.3),
1453            (0.0, 1.0, 0.1, 1.7),
1454            (5.0, 0.7, -0.6, 1.6),
1455        ] {
1456            check_o2_vs_tower4(
1457                "tweedie",
1458                dispersion_tweedie_nll_generic::<Order2<2>>(yi, eta_mu, eta_d, p, wi),
1459                dispersion_tweedie_nll_generic::<Tower4<2>>(yi, eta_mu, eta_d, p, wi),
1460            );
1461        }
1462    }
1463
1464    /// #1591 prune oracle: the pruned single-axis (`K=1`) dispersion towers
1465    /// reproduce, `to_bits`-exactly, the CONSUMED channels (`value`, dispersion-
1466    /// axis `g`/`h`) of the full `Order2<2>` towers — across ≥2000 randomized
1467    /// rows per family (both Tweedie density branches), including the η-clamp
1468    /// boundary. This is the bit-identity guarantee that the K-prune changes no
1469    /// observable float.
1470    #[test]
1471    pub(crate) fn pruned_disp_towers_bit_identical_to_full_order2() {
1472        use gam_math::jet_scalar::{JetScalar, Order2};
1473
1474        // Deterministic LCG so the sweep is reproducible without an rng dep.
1475        let mut state: u64 = 0x9E3779B97F4A7C15;
1476        let mut next = || {
1477            state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
1478            ((state >> 11) as f64) / ((1u64 << 53) as f64)
1479        };
1480        let bits = |x: f64| x.to_bits();
1481
1482        let n_per = 600; // 600 rows × 4 families (Tweedie ×2 branches) > 2000.
1483        for _ in 0..n_per {
1484            let wi = 0.25 + 3.0 * next();
1485            let yi_count = (next() * 12.0).floor();
1486
1487            // NB: full O2<2> seeds (μ, θ); pruned seeds θ only.
1488            {
1489                let mu = (0.05 + 4.0 * next()).max(1e-300);
1490                let theta = (0.05 + 6.0 * next()).max(1e-12);
1491                let full = dispersion_nb_nll_order2(yi_count, mu, theta, wi);
1492                let prn = dispersion_nb_disp_order2(yi_count, mu, theta, wi);
1493                assert_eq!(bits(full.value()), bits(prn.value()), "nb value");
1494                assert_eq!(bits(full.g()[1]), bits(prn.g()[0]), "nb grad");
1495                assert_eq!(bits(full.h()[1][1]), bits(prn.h()[0][0]), "nb hess");
1496                // value-only path == -tower.value(), bit-for-bit.
1497                assert_eq!(
1498                    bits(dispersion_nb_neg_loglik(yi_count, mu, theta, wi)),
1499                    bits(-prn.value()),
1500                    "nb value-only"
1501                );
1502            }
1503            // Gamma: seeds (μ, ν) / ν.
1504            {
1505                let mu = (0.05 + 4.0 * next()).max(1e-300);
1506                let nu = (0.05 + 6.0 * next()).max(1e-12);
1507                let yi = 0.01 + 8.0 * next();
1508                let y_pos = yi.max(1e-300);
1509                let full = dispersion_gamma_nll_order2(yi, y_pos, mu, nu, wi);
1510                let prn = dispersion_gamma_disp_order2(yi, y_pos, mu, nu, wi);
1511                assert_eq!(bits(full.value()), bits(prn.value()), "gamma value");
1512                assert_eq!(bits(full.g()[1]), bits(prn.g()[0]), "gamma grad");
1513                assert_eq!(bits(full.h()[1][1]), bits(prn.h()[0][0]), "gamma hess");
1514                assert_eq!(
1515                    bits(dispersion_gamma_neg_loglik(yi, y_pos, mu, nu, wi)),
1516                    bits(-prn.value()),
1517                    "gamma value-only"
1518                );
1519            }
1520            // Beta value-only path vs full K=2 tower value.
1521            {
1522                let mu = (1e-6 + (1.0 - 2e-6) * next()).clamp(1e-12, 1.0 - 1e-12);
1523                let phi = (0.05 + 20.0 * next()).max(1e-12);
1524                let yi = next();
1525                let full = dispersion_beta_nll_order2(yi, mu, phi, wi);
1526                assert_eq!(
1527                    bits(dispersion_beta_neg_loglik(yi, mu, phi, wi)),
1528                    bits(-full.value()),
1529                    "beta value-only"
1530                );
1531            }
1532            // Tweedie: seeds (η_μ, η_d) / η_d, both density branches & clamp edge.
1533            for &(yi, eta_mu, eta_d, p) in &[
1534                (0.0_f64, -4.0 + 8.0 * next(), -4.0 + 8.0 * next(), 1.1 + 0.8 * next()),
1535                (0.01 + 9.0 * next(), -4.0 + 8.0 * next(), -4.0 + 8.0 * next(), 1.1 + 0.8 * next()),
1536                (3.0, -DISPERSION_ETA_CLAMP - 5.0, DISPERSION_ETA_CLAMP + 5.0, 1.5),
1537            ] {
1538                let em = eta_mu.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
1539                let ed = eta_d.clamp(-DISPERSION_ETA_CLAMP, DISPERSION_ETA_CLAMP);
1540                let full = dispersion_tweedie_nll_generic::<Order2<2>>(yi, em, ed, p, wi);
1541                let prn = dispersion_tweedie_disp_order2(yi, em, ed, p, wi);
1542                assert_eq!(bits(full.value()), bits(prn.value()), "tweedie value");
1543                assert_eq!(bits(full.g()[1]), bits(prn.g()[0]), "tweedie grad");
1544                assert_eq!(bits(full.h()[1][1]), bits(prn.h()[0][0]), "tweedie hess");
1545                assert_eq!(
1546                    bits(dispersion_tweedie_neg_loglik(yi, em, ed, p, wi)),
1547                    bits(-prn.value()),
1548                    "tweedie value-only"
1549                );
1550            }
1551        }
1552    }
1553
1554    #[test]
1555    pub(crate) fn orthogonal_dispersion_families_report_zero_cross_weight() {
1556        let cases = [
1557            DispersionFamilyKind::NegativeBinomial,
1558            DispersionFamilyKind::Gamma,
1559            DispersionFamilyKind::Tweedie { p: 1.5 },
1560        ];
1561        for kind in cases {
1562            let got = dispersion_row_cross_weight(kind, 1.25, 0.2, -0.3, 2.0);
1563            assert_close(kind.family_tag(), got, 0.0, 1e-12);
1564        }
1565    }
1566}