// ndarray_glm — src/fit.rs
1//! Stores the fit results of the IRLS regression and provides functions that
2//! depend on the MLE estimate. These include statistical tests for goodness-of-fit.
3
4pub mod options;
5#[cfg(feature = "stats")]
6use crate::response::Response;
7use crate::{
8    Linear,
9    data::{Dataset, one_pad},
10    error::RegressionResult,
11    glm::{DispersionType, Glm},
12    irls::{Irls, IrlsStep},
13    link::{Link, Transform},
14    model::Model,
15    num::Float,
16    regularization::IrlsReg,
17};
18use ndarray::{Array1, Array2, ArrayBase, ArrayView1, Axis, Data, Ix2, array, s};
19use ndarray_linalg::InverseInto;
20use once_cell::unsync::OnceCell; // can be replaced by std::cell::OnceCell upon stabilization
21use options::FitOptions;
22#[cfg(feature = "stats")]
23use statrs::distribution::ContinuousCDF;
24use std::marker::PhantomData;
25
/// The result of a successful GLM fit: the MLE coefficients plus lazily-computed,
/// cached statistics derived from them (dispersion, residuals, covariance, hat matrix, ...).
pub struct Fit<'a, M, F>
where
    M: Glm,
    F: Float,
{
    /// Zero-sized marker tying this fit to the GLM family `M`.
    model: PhantomData<M>,
    /// The data and model specification used in the fit.
    data: &'a Dataset<F>,
    /// The parameter values that maximize the likelihood as given by the IRLS regression. If the
    /// dataset was internally standardized, this is transformed back to the external
    /// (non-standardized) scale of the input data.
    pub result: Array1<F>,
    /// The parameter values used internally for the standardized data.
    result_std: Array1<F>,
    /// The linear predictor of the X training data given the fit result. Note that it is invariant
    /// under any standardization scaling, since it is an inner product between X and beta.
    x_lin_pred: Array1<F>,
    /// The predicted y-values for the training data.
    y_hat: Array1<F>,
    /// The options used for this fit.
    pub options: FitOptions<F>,
    /// The value of the likelihood function for the fit result.
    pub model_like: F,
    /// The regularizer of the fit.
    reg: Box<dyn IrlsReg<F>>,
    /// The number of overall iterations taken in the IRLS.
    pub n_iter: usize,
    /// The history of guesses and likelihoods over the IRLS iterations.
    pub history: Vec<IrlsStep<F>>,
    /// The number of parameters.
    n_par: usize,
    // The remaining fields hold cached results and matrices, populated on first access.
    /// The estimated dispersion parameter, which is used in many places. For some families this
    /// is just one.
    phi: OnceCell<F>,
    /// The deviance terms for each observation; used frequently and in several places.
    dev_terms: OnceCell<Array1<F>>,
    /// The Pearson residuals, a common statistic that is re-used in several other quantities.
    resid_pear: OnceCell<Array1<F>>,
    /// The covariance matrix, using the sandwich approach.
    covariance: OnceCell<Array2<F>>,
    /// The inverse of the regularized Fisher information matrix in the external non-standardized
    /// parameter basis. Used as a component of the sandwich covariance and for influence
    /// calculations. Cached on first access.
    fisher_inv: OnceCell<Array2<F>>,
    /// The inverse of the fisher matrix in standardized internal parameters.
    fisher_std_inv: OnceCell<Array2<F>>,
    /// The hat matrix of the data and fit. Since the calculation requires a matrix inversion of
    /// the fisher information, it is computed only when needed and the value is cached. Access
    /// through the `hat()` function.
    hat: OnceCell<Array2<F>>,
    /// The likelihood and parameters for the null model.
    null_model: OnceCell<(F, Array1<F>)>,
}
80
81impl<'a, M, F> Fit<'a, M, F>
82where
83    M: Glm,
84    F: 'static + Float,
85{
86    /// Returns the Akaike information criterion for the model fit.
87    ///
88    /// ```math
89    /// \text{AIC} = D + 2K - 2\sum_{i} \ln w_{i}
90    /// ```
91    ///
92    /// where $`D`$ is the deviance, $`K`$ is the number of parameters, and $`w_i`$ are the
93    /// variance weights.
94    /// This is unique only to an additive constant, so only differences in AIC are meaningful.
95    pub fn aic(&self) -> F {
96        let log_weights = self.data.get_variance_weights().mapv(num_traits::Float::ln);
97        let sum_log_weights = self.data.freq_sum(&log_weights);
98        // NOTE: This is now the unregularized deviance.
99        self.deviance() + F::two() * self.rank() - F::two() * sum_log_weights
100    }
101
102    /// Returns the Bayesian information criterion for the model fit.
103    ///
104    /// ```math
105    /// \text{BIC} = K \ln(n) - 2l - 2\sum_{i} \ln w_{i}
106    /// ```
107    ///
108    /// where $`K`$ is the number of parameters, $`n`$ is the number of observations, and $`l`$ is
109    /// the log-likelihood (including the variance weight normalization terms).
110    // TODO: Also consider the effect of regularization on this statistic.
111    // TODO: Wikipedia suggests that the variance should included in the number
112    // of parameters for multiple linear regression. Should an additional
113    // parameter be included for the dispersion parameter? This question does
114    // not affect the difference between two models fit with the methodology in
115    // this package.
116    pub fn bic(&self) -> F {
117        let log_weights = self.data.get_variance_weights().mapv(num_traits::Float::ln);
118        let sum_log_weights = self.data.freq_sum(&log_weights);
119        let logn = num_traits::Float::ln(self.data.n_obs());
120        logn * self.rank() - F::two() * self.model_like - F::two() * sum_log_weights
121    }
122
123    /// The Cook's distance for each observation, which measures how much the predicted values
124    /// change when leaving out each observation.
125    ///
126    /// ```math
127    /// C_i = \frac{r_i^2 \, h_i}{K \, \hat\phi \, (1 - h_i)^2}
128    /// ```
129    ///
130    /// where $`r_i`$ is the Pearson residual, $`h_i`$ is the leverage, $`K`$ is the rank
131    /// (number of parameters), and $`\hat\phi`$ is the estimated dispersion.
132    pub fn cooks(&self) -> RegressionResult<Array1<F>, F> {
133        let hat = self.leverage()?;
134        let pear_sq = self.resid_pear().mapv(|r| r * r);
135        let h_terms: Array1<F> = hat.mapv_into(|h| {
136            let omh = F::one() - h;
137            h / (omh * omh)
138        });
139        let denom: F = self.rank() * self.dispersion();
140        Ok(pear_sq * h_terms / denom)
141    }
142
    /// The covariance matrix of the parameter estimates. When no regularization is used, this is:
    ///
    /// ```math
    /// \text{Cov}[\hat{\boldsymbol\beta}] = \hat\phi \, (\mathbf{X}^\mathsf{T}\mathbf{WSX})^{-1}
    /// ```
    ///
    /// When regularization is active, the sandwich form is used to correctly account for the bias
    /// introduced by the penalty:
    ///
    /// ```math
    /// \text{Cov}[\hat{\boldsymbol\beta}] = \hat\phi \, \mathcal{I}_\text{reg}^{-1} \, \mathcal{I}_\text{data} \, \mathcal{I}_\text{reg}^{-1}
    /// ```
    ///
    /// where $`\mathcal{I}_\text{reg}`$ is the regularized Fisher information and
    /// $`\mathcal{I}_\text{data}`$ is the unregularized (data-only) Fisher information.
    /// When unregularized, $`\mathcal{I}_\text{reg} = \mathcal{I}_\text{data}`$ and this reduces
    /// to the standard form. The result is cached on first access.
    pub fn covariance(&self) -> RegressionResult<&Array2<F>, F> {
        self.covariance.get_or_try_init(|| {
            // The covariance must be multiplied by the dispersion parameter.
            // For logistic/poisson regression, this is identically 1.
            // For linear/gamma regression it is estimated from the data.
            let phi: F = self.dispersion();
            // Inverse of the regularized Fisher information in the external basis (cached
            // elsewhere; owned copy needed for the matrix products below).
            let f_reg_inv: Array2<F> = self.fisher_inv()?.to_owned();
            // Use the sandwich form so that regularization doesn't artificially deflate uncertainty.
            // When unregularized, F_reg = F_data and this reduces to F_data^{-1}.
            // The unregularized fisher matrix is most easily acquired in terms of the standardized
            // variables, so apply the external transformation to it.
            let f_data = self
                .data
                .inverse_transform_fisher(self.fisher_data_std(&self.result_std));
            Ok(f_reg_inv.dot(&f_data).dot(&f_reg_inv) * phi)
        })
    }
177
178    /// Returns the deviance of the fit:
179    ///
180    /// ```math
181    /// D = -2 \left[ l(\hat{\boldsymbol\beta}) - l_\text{sat} \right]
182    /// ```
183    ///
184    /// Asymptotically $`\chi^2`$-distributed with [`ndf()`](Self::ndf) degrees of freedom.
185    /// The unregularized likelihood is used.
186    pub fn deviance(&self) -> F {
187        let terms = self.deviance_terms();
188        self.data.freq_sum(terms)
189    }
190
191    /// Returns the contribution to the deviance from each observation. The total deviance should
192    /// be the sum of all of these. Variance weights are already included, but not frequency
193    /// weights.
194    fn deviance_terms(&self) -> &Array1<F> {
195        self.dev_terms.get_or_init(|| {
196            let ll_terms: Array1<F> = M::log_like_terms(self.data, &self.result_std);
197            let ll_sat: Array1<F> = self.data.y.mapv(M::log_like_sat);
198            let terms = (ll_sat - ll_terms) * F::two();
199            self.data.apply_var_weights(terms)
200        })
201    }
202
203    /// Returns the self-excluded deviance terms, i.e. the deviance of an observation as if the
204    /// model was fit without it. This is a one-step approximation.
205    fn deviance_terms_loo(&self) -> RegressionResult<Array1<F>, F> {
206        let dev_terms = self.deviance_terms();
207        let pear_sq = self.resid_pear().mapv(|r| r * r);
208        let hat_rat = self.leverage()?.mapv(|h| h / (F::one() - h));
209        let result = dev_terms + &hat_rat * (&hat_rat + F::two()) * pear_sq;
210        Ok(result)
211    }
212
    /// The dispersion parameter $`\hat\phi`$ relating the variance to the variance function:
    /// $`\text{Var}[y] = \phi \, V(\mu)`$.
    ///
    /// Identically one for logistic, binomial, and Poisson regression.
    /// For families with a free dispersion (linear, gamma), estimated as:
    ///
    /// ```math
    /// \hat\phi = \frac{D}{\left(1 - \frac{K}{n_\text{eff}}\right) \sum_i w_i}
    /// ```
    ///
    /// which reduces to $`D / (N - K)`$ without variance weights.
    /// The value is computed once and cached.
    pub fn dispersion(&self) -> F {
        *self.phi.get_or_init(|| {
            use DispersionType::*;
            match M::DISPERSED {
                FreeDispersion => {
                    let dev = self.deviance();
                    let p = self.rank();
                    let n_eff = self.data.n_eff();
                    let scaling = if p >= n_eff {
                        // This is the overparameterized regime, which is checked directly instead of
                        // allowing negative values. It's not clear what conditions result in this when
                        // p < N.
                        // NOTE(review): a zero scaling makes `dev / scaling` infinite (or NaN if the
                        // deviance is zero) — confirm that an infinite dispersion is the intended
                        // signal for overparameterization.
                        F::zero()
                    } else {
                        (F::one() - p / n_eff) * self.data.sum_weights()
                    };
                    dev / scaling
                }
                // Families like logistic/Poisson have dispersion fixed to one.
                NoDispersion => F::one(),
            }
        })
    }
246
    /// Return the dispersion terms with the observation(s) at each point excluded from the fit,
    /// using a one-step (leverage-based) approximation rather than exact refits.
    fn dispersion_loo(&self) -> RegressionResult<Array1<F>, F> {
        use DispersionType::*;
        match M::DISPERSED {
            FreeDispersion => {
                // Squared Pearson residuals and the leverage ratio h / (1 - h).
                let pear_sq = self.resid_pear().mapv(|r| r * r);
                let hat_rat = self.leverage()?.mapv(|h| h / (F::one() - h));
                // One-step-corrected deviance contribution of each observation.
                let terms = self.deviance_terms() + hat_rat * pear_sq;
                // Don't apply total weights since the variance weights are already
                // included in the residual terms. However, we do need the frequency weights.
                let terms = self.data.apply_freq_weights(terms);
                // Total deviance with each observation's own contribution removed.
                let total: Array1<F> = -terms + self.deviance();
                // Scale by the leave-one-out degrees of freedom, which depends on the
                // weighting scheme in use.
                let scaled_total: Array1<F> = match &self.data.weights {
                    None => match &self.data.freqs {
                        // Excluding observation i removes f_i effective observations,
                        // so the residual dof becomes ndf - f_i.
                        Some(f) => total / -(f - self.ndf()),
                        None => total / (self.ndf() - F::one()),
                    },
                    Some(w) => {
                        // v1 and v2 are the weighted first and second moments of the weights.
                        let v1 = self.data.freq_sum(w);
                        let w2 = w * w;
                        let v2 = self.data.freq_sum(&w2);
                        // The subtracted out terms need the frequency terms as well
                        let f_w = self.data.apply_freq_weights(w.clone());
                        let f_w2 = self.data.apply_freq_weights(w2);
                        // the modified sums from leaving out the ith observation
                        let v1p = -f_w + v1;
                        let v2p = -f_w2 + v2;
                        let p = self.rank();
                        // Effective leave-one-out degrees of freedom with variance weights.
                        let scale = &v1p - v2p / &v1p * p;
                        total / scale
                    }
                };
                Ok(scaled_total)
            }
            // Fixed-dispersion families: the dispersion is one regardless of exclusions.
            NoDispersion => Ok(Array1::<F>::ones(self.data.y.len())),
        }
    }
284
    /// Returns the Fisher information (the negative Hessian of the log-likelihood) at the
    /// parameter values given:
    ///
    /// ```math
    /// \mathcal{I}(\boldsymbol\beta) = \mathbf{X}^\mathsf{T}\mathbf{W}\eta'^2\mathbf{S}\mathbf{X}
    /// ```
    ///
    /// where $`\mathbf{S} = \text{diag}(V(\mu_i))`$ and $`\eta'`$ is the derivative of the natural
    /// parameter in terms of the linear predictor ($`\eta(\omega) = g_0(g^{-1}(\omega))`$ where
    /// $`g_0`$ is the canonical link function).
    /// The regularization is included.
    pub fn fisher(&self, params: &Array1<F>) -> Array2<F> {
        // Note that fisher() is a public function so it should take the external parameters.
        // First map them into the internal (standardized) basis.
        let params = self.data.transform_beta(params.clone());
        // We actually need to further apply the transformation beyond this, so that *arrays
        // multiplying this resulting matrix* are hit the right way.
        let fish_std = self.fisher_std(&params);
        self.data.inverse_transform_fisher(fish_std)
    }
304
    /// The inverse of the (regularized) fisher information matrix, in the external parameter
    /// basis. Used for the influence calculations, and as a component of the sandwich covariance.
    /// Cached on first access. Returns an error if the matrix inversion fails.
    fn fisher_inv(&self) -> RegressionResult<&Array2<F>, F> {
        self.fisher_inv.get_or_try_init(|| {
            // Regularized Fisher information evaluated at the external-basis result.
            let fisher_reg = self.fisher(&self.result);
            // NOTE: invh/invh_into() are bugged and incorrect, as they are sensitive to the layout.
            // Even with X in column-major order, the fisher matrix is in the space of the
            // covariates, and seems to naturally emerge as row-major. The bug could be avoided
            // with:
            // let fish_inv = fisher_reg.t().to_owned().invh_into()?;
            // but this also allocates again so let's not worry about this yet.
            let fish_inv = fisher_reg.inv_into()?;
            Ok(fish_inv)
        })
    }
321
322    /// Compute the data-only (unregularized) Fisher information in the internal standardized basis.
323    fn fisher_data_std(&self, params: &Array1<F>) -> Array2<F> {
324        let lin_pred: Array1<F> = self.data.linear_predictor_std(params);
325        let adj_var: Array1<F> = M::adjusted_variance_diag(&lin_pred);
326        (self.data.x_conj() * &adj_var).dot(&self.data.x)
327    }
328
329    /// Compute the fisher information in terms of the internal (likely standardized)
330    /// coefficients. The regularization is included here.
331    fn fisher_std(&self, params: &Array1<F>) -> Array2<F> {
332        let fisher = self.fisher_data_std(params);
333        self.reg.as_ref().irls_mat(fisher, params)
334    }
335
    /// The inverse of the (regularized) fisher information matrix, in the internal standardized
    /// parameter basis. Cached on first access; returns an error if the inversion fails.
    fn fisher_std_inv(&self) -> RegressionResult<&Array2<F>, F> {
        self.fisher_std_inv.get_or_try_init(|| {
            let fisher_reg = self.fisher_std(&self.result_std);
            // NOTE: invh/invh_into() are bugged and incorrect! See fisher_inv() for details.
            let fish_inv = fisher_reg.inv_into()?;
            Ok(fish_inv)
        })
    }
346
    /// Returns the hat matrix, also known as the "projection" or "influence" matrix:
    ///
    /// ```math
    /// P_{ij} = \frac{\partial \hat{y}_i}{\partial y_j}
    /// ```
    ///
    /// Orthogonal to the response residuals at the fit result: $`\mathbf{P}(\mathbf{y} -
    /// \hat{\mathbf{y}}) = 0`$.
    /// This version is not symmetric, but the diagonal is invariant to this choice of convention.
    /// The result is cached after the first successful computation.
    pub fn hat(&self) -> RegressionResult<&Array2<F>, F> {
        self.hat.get_or_try_init(|| {
            // Do the full computation in terms of the internal parameters, since this observable
            // is not sensitive to the choice of basis.

            // Apply the eta' terms manually instead of calling adjusted_variance_diag, because the
            // adjusted variance method applies 2 powers to the variance, while we want one power
            // to the variance and one to the weights.
            // let adj_var = M::adjusted_variance_diag(&self.x_lin_pred);

            // Family variance function V(mu) evaluated at the fitted means.
            let var = self.y_hat.clone().mapv_into(M::variance);
            // Derivative of the natural parameter with respect to the linear predictor.
            let eta_d = M::Link::d_nat_param(&self.x_lin_pred);

            // Inverse Fisher information in the standardized basis; may fail on singularity.
            let fisher_inv = self.fisher_std_inv()?;

            // the GLM variance and the data weights are put on different sides in this convention
            let left = (var * &eta_d).insert_axis(Axis(1)) * &self.data.x;
            let right = self.data.x_conj() * &eta_d;
            Ok(left.dot(&fisher_inv.dot(&right)))
        })
    }
377
378    /// The one-step approximation to the change in coefficients from excluding each observation:
379    ///
380    /// ```math
381    /// \Delta\boldsymbol\beta^{(-i)} \approx \frac{1}{1-h_i}\,\mathcal{I}^{-1}\mathbf{x}^{(i)} w_i \eta'_i e_i
382    /// ```
383    ///
384    /// Each row $`i`$ should be subtracted from $`\hat{\boldsymbol\beta}`$ to approximate
385    /// the coefficients that would result from excluding observation $`i`$.
386    /// Exact for linear models; a one-step approximation for nonlinear models.
387    pub fn infl_coef(&self) -> RegressionResult<Array2<F>, F> {
388        // The linear predictor can be acquired in terms of the standardized parameters, but the
389        // rest of the computation should use external.
390        let resid_resp = self.resid_resp();
391        let omh = -self.leverage()? + F::one();
392        let resid_adj = M::Link::adjust_errors(resid_resp, &self.x_lin_pred) / omh;
393        let xte = self.data.x_conj_ext() * resid_adj;
394        let fisher_inv = self.fisher_inv()?;
395        let delta_b = xte.t().dot(fisher_inv);
396        Ok(delta_b)
397    }
398
399    /// Returns the leverage $`h_i = P_{ii}`$ for each observation: the diagonal of the hat matrix.
400    /// Indicates the sensitivity of each prediction to its corresponding observation.
401    pub fn leverage(&self) -> RegressionResult<Array1<F>, F> {
402        let hat = self.hat()?;
403        Ok(hat.diag().to_owned())
404    }
405
    /// Returns exact coefficients from leaving each observation out, one-at-a-time.
    /// This is a much more expensive operation than the original regression because a new one is
    /// performed for each observation. Each row of the returned matrix holds the coefficients
    /// (in the external basis) from excluding the corresponding observation.
    pub fn loo_exact(&self) -> RegressionResult<Array2<F>, F> {
        // Use the one-step approximation to get good starting points for each exclusion.
        let loo_coef: Array2<F> = self.infl_coef()?;
        // NOTE: These coefficients need to be in terms of external parameters
        let loo_initial = &self.result - loo_coef;
        let mut loo_result = loo_initial.clone();
        let n_obs = self.data.y.len();
        for i in 0..n_obs {
            // Build a dataset identical to the original except with observation i's
            // frequency weight zeroed out.
            let data_i: Dataset<F> = {
                // Get the proper X data matrix as it existed before standardization and
                // intercept-padding.
                let x_i = self.data.x_orig();
                // Leave the observation out by setting the frequency weight to zero.
                let mut freqs_i: Array1<F> =
                    self.data.freqs.clone().unwrap_or(Array1::<F>::ones(n_obs));
                freqs_i[i] = F::zero();
                let mut data_i = Dataset {
                    y: self.data.y.clone(),
                    x: x_i,
                    linear_offset: self.data.linear_offset.clone(),
                    weights: self.data.weights.clone(),
                    freqs: Some(freqs_i),
                    // These fields must be set this way, as they are in the ModelBuilder, before
                    // finalize_design_matrix() is called.
                    has_intercept: false,
                    standardizer: None,
                };
                data_i.finalize_design_matrix(
                    self.data.standardizer.is_some(),
                    self.data.has_intercept,
                );
                data_i
            };
            let model_i = Model {
                model: PhantomData::<M>,
                data: data_i,
            };
            let options = {
                let mut options = self.options.clone();
                // The one-step approximation should be a good starting point.
                // Use the external result as it should be re-standardized to the new dataset
                // internally before being passed to IRLS.
                options.init_guess = Some(loo_initial.row(i).to_owned());
                options
            };
            // Re-fit the model without observation i; propagate any regression error.
            let fit_i = model_i.with_options(options).fit()?;
            // The internal re-fit transforms back to the external scale.
            loo_result.row_mut(i).assign(&fit_i.result);
        }
        Ok(loo_result)
    }
460
461    /// Perform a likelihood-ratio test, returning the statistic:
462    ///
463    /// ```math
464    /// \Lambda = -2 \ln \frac{L_0}{L} = -2(l_0 - l)
465    /// ```
466    ///
467    /// where $`L_0`$ is the null model likelihood (intercept only) and $`L`$ is the fit likelihood.
468    /// By Wilks' theorem, asymptotically $`\chi^2`$-distributed with
469    /// [`test_ndf()`](Self::test_ndf) degrees of freedom.
470    // TODO: Should the effective number of degrees of freedom due to
471    // regularization be taken into account?
472    pub fn lr_test(&self) -> F {
473        // The model likelihood should include regularization terms and there
474        // shouldn't be any in the null model with all non-intercept parameters
475        // set to zero.
476        let null_like = self.null_like();
477        F::from(-2.).unwrap() * (null_like - self.model_like)
478    }
479
480    /// Perform a likelihood-ratio test against a general alternative model, not
481    /// necessarily a null model. The alternative model is regularized the same
482    /// way that the regression resulting in this fit was. The degrees of
483    /// freedom cannot be generally inferred.
484    pub fn lr_test_against(&self, alternative: &Array1<F>) -> F {
485        let alt_std = self.data.transform_beta(alternative.clone());
486        let alt_like = M::log_like(self.data, &alt_std);
487        let alt_like_reg = alt_like + self.reg.likelihood(&alt_std);
488        F::two() * (self.model_like - alt_like_reg)
489    }
490
491    /// Returns the residual degrees of freedom in the model, i.e. the number
492    /// of data points minus the number of parameters. Not to be confused with
493    /// `test_ndf()`, the degrees of freedom in the statistical tests of the
494    /// fit parameters.
495    pub fn ndf(&self) -> F {
496        self.data.n_obs() - self.rank()
497    }
498
    /// Assemble a `Fit` from the optimal IRLS step and the final IRLS state.
    /// Transforms the result and the iteration history back to the external
    /// (non-standardized) scale and initializes the lazily-computed caches.
    pub(crate) fn new(data: &'a Dataset<F>, optimum: IrlsStep<F>, irls: Irls<M, F>) -> Self {
        let IrlsStep {
            guess: result_std,
            like: model_like,
        } = optimum;
        let Irls {
            options,
            reg,
            n_iter,
            history,
            ..
        } = irls;
        // Cache some of these variables that will be used often.
        let n_par = result_std.len();
        // NOTE: This necessarily uses the coefficients directly from the standardized data.
        // Store the linear predictor and the corresponding y values as they are commonly used.
        let x_lin_pred = data.linear_predictor_std(&result_std);
        let y_hat = M::mean(&x_lin_pred);
        // The public result must be transformed back to the external scale for compatibility with
        // the input data.
        let result_ext = data.inverse_transform_beta(result_std.clone());
        // The history also needs to be transformed back to the external scale, since it is public.
        // It shouldn't be used directly by the Fit's methods.
        let history = history
            .into_iter()
            .map(|IrlsStep { guess, like }| IrlsStep {
                guess: data.inverse_transform_beta(guess),
                like,
            })
            .collect();
        Self {
            model: PhantomData,
            data,
            result: result_ext,
            result_std,
            x_lin_pred,
            y_hat,
            options,
            model_like,
            reg,
            n_iter,
            history,
            n_par,
            // All cached quantities start empty and are filled on first access.
            phi: OnceCell::new(),
            dev_terms: OnceCell::new(),
            resid_pear: OnceCell::new(),
            covariance: OnceCell::new(),
            fisher_inv: OnceCell::new(),
            fisher_std_inv: OnceCell::new(),
            hat: OnceCell::new(),
            null_model: OnceCell::new(),
        }
    }
552
553    /// Returns the likelihood given the null model, which fixes all parameters
554    /// to zero except the intercept (if it is used). A total of `test_ndf()`
555    /// parameters are constrained.
556    pub fn null_like(&self) -> F {
557        let (null_like, _) = self.null_model_fit();
558        *null_like
559    }
560
    /// Return the likelihood and null model parameters, which will be zero with the possible
    /// exception of the intercept term. Since this can require an additional regression, the
    /// values are cached. Three cases are handled: no linear offset (closed form), offset with a
    /// free intercept (requires a one-parameter re-fit), and offset with no intercept (direct sum).
    fn null_model_fit(&self) -> &(F, Array1<F>) {
        self.null_model
            .get_or_init(|| match &self.data.linear_offset {
                None => {
                    // If there is no linear offset, the natural parameter is
                    // identical for all observations so it is sufficient to
                    // calculate the null likelihood for a single point with y equal
                    // to the average.
                    // The average y
                    let y_bar: F = self.data.weighted_sum(&self.data.y) / self.data.sum_weights();
                    // This approach assumes that the likelihood is in the natural
                    // exponential form as calculated by Glm::log_like_natural(). If that
                    // function is overridden and the values differ significantly, this
                    // approach will give incorrect results. If the likelihood has terms
                    // non-linear in y, then the likelihood must be calculated for every
                    // point rather than averaged.
                    // If the intercept is allowed to maximize the likelihood, the natural
                    // parameter is equal to the link of the expectation. Otherwise it is
                    // the transformation function of zero.
                    let intercept: F = if self.data.has_intercept {
                        M::Link::func(y_bar)
                    } else {
                        F::zero()
                    };
                    // this is a length-one array. This works because the
                    // likelihood contribution is the same for all observations.
                    let nat_par = M::Link::nat_param(array![intercept]);
                    // The null likelihood per observation
                    let null_like_one: F = M::log_like_natural(y_bar, nat_par[0]);
                    // just multiply the average likelihood by the number of data points, since every term is the same.
                    let null_like_total = self.data.sum_weights() * null_like_one;
                    // All coefficients zero except (possibly) the intercept.
                    let null_params: Array1<F> = {
                        let mut par = Array1::<F>::zeros(self.n_par);
                        par[0] = intercept;
                        par
                    };
                    (null_like_total, null_params)
                }
                Some(off) => {
                    if self.data.has_intercept {
                        // If there are linear offsets and the intercept is allowed
                        // to be free, there is not a major simplification and the
                        // model needs to be re-fit.
                        // the X data is a single column of ones. Since this model
                        // isn't being created by the ModelBuilder, the X data
                        // has to be automatically padded with ones.
                        let data_x_null = Array2::<F>::ones((self.data.y.len(), 1));
                        let null_model = Model {
                            model: std::marker::PhantomData::<M>,
                            data: Dataset::<F> {
                                y: self.data.y.clone(),
                                x: data_x_null,
                                linear_offset: Some(off.clone()),
                                weights: self.data.weights.clone(),
                                freqs: self.data.freqs.clone(),
                                // If we are in this branch it is because an intercept is needed.
                                has_intercept: true,
                                // We don't use standardization for the null model.
                                standardizer: None,
                            },
                        };
                        // TODO: Make this function return an error, although it's
                        // difficult to imagine this case happening.
                        // TODO: Should the tolerance of this fit be stricter?
                        // The intercept should not be regularized
                        let null_fit = null_model
                            .fit_options()
                            // There shouldn't be too much trouble fitting this
                            // single-parameter fit, but there shouldn't be harm in
                            // using the same maximum as in the original model.
                            .max_iter(self.options.max_iter)
                            .fit()
                            .expect("Could not fit null model!");
                        let null_params: Array1<F> = {
                            let mut par = Array1::<F>::zeros(self.n_par);
                            // there is only one parameter in this fit. It should be the same as
                            // the external result since this model doesn't have standardization.
                            par[0] = null_fit.result_std[0];
                            par
                        };
                        (null_fit.model_like, null_params)
                    } else {
                        // If the intercept is fixed to zero, then no minimization is
                        // required. The natural parameters are directly known in terms
                        // of the linear offset. The likelihood must still be summed
                        // over all observations, since they have different offsets.
                        let nat_par = M::Link::nat_param(off.clone());
                        let null_like_terms = ndarray::Zip::from(&self.data.y)
                            .and(&nat_par)
                            .map_collect(|&y, &eta| M::log_like_natural(y, eta));
                        let null_like = self.data.weighted_sum(&null_like_terms);
                        // Every coefficient, including the intercept, is zero here.
                        let null_params = Array1::<F>::zeros(self.n_par);
                        (null_like, null_params)
                    }
                }
            })
    }
661
662    /// Returns $`\hat{\mathbf{y}} = g^{-1}(\mathbf{X}\hat{\boldsymbol\beta} + \boldsymbol\omega_0)`$
663    /// given input data $`\mathbf{X}`$ and an optional linear offset
664    /// $`\boldsymbol\omega_0`$. The data need not be the training data.
665    pub fn predict<S>(&self, data_x: &ArrayBase<S, Ix2>, lin_off: Option<&Array1<F>>) -> Array1<F>
666    where
667        S: Data<Elem = F>,
668    {
669        let lin_pred = if self.data.has_intercept {
670            one_pad(data_x.view())
671        } else {
672            data_x.to_owned()
673        }
674        .dot(&self.result);
675        let lin_pred: Array1<F> = if let Some(off) = &lin_off {
676            lin_pred + *off
677        } else {
678            lin_pred
679        };
680        M::mean(&lin_pred)
681    }
682
    /// Returns the rank $`K`$ of the model (i.e. the number of parameters)
    fn rank(&self) -> F {
        // The parameter count of any realistic model fits exactly in the float
        // mantissa, so this conversion cannot fail in practice.
        F::from(self.n_par).unwrap()
    }
687
688    /// Return the deviance residuals for each point in the training data:
689    ///
690    /// ```math
691    /// d_i = \text{sign}(y_i - \hat\mu_i)\sqrt{D_i}
692    /// ```
693    ///
694    /// where $`D_i`$ is the per-observation deviance contribution.
695    /// This is usually a better choice than Pearson residuals for non-linear models.
696    pub fn resid_dev(&self) -> Array1<F> {
697        let signs = self.resid_resp().mapv_into(F::signum);
698        let ll_diff = self.deviance_terms();
699        let dev: Array1<F> = ll_diff.clone().mapv_into(num_traits::Float::sqrt);
700        signs * dev
701    }
702
703    /// Return the standardized deviance residuals (internally studentized):
704    ///
705    /// ```math
706    /// d_i^* = \frac{d_i}{\sqrt{\hat\phi(1 - h_i)}}
707    /// ```
708    ///
709    /// where $`d_i`$ is the deviance residual, $`\hat\phi`$ is the dispersion, and $`h_i`$ is
710    /// the leverage. Generally applicable for outlier detection.
711    pub fn resid_dev_std(&self) -> RegressionResult<Array1<F>, F> {
712        let dev = self.resid_dev();
713        let phi = self.dispersion();
714        let hat: Array1<F> = self.leverage()?;
715        let omh: Array1<F> = -hat + F::one();
716        let denom: Array1<F> = (omh * phi).mapv_into(num_traits::Float::sqrt);
717        Ok(dev / denom)
718    }
719
    /// Return the partial residuals as an n_obs × n_predictors matrix. Each column $`j`$
    /// contains working residuals plus the centered contribution of predictor $`j`$:
    ///
    /// ```math
    /// r^{(j)}_i = r^w_i + (x_{ij} - \bar x_j) \beta_j
    /// ```
    ///
    /// The bar denotes the **fully weighted** column mean (combining both variance and frequency
    /// weights), which is the WLS-consistent choice: it ensures that $`\sum_i w_i r^{(j)}_i = 0`$
    /// for each predictor. R's `residuals(model, type = "partial")` uses only frequency weights in
    /// its centering (excluding variance weights), so results will differ when variance weights are
    /// present. For models without an intercept, no centering is applied. The intercept term is
    /// always excluded from the output.
    pub fn resid_part(&self) -> Array2<F> {
        // working residuals as an (n_obs × 1) column, to broadcast across predictors
        let resid_work_i = self.resid_work().insert_axis(Axis(1));
        // total element count of the n×1 column equals the number of observations
        let n = resid_work_i.len();

        // Use external (unstandardized) feature columns and coefficients.
        let x_orig = self.data.x_orig();
        // The intercept term is not part of the partial residuals.
        // `beta_j` becomes a (1 × n_predictors) row for broadcasting below.
        let beta_j = if self.data.has_intercept {
            self.result.slice(s![1..]).to_owned()
        } else {
            self.result.clone()
        }
        .insert_axis(Axis(0));

        // Center by fully-weighted (variance × frequency) column means. This is the WLS-consistent
        // choice: sum_i w_i r^(j)_i = 0 for each predictor. R excludes variance weights from
        // centering; results will differ from R when variance weights are present with an intercept.
        // No centering for no-intercept models.
        let x_centered: Array2<F> = if self.data.has_intercept {
            // total weight for each observation, obtained by weighting a vector of ones
            let wt_factors = self
                .data
                .apply_total_weights(Array1::ones(n))
                .insert_axis(Axis(1));
            // weighted column means: sum_i(w_i * x_ij) / sum_i(w_i)
            let x_mean: Array1<F> = (&x_orig * &wt_factors).sum_axis(Axis(0)) / wt_factors.sum();
            x_orig - x_mean.insert_axis(Axis(0))
        } else {
            x_orig
        };
        resid_work_i + x_centered * beta_j
    }
763
764    /// Return the Pearson residuals for each point in the training data:
765    ///
766    /// ```math
767    /// r_i = \sqrt{w_i} \, \frac{y_i - \hat\mu_i}{\sqrt{V(\hat\mu_i)}}
768    /// ```
769    ///
770    /// where $`V`$ is the variance function and $`w_i`$ are the variance weights.
771    /// Not scaled by the dispersion for families with a free dispersion parameter.
772    pub fn resid_pear(&self) -> &Array1<F> {
773        self.resid_pear.get_or_init(|| {
774            let residuals = self.resid_resp();
775            let inv_var_diag: Array1<F> = self
776                .y_hat
777                .clone()
778                .mapv_into(M::variance)
779                .mapv_into(F::recip);
780            // the variance weights are the reciprocal of the corresponding variance
781            let scales = self
782                .data
783                .apply_var_weights(inv_var_diag)
784                .mapv_into(num_traits::Float::sqrt);
785            scales * residuals
786        })
787    }
788
789    /// Return the standardized Pearson residuals (internally studentized):
790    ///
791    /// ```math
792    /// r_i^* = \frac{r_i}{\sqrt{\hat\phi(1 - h_i)}}
793    /// ```
794    ///
795    /// where $`r_i`$ is the Pearson residual and $`h_i`$ is the leverage. These are expected
796    /// to have unit variance.
797    pub fn resid_pear_std(&self) -> RegressionResult<Array1<F>, F> {
798        let pearson = self.resid_pear();
799        let phi = self.dispersion();
800        let hat = self.leverage()?;
801        let omh = -hat + F::one();
802        let denom: Array1<F> = (omh * phi).mapv_into(num_traits::Float::sqrt);
803        Ok(pearson / denom)
804    }
805
806    /// Return the response residuals: $`e_i^\text{resp} = y_i - \hat\mu_i`$.
807    pub fn resid_resp(&self) -> Array1<F> {
808        &self.data.y - &self.y_hat
809    }
810
811    /// Return the externally studentized residuals:
812    ///
813    /// ```math
814    /// \tilde{t}_i = \text{sign}(e_i) \sqrt{\frac{D_i^{(-i)}}{\hat\phi^{(-i)}}}
815    /// ```
816    ///
817    /// where $`D_i^{(-i)}`$ and $`\hat\phi^{(-i)}`$ are the LOO deviance and dispersion
818    /// approximated via one-step deletion. Under normality, $`t`$-distributed with
819    /// $`N - K - 1`$ degrees of freedom. This is a robust and general method for outlier
820    /// detection.
821    pub fn resid_student(&self) -> RegressionResult<Array1<F>, F> {
822        let signs = self.resid_resp().mapv(F::signum);
823        let dev_terms_loo: Array1<F> = self.deviance_terms_loo()?;
824        // NOTE: This match could also be handled internally in dispersion_loo()
825        let dev_terms_scaled = match M::DISPERSED {
826            // The dispersion is corrected for the contribution from each current point.
827            // This is an approximation; the exact solution would perform a fit at each point.
828            DispersionType::FreeDispersion => dev_terms_loo / self.dispersion_loo()?,
829            DispersionType::NoDispersion => dev_terms_loo,
830        };
831        Ok(signs * dev_terms_scaled.mapv_into(num_traits::Float::sqrt))
832    }
833
834    /// Returns the working residuals:
835    ///
836    /// ```math
837    /// e_i^\text{work} = g'(\hat\mu_i)\,(y_i - \hat\mu_i) = \frac{y_i - \hat\mu_i}{\eta'(\omega_i)\,V(\hat\mu_i)}
838    /// ```
839    ///
840    /// where $`g'(\mu)`$ is the derivative of the link function and $`\eta'(\omega)`$ is the
841    /// derivative of the natural parameter with respect to the linear predictor. For canonical
842    /// links $`\eta'(\omega) = 1`$, reducing this to $`(y_i - \hat\mu_i)/V(\hat\mu_i)`$.
843    ///
844    /// These can be interpreted as the residual differences mapped into the linear predictor space
845    /// of $`\omega = \mathbf{x}\cdot\boldsymbol{\beta}`$.
846    pub fn resid_work(&self) -> Array1<F> {
847        let resid_response: Array1<F> = self.resid_resp();
848        let var: Array1<F> = self.y_hat.clone().mapv(M::variance);
849        // adjust for non-canonical link functions; we want a total factor of 1/eta'
850        let (adj_response, adj_var) =
851            M::Link::adjust_errors_variance(resid_response, var, &self.x_lin_pred);
852        adj_response / adj_var
853    }
854
    /// Returns the score function $`\nabla_{\boldsymbol\beta} l`$ (the gradient of the
    /// regularized log-likelihood) at the parameter values given. Should be zero at the MLE.
    /// The input and output are in the external (unstandardized) parameter space.
    pub fn score(&self, params: Array1<F>) -> Array1<F> {
        // Compute in the internal (standardized) basis so the regularization gradient is applied
        // to the correct parameters, then transform the result back to external coordinates via
        // score_ext = J^T score_int where J = d(beta_std)/d(beta_ext).
        let params_std = self.data.transform_beta(params);
        let lin_pred: Array1<F> = self.data.linear_predictor_std(&params_std);
        let mu: Array1<F> = M::mean(&lin_pred);
        let resid_response = &self.data.y - mu;
        // map the response residuals into working residuals for this link
        let resid_working = M::Link::adjust_errors(resid_response, &lin_pred);
        // unregularized score: X^T (working residuals); x_conj() presumably folds
        // in the observation weights — verify against Dataset's definition.
        let score_int = self.data.x_conj().dot(&resid_working);
        // add the regularizer's gradient contribution
        let score_int_reg = self.reg.as_ref().gradient(score_int, &params_std);
        self.data.inverse_transform_score(score_int_reg)
    }
871
    /// Returns the score test statistic:
    ///
    /// ```math
    /// S = \mathbf{J}(\boldsymbol\beta_0)^\mathsf{T} \, \mathcal{I}(\boldsymbol\beta_0)^{-1} \, \mathbf{J}(\boldsymbol\beta_0)
    /// ```
    ///
    /// where $`\mathbf{J}`$ is the score and $`\mathcal{I}`$ is the Fisher information, both
    /// evaluated at the null parameters. Asymptotically $`\chi^2`$-distributed with
    /// [`test_ndf()`](Self::test_ndf) degrees of freedom.
    pub fn score_test(&self) -> RegressionResult<F, F> {
        let (_, null_params) = self.null_model_fit();
        // NOTE: The null model is not sensitive to standardization.
        // `score_test_against` takes the parameters by value, so the borrowed
        // null parameters must be cloned here.
        self.score_test_against(null_params.clone())
    }
886
    /// Returns the score test statistic compared to another set of model
    /// parameters, not necessarily a null model. The degrees of freedom cannot
    /// be generally inferred.
    pub fn score_test_against(&self, alternative: Array1<F>) -> RegressionResult<F, F> {
        let fisher_alt = self.fisher(&alternative);
        let score_alt = self.score(alternative);
        // This is not the same as the cached covariance matrix since it is
        // evaluated at the null parameters.
        // NOTE: invh/invh_into() are bugged and incorrect!
        let inv_fisher_alt = fisher_alt.inv_into()?;
        // quadratic form J^T I^{-1} J
        Ok(score_alt.t().dot(&inv_fisher_alt.dot(&score_alt)))
    }
899
900    /// The degrees of freedom for the likelihood ratio test, the score test,
901    /// and the Wald test. Not to be confused with `ndf()`, the degrees of
902    /// freedom in the model fit.
903    pub fn test_ndf(&self) -> usize {
904        if self.data.has_intercept {
905            self.n_par - 1
906        } else {
907            self.n_par
908        }
909    }
910
    /// Returns the Wald test statistic:
    ///
    /// ```math
    /// W = (\hat{\boldsymbol\beta} - \boldsymbol\beta_0)^\mathsf{T} \, \mathcal{I}(\boldsymbol\beta_0) \, (\hat{\boldsymbol\beta} - \boldsymbol\beta_0)
    /// ```
    ///
    /// Compared to a null model with only an intercept (if one is used). Asymptotically
    /// $`\chi^2`$-distributed with [`test_ndf()`](Self::test_ndf) degrees of freedom.
    pub fn wald_test(&self) -> F {
        // The null parameters are all zero except for a possible intercept term
        // which optimizes the null model. The null fit appears to be cached, so
        // repeated tests should not re-fit it — see the struct's OnceCell fields.
        let (_, null_params) = self.null_model_fit();
        // NOTE: The null model is agnostic to standardization, since it discards all features
        // except perhaps the intercept.
        self.wald_test_against(null_params)
    }
927
928    /// Returns the Wald test statistic compared to another specified model fit
929    /// instead of the null model. The degrees of freedom cannot be generally
930    /// inferred.
931    pub fn wald_test_against(&self, alternative: &Array1<F>) -> F {
932        // This could be computed in either the internal or external basis. This implementation
933        // will use internal, so first we just need to transform the input.
934        let alt_std = self.data.transform_beta(alternative.clone());
935        let d_params_std = &self.result_std - &alt_std;
936        // Get the fisher matrix at the *other* result, since this is a test of this fit's
937        // parameters under a model given by the other.
938        let fisher_std: Array2<F> = self.fisher_std(&alt_std);
939        d_params_std.t().dot(&fisher_std.dot(&d_params_std))
940    }
941
942    /// Returns the per-parameter Wald $`z`$-statistic:
943    ///
944    /// ```math
945    /// z_k = \frac{\hat\beta_k}{\sqrt{\text{Cov}[\hat{\boldsymbol\beta}]_{kk}}}
946    /// ```
947    ///
948    /// Since it does not account for covariance between parameters it may not be accurate.
949    pub fn wald_z(&self) -> RegressionResult<Array1<F>, F> {
950        let par_cov = self.covariance()?;
951        let par_variances: ArrayView1<F> = par_cov.diag();
952        Ok(&self.result / &par_variances.mapv(num_traits::Float::sqrt))
953    }
954}
955
956/// Specialized functions for OLS.
957impl<'a, F> Fit<'a, Linear, F>
958where
959    F: 'static + Float,
960{
961    /// Returns the coefficient of determination $`R^2`$:
962    ///
963    /// ```math
964    /// R^2 = 1 - \frac{D}{D_0}
965    /// ```
966    ///
967    /// where $`D`$ is the deviance of the fitted model and $`D_0`$ is the null deviance
968    /// (deviance of the intercept-only model). For Gaussian with no weights this reduces to
969    /// $`1 - \text{RSS}/\text{TSS}`$. Using the deviance ratio correctly handles variance and
970    /// frequency weights.
971    // NOTE: that this implementation would be valid as a pseudo-$`R^2`$ for all GLMs, but it's
972    // mostly expected in the OLS context and probably wouldn't add much.
973    pub fn r_sq(&self) -> F {
974        let lr = self.lr_test();
975        lr / (lr + self.deviance())
976    }
977
978    /// Returns the weighted residual sum of squares (RSS):
979    /// $`\text{RSS} = \sum_i w_i (y_i - \hat\mu_i)^2`$
980    /// where $`w_i`$ combines variance and frequency weights.
981    // NOTE: this implementation would be well-defined for any GLM, but there are typically better
982    // tools for the job so it's not exposed for non-linear families.
983    pub fn rss(&self) -> F {
984        let resid_sq = self.resid_resp().mapv(|r| r * r);
985        self.data.apply_total_weights(resid_sq).sum()
986    }
987}
988
/// Methods that require the `stats` feature for distribution CDF evaluation.
#[cfg(feature = "stats")]
impl<'a, M, F> Fit<'a, M, F>
where
    M: Glm,
    F: 'static + Float,
{
    /// Perform a full re-fit of the model excluding the ith data column and return the reduced
    /// model's deviance.
    ///
    /// NOTE: Ideally we could return a full Fit object, but as it stands the source data must
    /// outlive the Fit result. It would be nice if we could get around this with some COW
    /// shenanigans but this will probably require a big change. To get around this for now, just
    /// return the deviance, which is all we're using this function for at the moment.
    fn dev_without_covariate(&self, i: usize) -> RegressionResult<F, F> {
        use ndarray::{concatenate, s};

        let is_intercept = self.data.has_intercept && (i == 0);
        let x_reduced: Array2<F> = if is_intercept {
            // If this is the intercept term, we need to use the original unscaled data to avoid
            // mixing the intercept into the other terms through the standardization.
            self.data.x_ext().slice(s![.., 1..]).to_owned()
        } else {
            // Drop column i from the design matrix.
            concatenate![
                Axis(1),
                self.data.x.slice(s![.., ..i]),
                self.data.x.slice(s![.., i + 1..])
            ]
        };
        // NOTE: This data has been standardized already. Since we are fully dropping a covariate,
        // each data column should still be standardized if the full external set is.
        let data_reduced = Dataset::<F> {
            y: self.data.y.clone(),
            x: x_reduced,
            linear_offset: self.data.linear_offset.clone(),
            weights: self.data.weights.clone(),
            freqs: self.data.freqs.clone(),
            has_intercept: !is_intercept,
            // It shouldn't be necessary to standardize again. Each individual column should
            // already be standardized, if the full dataset was.
            standardizer: None,
        };
        let model_reduced = Model {
            model: PhantomData::<M>,
            data: data_reduced,
        };
        // Start with the non-excluded parameters at the values from the main fit.
        let init_guess = if is_intercept {
            // For the intercept term, we need to use the original unscaled data, or else the
            // transformation mixes the intercept with the other terms. This number is supposed to
            // represent the model deviance from excluding the *external* intercept term.
            self.result.slice(s![1..]).to_owned()
        } else {
            // The data is already standardized in the non-intercept case, so we should start the
            // the standardized parameters, removing the covariate of interest.
            concatenate![
                Axis(0),
                self.result_std.slice(s![..i]),
                self.result_std.slice(s![i + 1..])
            ]
        };
        let fit_options = {
            let mut fit_options = self.options.clone();
            fit_options.init_guess = Some(init_guess);
            fit_options
        };
        let fit_reduced = model_reduced.with_options(fit_options).fit()?;
        Ok(fit_reduced.deviance())
    }

    /// Returns the p-value for the omnibus likelihood-ratio test (full model vs. null/intercept-only
    /// model).
    ///
    /// The LR statistic is asymptotically $`\chi^2`$-distributed with
    /// [`test_ndf()`](Self::test_ndf) degrees of freedom, so the p-value is the upper-tail
    /// probability:
    ///
    /// ```math
    /// p = 1 - F_{\chi^2}(\Lambda;\, \text{test\_ndf})
    /// ```
    pub fn pvalue_lr_test(&self) -> F {
        use statrs::distribution::{ChiSquared, ContinuousCDF};
        let stat = self.lr_test().to_f64().unwrap();
        let ndf = self.test_ndf() as f64;
        if ndf == 0.0 {
            // With no tested parameters the test is vacuous and cannot reject.
            return F::one();
        }
        let chi2 = ChiSquared::new(ndf).unwrap();
        // sf() is the upper-tail (survival) probability
        F::from(chi2.sf(stat)).unwrap()
    }

    /// Returns per-parameter p-values from the Wald $`z`$-statistics.
    ///
    /// The reference distribution depends on whether the family has a free dispersion parameter:
    ///
    /// - **No dispersion** (logistic, Poisson): standard normal — two-tailed
    ///   $`p_k = 2\bigl[1 - \Phi(|z_k|)\bigr]`$
    /// - **Free dispersion** (linear): Student-$`t`$ with [`ndf()`](Self::ndf) degrees of
    ///   freedom — two-tailed $`p_k = 2\bigl[1 - F_t(|z_k|;\, \text{ndf})\bigr]`$
    ///
    /// IMPORTANT: Note that this test is an approximation for non-linear models and is known to
    /// sometimes yield misleading values compared to an exact test. It is not hard to find it
    /// give p-values that may imply significantly different conclusions for your analysis (e.g.
    /// p<0.07 vs. p<0.02 in one of our tests).
    pub fn pvalue_wald(&self) -> RegressionResult<Array1<F>, F> {
        let z = self.wald_z()?;
        let pvals = match M::DISPERSED {
            DispersionType::NoDispersion => {
                use statrs::distribution::Normal;
                let norm = Normal::standard();
                z.mapv(|zi| {
                    let abs_z = num_traits::Float::abs(zi).to_f64().unwrap();
                    // two-sided tail probability
                    F::from(2.0 * norm.sf(abs_z)).unwrap()
                })
            }
            DispersionType::FreeDispersion => {
                use statrs::distribution::StudentsT;
                let df = self.ndf().to_f64().unwrap();
                let t_dist = StudentsT::new(0.0, 1.0, df).unwrap();
                z.mapv(|zi| {
                    let abs_z = num_traits::Float::abs(zi).to_f64().unwrap();
                    // two-sided tail probability
                    F::from(2.0 * t_dist.sf(abs_z)).unwrap()
                })
            }
        };
        Ok(pvals)
    }

    /// Returns per-parameter p-values from drop-one analysis of deviance (exact, expensive).
    ///
    /// For each parameter $`k`$, a reduced model is fit with that parameter removed and the
    /// deviance difference $`\Delta D = D_{\text{reduced}} - D_{\text{full}}`$ is used:
    ///
    /// - **No dispersion**: $`p = 1 - F_{\chi^2}(\Delta D;\, 1)`$
    /// - **Free dispersion**: $`F = \Delta D / \hat\phi`$, $`p = 1 - F_F(F;\, 1,\,
    ///   \text{ndf})`$
    ///
    /// For the intercept (if present), the reduced model is fit without an intercept. For all
    /// other parameters, the reduced model is fit with that column removed from the design matrix.
    pub fn pvalue_exact(&self) -> RegressionResult<Array1<F>, F> {
        let n_par = self.n_par;
        let dev_full = self.deviance();
        let phi_full = self.dispersion();
        let ndf_f64 = self.ndf().to_f64().unwrap();
        let mut pvals = Array1::<F>::zeros(n_par);
        for k in 0..n_par {
            let dev_reduced = self.dev_without_covariate(k)?;
            let delta_d = dev_reduced - dev_full;

            let p = match M::DISPERSED {
                DispersionType::NoDispersion => {
                    use statrs::distribution::ChiSquared;
                    let chi2 = ChiSquared::new(1.0).unwrap();
                    // Use the survival function rather than 1 - cdf for better
                    // precision in the upper tail, consistent with the branch
                    // below and with pvalue_lr_test().
                    chi2.sf(delta_d.to_f64().unwrap())
                }
                DispersionType::FreeDispersion => {
                    use statrs::distribution::FisherSnedecor;
                    let f_stat = delta_d / phi_full;
                    let f_dist = FisherSnedecor::new(1.0, ndf_f64).unwrap();
                    f_dist.sf(f_stat.to_f64().unwrap())
                }
            };
            pvals[k] = F::from(p).unwrap();
        }

        Ok(pvals)
    }
}
1155
#[cfg(feature = "stats")]
impl<'a, M, F> Fit<'a, M, F>
where
    M: Glm + Response,
    F: 'static + Float,
{
    /// Returns the quantile residuals, which are standard-normal distributed by construction if
    /// the residuals are distributed according to the GLM family's response function.
    ///
    /// ```math
    /// q_i = \Phi^{-1}(F(y_i | \mathbf{x}_i \cdot \boldsymbol{\beta}))
    /// ```
    /// where $`\Phi`$ is the standard normal quantile function and $`F`$ is the CDF of the
    /// response distribution, conditioned on the predicted mean.
    ///
    /// Only implemented for families with continuous response distributions. The variance weights
    /// are used to scale the spread for families that use free dispersion. For the Linear model,
    /// this is an expensive way of getting $`\frac{(y_i - \hat \mu_i)}{\sqrt{\hat\phi / w_i}} =
    /// \frac{(y_i - \hat \mu_i)}{\hat\sigma_i}`$.
    ///
    /// These residuals are not standardized/studentized in any way, meaning the impact of each
    /// observation is present in the corresponding response distribution for that point.
    pub fn resid_quantile(&self) -> Array1<F>
    where
        <M as Response>::DistributionType: ContinuousCDF<f64, f64>,
    {
        use ndarray::Zip;
        use statrs::distribution::Normal;

        let std_norm = Normal::standard();
        // statrs distributions work in f64, so convert the means and dispersion
        let mu = self.y_hat.mapv(|mu| mu.to_f64().unwrap());
        let phi: f64 = self.dispersion().to_f64().unwrap();
        // The effective dispersion for each point should be scaled by the variance weight.
        // NOTE: The variance weights will not impact the families with no dispersion, as those do
        // not use the phi parameter in get_distribution().
        let phi_i = self
            .data
            .get_variance_weights()
            .mapv(|w| phi / w.to_f64().unwrap());
        Zip::from(&self.data.y)
            .and(&mu)
            .and(&phi_i)
            .map_collect(|y, &mu, &phi_i| {
                let dist = M::get_distribution(mu, phi_i);
                // This variable should be standard-uniform distributed under data following the model
                // assumption
                let f = dist.cdf(y.to_f64().unwrap());
                // map through the standard normal quantile function
                F::from_f64(std_norm.inverse_cdf(f)).unwrap()
            })
    }
}
1207
1208#[cfg(test)]
1209mod tests {
1210    use super::*;
1211    use crate::{Linear, Logistic, model::ModelBuilder};
1212    use anyhow::Result;
1213    use approx::assert_abs_diff_eq;
1214    use ndarray::Axis;
1215    use ndarray::{concatenate, s};
1216
    /// Checks if the test statistics are invariant based upon whether the data is standardized.
    #[test]
    fn standardization_invariance() -> Result<()> {
        let data_y = array![true, false, false, true, true, true, true, false, true];
        let data_x = array![-0.5, 0.3, -0.6, 0.2, 0.3, 1.2, 0.8, 0.6, -0.2].insert_axis(Axis(1));
        let lin_off = array![0.1, 0.0, -0.1, 0.2, 0.1, 0.3, 0.4, -0.1, 0.1];
        // Fit the same model twice: once with standardization disabled...
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .no_standardize()
            .linear_offset(lin_off.clone())
            .build()?;
        let fit = model.fit()?;
        // ...and once with the default (standardized) configuration.
        let model_std = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .linear_offset(lin_off)
            .build()?;
        let fit_std = model_std.fit()?;
        // External results and all derived statistics should agree between the two fits.
        assert_abs_diff_eq!(&fit.result, &fit_std.result, epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(
            fit.covariance()?,
            fit_std.covariance()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );
        let lr = fit.lr_test();
        let lr_std = fit_std.lr_test();
        assert_abs_diff_eq!(lr, lr_std, epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(
            fit.score_test()?,
            fit_std.score_test()?,
            epsilon = f32::EPSILON as f64
        );
        assert_abs_diff_eq!(
            fit.wald_test(),
            fit_std.wald_test(),
            epsilon = 4.0 * f64::EPSILON
        );
        assert_abs_diff_eq!(fit.aic(), fit_std.aic(), epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(fit.bic(), fit_std.bic(), epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(
            fit.deviance(),
            fit_std.deviance(),
            epsilon = 4.0 * f64::EPSILON
        );
        // The Wald Z-score of the intercept term is also invariant, with the new scaling
        // approach, so we can compare the full vectors.
        assert_abs_diff_eq!(
            fit.wald_z()?,
            fit_std.wald_z()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );
        // try p-values
        #[cfg(feature = "stats")]
        assert_abs_diff_eq!(
            fit.pvalue_lr_test(),
            fit_std.pvalue_lr_test(),
            epsilon = 0.01 * f32::EPSILON as f64
        );
        #[cfg(feature = "stats")]
        assert_abs_diff_eq!(
            fit.pvalue_wald()?,
            fit_std.pvalue_wald()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );
        #[cfg(feature = "stats")]
        assert_abs_diff_eq!(
            fit.pvalue_exact()?,
            fit_std.pvalue_exact()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );

        // Ensure that the score and fisher functions are identical even when evaluated at another
        // point. The fit results are near [0.5, 0.5], so pick somewhere not too close.
        let other = array![-0.5, 2.0];
        assert_abs_diff_eq!(fit.score(other.clone()), fit_std.score(other.clone()));
        assert_abs_diff_eq!(fit.fisher(&other), fit_std.fisher(&other.clone()));

        Ok(())
    }
1293
    /// Checks the behavior of a model with zero covariates: the intercept-only fit should match
    /// the link of the sample mean, and the likelihood-ratio test should be exactly zero.
    #[test]
    fn null_model() -> Result<()> {
        let data_y = array![true, false, false, true, true];
        // a design matrix with five rows and zero columns
        let data_x: Array2<f64> = array![[], [], [], [], []];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        dbg!(fit.n_iter);
        dbg!(&fit.result);
        // with no offsets, the result should be the link function of the mean.
        assert_abs_diff_eq!(
            fit.result[0],
            <Logistic as Glm>::Link::func(0.6),
            epsilon = 4.0 * f64::EPSILON
        );
        let empty_null_like = fit.null_like();
        assert_eq!(fit.test_ndf(), 0);
        dbg!(&fit.model_like);
        let lr = fit.lr_test();
        // Since there is no data, the null likelihood should be identical to
        // the fit likelihood, so the likelihood ratio test should yield zero.
        assert_abs_diff_eq!(lr, 0., epsilon = 4. * f64::EPSILON);

        // Check that the assertions still hold if linear offsets are included.
        let lin_off: Array1<f64> = array![0.2, -0.1, 0.1, 0.0, 0.1];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .linear_offset(lin_off)
            .build()?;
        let fit_off = model.fit()?;
        let empty_model_like_off = fit_off.model_like;
        let empty_null_like_off = fit_off.null_like();
        // these two assertions should be equivalent
        assert_abs_diff_eq!(fit_off.lr_test(), 0.);
        assert_abs_diff_eq!(empty_model_like_off, empty_null_like_off);

        // check consistency with data provided
        let data_x_with = array![[0.5], [-0.2], [0.3], [0.4], [-0.1]];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x_with).build()?;
        let fit_with = model.fit()?;
        dbg!(&fit_with.result);
        // The null likelihood of the model with parameters should be the same
        // as the likelihood of the model with only the intercept.
        assert_abs_diff_eq!(empty_null_like, fit_with.null_like());

        Ok(())
    }
1339
1340    #[test]
1341    fn null_like_logistic() -> Result<()> {
1342        // 6 true and 4 false for y_bar = 0.6.
1343        let data_y = array![
1344            true, true, true, true, true, true, false, false, false, false
1345        ];
1346        let ybar: f64 = 0.6;
1347        let data_x = array![0.4, 0.2, 0.5, 0.1, 0.6, 0.7, 0.3, 0.8, -0.1, 0.1].insert_axis(Axis(1));
1348        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1349        let fit = model.fit()?;
1350        let target_null_like = fit
1351            .data
1352            .y
1353            .mapv(|y| {
1354                let eta = (ybar / (1. - ybar)).ln();
1355                y * eta - eta.exp().ln_1p()
1356            })
1357            .sum();
1358        assert_abs_diff_eq!(fit.null_like(), target_null_like);
1359        Ok(())
1360    }
1361
1362    // Check that the deviance is equal to the sum of square deviations for a linear model
1363    #[test]
1364    fn deviance_linear() -> Result<()> {
1365        let data_y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4, 1.1, 0.2];
1366        let data_x = array![0.6, 2.1, 0.4, -3.2, 0.7, 0.1, -0.3, 0.5].insert_axis(Axis(1));
1367        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1368        let fit = model.fit()?;
1369        // The predicted values of Y given the model.
1370        let pred_y = fit.predict(&data_x, None);
1371        let target_dev = (data_y - pred_y).mapv(|dy| dy * dy).sum();
1372        assert_abs_diff_eq!(fit.deviance(), target_dev, epsilon = 4. * f64::EPSILON);
1373        Ok(())
1374    }
1375
1376    // Check that the deviance and dispersion parameter are equal up to the number of degrees of
1377    // freedom for a linea model.
1378    #[test]
1379    fn deviance_dispersion_eq_linear() -> Result<()> {
1380        let data_y = array![0.2, -0.1, 0.4, 1.3, 0.2, -0.6, 0.9];
1381        let data_x = array![
1382            [0.4, 0.2],
1383            [0.1, 0.4],
1384            [-0.1, 0.3],
1385            [0.5, 0.7],
1386            [0.4, 0.1],
1387            [-0.2, -0.3],
1388            [0.4, -0.1]
1389        ];
1390        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1391        let fit = model.fit()?;
1392        let dev = fit.deviance();
1393        let disp = fit.dispersion();
1394        let ndf = fit.ndf();
1395        assert_abs_diff_eq!(dev, disp * ndf, epsilon = 4. * f64::EPSILON);
1396        Ok(())
1397    }
1398
    // Check that the residuals for a linear model are all consistent.
    #[test]
    fn residuals_linear() -> Result<()> {
        let data_y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
        let weights = array![0.8, 1.2, 0.9, 0.8, 1.1, 0.9];
        // the implied variances from the weights
        let wgt_sigmas = weights.map(|w: &f64| 1. / w.sqrt());
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
            .var_weights(weights.clone())
            .build()?;
        let fit = model.fit()?;
        // For a weighted linear model, the response residuals scaled by the implied
        // standard deviations should coincide with both the Pearson and deviance residuals.
        let response = fit.resid_resp();
        let resp_scaled = &response / wgt_sigmas;
        let pearson = fit.resid_pear();
        let deviance = fit.resid_dev();
        assert_abs_diff_eq!(resp_scaled, pearson);
        assert_abs_diff_eq!(resp_scaled, deviance);
        // The standardized Pearson and deviance residuals should also agree here.
        let pearson_std = fit.resid_pear_std()?;
        let deviance_std = fit.resid_dev_std()?;
        assert_abs_diff_eq!(pearson_std, deviance_std, epsilon = 8. * f64::EPSILON);
        // The externally-studentized residuals aren't expected to match the internally-studentized
        // ones.
        let dev_terms_loo = fit.deviance_terms_loo()?;
        let disp_terms_loo = fit.dispersion_loo()?;
        let student = fit.resid_student()?;

        // NOTE: Studentization can't be checked directly in general because the method used is a
        // one-step approximation, however it should be exact in the linear OLS case.
        let n_data = data_y.len();
        // Check that the leave-one-out stats hold literally
        let mut loo_diff: Vec<f64> = Vec::new();
        let mut loo_dev_res: Vec<f64> = Vec::new();
        let mut loo_disp: Vec<f64> = Vec::new();
        for i in 0..n_data {
            // Split the data around observation i and re-assemble it without that point.
            let ya = data_y.slice(s![0..i]);
            let yb = data_y.slice(s![i + 1..]);
            let xa = data_x.slice(s![0..i, ..]);
            let xb = data_x.slice(s![i + 1.., ..]);
            let wa = weights.slice(s![0..i]);
            let wb = weights.slice(s![i + 1..]);
            let y_loo = concatenate![Axis(0), ya, yb];
            let x_loo = concatenate![Axis(0), xa, xb];
            let w_loo = concatenate![Axis(0), wa, wb];
            // Re-fit the model with observation i left out.
            let model_i = ModelBuilder::<Linear>::data(&y_loo, &x_loo)
                .var_weights(w_loo)
                .build()?;
            let fit_i = model_i.fit()?;
            let yi = data_y[i];
            let xi = data_x.slice(s![i..i + 1, ..]);
            let wi = weights[i];
            // Predict the held-out observation from the leave-one-out fit.
            let yi_pred: f64 = fit_i.predict(&xi, None)[0];
            let disp_i = fit_i.dispersion();
            let var_i = disp_i / wi;
            let diff_i = yi - yi_pred;
            let res_dev_i = diff_i / var_i.sqrt();
            // Record the weighted squared deviation, the LOO dispersion, and the
            // externally-studentized residual for observation i.
            loo_diff.push(wi * diff_i * diff_i);
            loo_disp.push(disp_i);
            loo_dev_res.push(res_dev_i);
        }
        let loo_diff: Array1<f64> = loo_diff.into();
        let loo_disp: Array1<f64> = loo_disp.into();
        let loo_dev_res: Array1<f64> = loo_dev_res.into();
        // The fit's one-step LOO statistics should match the literal re-fits exactly
        // (up to floating-point error) in the linear case.
        assert_abs_diff_eq!(dev_terms_loo, loo_diff, epsilon = 8. * f64::EPSILON);
        assert_abs_diff_eq!(disp_terms_loo, loo_disp, epsilon = 8. * f64::EPSILON);
        assert_abs_diff_eq!(student, loo_dev_res, epsilon = 8. * f64::EPSILON);
        Ok(())
    }
1467
1468    // check the null likelihood for the case where it can be counted exactly.
1469    #[test]
1470    fn null_like_linear() -> Result<()> {
1471        let data_y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4, 1.1, 0.2];
1472        let data_x = array![0.6, 2.1, 0.4, -3.2, 0.7, 0.1, -0.3, 0.5].insert_axis(Axis(1));
1473        let ybar: f64 = data_y.mean().unwrap();
1474        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1475        let fit = model.fit()?;
1476        // let target_null_like = data_y.mapv(|y| -0.5 * (y - ybar) * (y - ybar)).sum();
1477        let target_null_like = data_y.mapv(|y| y * ybar - 0.5 * ybar * ybar).sum();
1478        // With the saturated likelihood subtracted the null likelihood should
1479        // just be the sum of squared differences from the mean.
1480        // let target_null_like = 0.;
1481        // dbg!(target_null_like);
1482        let fit_null_like = fit.null_like();
1483        assert_abs_diff_eq!(2. * (fit.model_like - fit_null_like), fit.lr_test());
1484        assert_eq!(fit.test_ndf(), 1);
1485        assert_abs_diff_eq!(
1486            fit_null_like,
1487            target_null_like,
1488            epsilon = 4.0 * f64::EPSILON
1489        );
1490        Ok(())
1491    }
1492
1493    // check the leave-one-out one-step for the linear model
1494    #[test]
1495    fn loo_linear() -> Result<()> {
1496        let data_y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
1497        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
1498        let weights = array![1.0, 1.2, 0.8, 1.1, 1.0, 0.7];
1499        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
1500            .var_weights(weights.clone())
1501            .build()?;
1502        let fit = model.fit()?;
1503
1504        let loo_exact = fit.loo_exact()?;
1505
1506        let loo_coef: Array2<f64> = fit.infl_coef()?;
1507        let loo_results = &fit.result - loo_coef;
1508        let n_data = data_y.len();
1509        for i in 0..n_data {
1510            let ya = data_y.slice(s![0..i]);
1511            let yb = data_y.slice(s![i + 1..]);
1512            let xa = data_x.slice(s![0..i, ..]);
1513            let xb = data_x.slice(s![i + 1.., ..]);
1514            let wa = weights.slice(s![0..i]);
1515            let wb = weights.slice(s![i + 1..]);
1516            let y_loo = concatenate![Axis(0), ya, yb];
1517            let x_loo = concatenate![Axis(0), xa, xb];
1518            let w_loo = concatenate![Axis(0), wa, wb];
1519            let model_i = ModelBuilder::<Linear>::data(&y_loo, &x_loo)
1520                .var_weights(w_loo)
1521                .build()?;
1522            let fit_i = model_i.fit()?;
1523            assert_abs_diff_eq!(
1524                loo_exact.row(i),
1525                &fit_i.result,
1526                epsilon = f32::EPSILON as f64
1527            );
1528            assert_abs_diff_eq!(
1529                loo_results.row(i),
1530                &fit_i.result,
1531                epsilon = f32::EPSILON as f64
1532            );
1533        }
1534        Ok(())
1535    }
1536
1537    // check the leave-one-out one-step for the logistic model
1538    #[test]
1539    fn loo_logistic() -> Result<()> {
1540        let data_y = array![false, false, true, true, true, false];
1541        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
1542        let weights = array![1.0, 1.2, 0.8, 1.1, 1.0, 0.7];
1543        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
1544            .var_weights(weights.clone())
1545            .build()?;
1546        let fit = model.fit()?;
1547        let fit_reg = model.fit_options().l2_reg(0.5).fit()?;
1548
1549        // NOTE: The one-step approximation fails for non-linear response functions, so we
1550        // should only test the exact case.
1551        let loo_exact = fit.loo_exact()?;
1552        let loo_exact_reg = fit_reg.loo_exact()?;
1553        let n_data = data_y.len();
1554        for i in 0..n_data {
1555            let ya = data_y.slice(s![0..i]);
1556            let yb = data_y.slice(s![i + 1..]);
1557            let xa = data_x.slice(s![0..i, ..]);
1558            let xb = data_x.slice(s![i + 1.., ..]);
1559            let wa = weights.slice(s![0..i]);
1560            let wb = weights.slice(s![i + 1..]);
1561            let y_loo = concatenate![Axis(0), ya, yb];
1562            let x_loo = concatenate![Axis(0), xa, xb];
1563            let w_loo = concatenate![Axis(0), wa, wb];
1564            let model_i = ModelBuilder::<Logistic>::data(&y_loo, &x_loo)
1565                .var_weights(w_loo)
1566                .build()?;
1567            let fit_i = model_i.fit()?;
1568            let fit_i_reg = model_i.fit_options().l2_reg(0.5).fit()?;
1569            assert_abs_diff_eq!(
1570                loo_exact.row(i),
1571                &fit_i.result,
1572                epsilon = f32::EPSILON as f64
1573            );
1574            assert_abs_diff_eq!(
1575                loo_exact_reg.row(i),
1576                &fit_i_reg.result,
1577                epsilon = f32::EPSILON as f64
1578            );
1579        }
1580        Ok(())
1581    }
1582
1583    // check the null likelihood where there is no dependence on the X data.
1584    #[test]
1585    fn null_like_logistic_nodep() -> Result<()> {
1586        let data_y = array![true, true, false, false, true, false, false, true];
1587        let data_x = array![0.4, 0.2, 0.4, 0.2, 0.7, 0.7, -0.1, -0.1].insert_axis(Axis(1));
1588        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1589        let fit = model.fit()?;
1590        let lr = fit.lr_test();
1591        assert_abs_diff_eq!(lr, 0.);
1592        Ok(())
1593    }
1594    // TODO: Test that the statistics behave sensibly under regularization. The
1595    // likelihood ratio test should yield a smaller value.
1596
1597    // Test the basic caching funcions.
1598    #[test]
1599    fn cached_computations() -> Result<()> {
1600        let data_y = array![true, true, false, true, true, false, false, false, true];
1601        let data_x = array![0.4, 0.1, -0.3, 0.7, -0.5, -0.1, 0.8, 1.0, 0.4].insert_axis(Axis(1));
1602        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1603        let fit = model.fit()?;
1604        let _null_like = fit.null_like();
1605        let _null_like = fit.null_like();
1606        let _cov = fit.covariance()?;
1607        let _wald = fit.wald_z();
1608        Ok(())
1609    }
1610
1611    // Check the consistency of the various statistical tests for linear
1612    // regression, where they should be the most comparable.
1613    #[test]
1614    fn linear_stat_tests() -> Result<()> {
1615        let data_y = array![-0.3, -0.1, 0.0, 0.2, 0.4, 0.5, 0.8, 0.8, 1.1];
1616        let data_x = array![-0.5, -0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.3].insert_axis(Axis(1));
1617        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1618        let fit = model.fit()?;
1619        let lr = fit.lr_test();
1620        let wald = fit.wald_test();
1621        let score = fit.score_test()?;
1622        let eps = 32.0 * f64::EPSILON;
1623        assert_abs_diff_eq!(lr, wald, epsilon = eps);
1624        assert_abs_diff_eq!(lr, score, epsilon = eps);
1625        // The score vector should be zero at the minimum
1626        assert_abs_diff_eq!(fit.score(fit.result.clone()), array![0., 0.], epsilon = eps,);
1627        Ok(())
1628    }
1629
1630    // The score should be zero at the MLE even with L2 regularization and internal standardization.
1631    #[test]
1632    fn score_zero_at_mle_regularized() -> Result<()> {
1633        let data_y = array![-0.3, -0.1, 0.0, 0.2, 0.4, 0.5, 0.8, 0.8, 1.1];
1634        let data_x = array![-0.5, -0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.3].insert_axis(Axis(1));
1635        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1636        let fit = model.fit_options().l2_reg(0.1).fit()?;
1637        let eps = 1e-8;
1638        assert_abs_diff_eq!(fit.score(fit.result.clone()), array![0., 0.], epsilon = eps);
1639        // Also check with standardization disabled
1640        let model_nostd = ModelBuilder::<Linear>::data(&data_y, &data_x)
1641            .no_standardize()
1642            .build()?;
1643        let fit_nostd = model_nostd.fit_options().l2_reg(0.1).fit()?;
1644        assert_abs_diff_eq!(
1645            fit_nostd.score(fit_nostd.result.clone()),
1646            array![0., 0.],
1647            epsilon = eps
1648        );
1649        // NOTE: Without regularization, the results themselves will not be exactly identical
1650        // through standardization.
1651        Ok(())
1652    }
1653
1654    #[cfg(feature = "stats")]
1655    #[test]
1656    fn linear_quantile_deviance_equivalence() -> Result<()> {
1657        let data_y = array![-0.5, 0.2, -0.2, 1.2, 0.4, 0.6];
1658        let data_x = array![
1659            [-0.3, 1.0],
1660            [0.2, 0.4],
1661            [-0.5, -0.1],
1662            [-0.2, 0.8],
1663            [0.6, 0.1],
1664            [0.1, -0.4]
1665        ];
1666        let var_weights = array![1.0, 0.2, 2.0, 1.5, 0.7, 1.2];
1667        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
1668            .var_weights(var_weights.clone())
1669            .build()?;
1670        let fit = model.fit_options().l2_reg(0.25).fit()?;
1671        let phi = fit.dispersion();
1672        let std_devs = var_weights.mapv_into(|w| num_traits::Float::sqrt(phi / w));
1673        let scaled_resid = fit.resid_resp() / std_devs;
1674        let res_qua = fit.resid_quantile();
1675        assert_abs_diff_eq!(scaled_resid, res_qua, epsilon = 0.01 * f32::EPSILON as f64);
1676
1677        Ok(())
1678    }
1679
    /// Test that `pvalue_exact` computes a valid intercept p-value by comparing against R's
    /// `anova(glm(y ~ x - 1), glm(y ~ x), test=...)`.
    ///
    /// R reference (linear):
    /// ```r
    /// y  <- c(0.3, 1.5, 0.8, 2.1, 1.7, 3.2, 2.5, 0.9)
    /// x1 <- c(0.1, 0.5, 0.2, 0.8, 0.6, 1.1, 0.9, 0.3)
    /// x2 <- c(0.4, 0.1, 0.3, 0.7, 0.2, 0.5, 0.8, 0.6)
    /// anova(glm(y ~ x1 + x2 - 1), glm(y ~ x1 + x2), test = "F")
    /// # Pr(>F) = 0.1203156
    /// ```
    ///
    /// R reference (logistic):
    /// ```r
    /// y <- c(1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1)
    /// x <- c(0.5, -0.3, 0.2, 0.8, 0.1, 0.6, -0.1, -0.4, 0.3, 0.4, 0.7, -0.2)
    /// anova(glm(y ~ x - 1, family=binomial()), glm(y ~ x, family=binomial()), test = "Chisq")
    /// # Pr(>Chi) = 0.6042003
    /// ```
    ///
    /// NOTE: This test was generated by claude code, hence the ugly hard-coded values.
    #[cfg(feature = "stats")]
    #[test]
    fn pvalue_exact_intercept() -> Result<()> {
        use crate::Logistic;

        // Linear: intercept F-test should match R's anova() result.
        // For Gaussian the exact (F-test) intercept p-value equals the Wald t-test p-value.
        let y = array![0.3_f64, 1.5, 0.8, 2.1, 1.7, 3.2, 2.5, 0.9];
        let x = array![
            [0.1_f64, 0.4],
            [0.5, 0.1],
            [0.2, 0.3],
            [0.8, 0.7],
            [0.6, 0.2],
            [1.1, 0.5],
            [0.9, 0.8],
            [0.3, 0.6]
        ];
        let model = ModelBuilder::<Linear>::data(&y, &x).build()?;
        let fit = model.fit()?;
        let exact_p = fit.pvalue_exact()?;
        // Sanity check before comparing to the reference value.
        assert!(
            exact_p[0].is_finite() && exact_p[0] >= 0.0 && exact_p[0] <= 1.0,
            "intercept p-value must be in [0, 1]"
        );
        // R: anova(glm(y ~ x1+x2-1), glm(y ~ x1+x2), test="F") Pr(>F) = 0.1203156
        assert_abs_diff_eq!(exact_p[0], 0.1203156, epsilon = 1e-5);

        // Logistic: intercept chi-squared test.
        let y_bin = array![
            true, false, true, true, false, true, false, false, true, false, true, true
        ];
        let x_bin = array![
            0.5_f64, -0.3, 0.2, 0.8, 0.1, 0.6, -0.1, -0.4, 0.3, 0.4, 0.7, -0.2
        ]
        .insert_axis(Axis(1));
        let model_bin = ModelBuilder::<Logistic>::data(&y_bin, &x_bin).build()?;
        let fit_bin = model_bin.fit()?;
        let exact_p_bin = fit_bin.pvalue_exact()?;
        // R: anova(..., test="Chisq") Pr(>Chi) = 0.6042003
        assert_abs_diff_eq!(exact_p_bin[0], 0.6042003, epsilon = 1e-4);

        Ok(())
    }
1745
1746    /// Quantile residuals for Exponential regression are Φ⁻¹(F_Exp(rate=1/μ̂)(y)).
1747    ///
1748    /// Uses the same two-group dataset as the `exp_ex` unit test in `response/exponential.rs`,
1749    /// where the exact MLE is known: group means ȳ₀=2, ȳ₁=4, so μ̂ = [2,2,4,4,4].
1750    /// The expected quantile residuals are computed independently here using the Exp CDF.
1751    #[cfg(feature = "stats")]
1752    #[test]
1753    fn exponential_quantile_residuals() -> Result<()> {
1754        use crate::Exponential;
1755        use statrs::distribution::{ContinuousCDF, Exp, Normal};
1756
1757        let data_y = array![1.0_f64, 3.0, 2.0, 4.0, 6.0];
1758        let data_x = array![[0.0_f64], [0.], [1.0], [1.0], [1.0]];
1759        let model = ModelBuilder::<Exponential>::data(&data_y, &data_x).build()?;
1760        let fit = model.fit()?;
1761
1762        // Exact MLE: μ̂ = [2, 2, 4, 4, 4]. Compute expected q_i = Φ⁻¹(F_Exp(rate=1/μ)(y)).
1763        let mu_hat = array![2.0_f64, 2.0, 4.0, 4.0, 4.0];
1764        let std_norm = Normal::standard();
1765        let expected = Array1::from_iter(data_y.iter().zip(mu_hat.iter()).map(|(&y, &mu)| {
1766            let dist = Exp::new(1.0 / mu).unwrap();
1767            std_norm.inverse_cdf(dist.cdf(y))
1768        }));
1769
1770        assert_abs_diff_eq!(fit.resid_quantile(), expected, epsilon = 1e-10);
1771        Ok(())
1772    }
1773
1774    /// Quantile residuals for Gamma regression are Φ⁻¹(F_Gamma(α=1/φ̂, β=α/μ̂)(y)).
1775    ///
1776    /// Uses the same two-group dataset as the `gamma_ex` unit test in `response/gamma.rs`,
1777    /// where the exact MLE is known: group means ȳ₀=2, ȳ₁=4, so μ̂ = [2,2,4,4,4].
1778    /// Expected quantile residuals are independently computed here using the Gamma CDF with
1779    /// the dispersion estimate from the fit.
1780    #[cfg(feature = "stats")]
1781    #[test]
1782    fn gamma_quantile_residuals() -> Result<()> {
1783        use crate::Gamma;
1784        use statrs::distribution::{ContinuousCDF, Gamma as GammaDist, Normal};
1785
1786        let data_y = array![1.0_f64, 3.0, 2.0, 4.0, 6.0];
1787        let data_x = array![[0.0_f64], [0.], [1.0], [1.0], [1.0]];
1788        let model = ModelBuilder::<Gamma>::data(&data_y, &data_x).build()?;
1789        let fit = model.fit()?;
1790
1791        // Exact MLE: μ̂ = [2, 2, 4, 4, 4]. Compute expected q_i = Φ⁻¹(F_Gamma(α, β)(y))
1792        // where α = 1/φ̂ and β = α/μ̂ (rate parameterization: mean = α/β = μ).
1793        let phi: f64 = fit.dispersion();
1794        let alpha = 1.0 / phi;
1795        let mu_hat = array![2.0_f64, 2.0, 4.0, 4.0, 4.0];
1796        let std_norm = Normal::standard();
1797        let expected = Array1::from_iter(data_y.iter().zip(mu_hat.iter()).map(|(&y, &mu)| {
1798            let dist = GammaDist::new(alpha, alpha / mu).unwrap();
1799            std_norm.inverse_cdf(dist.cdf(y))
1800        }));
1801
1802        assert_abs_diff_eq!(fit.resid_quantile(), expected, epsilon = 1e-10);
1803        Ok(())
1804    }
1805}