Skip to main content

ndarray_glm/
fit.rs

1//! Stores the fit results of the IRLS regression and provides functions that
2//! depend on the MLE estimate. These include statistical tests for goodness-of-fit.
3
4pub mod options;
5#[cfg(feature = "stats")]
6use crate::response::Response;
7use crate::{
8    Linear,
9    data::{Dataset, one_pad},
10    error::RegressionResult,
11    glm::{DispersionType, Glm},
12    irls::{Irls, IrlsStep},
13    link::{Link, Transform},
14    model::Model,
15    num::Float,
16    regularization::IrlsReg,
17};
18use ndarray::{Array1, Array2, ArrayBase, ArrayView1, Axis, Data, Ix2, array, s};
19use ndarray_linalg::InverseInto;
20use once_cell::unsync::OnceCell; // can be replaced by std::cell::OnceCell upon stabilization
21use options::FitOptions;
22#[cfg(feature = "stats")]
23use statrs::distribution::ContinuousCDF;
24use std::marker::PhantomData;
25
/// The result of a successful GLM fit.
///
/// Holds the maximum-likelihood coefficients along with the data, options, and regularizer used
/// in the regression, and lazily caches derived quantities (dispersion, residuals, covariance,
/// hat matrix, null model) on first access.
pub struct Fit<'a, M, F>
where
    M: Glm,
    F: Float,
{
    // Zero-sized marker tying this fit to the GLM family `M`.
    model: PhantomData<M>,
    /// The data and model specification used in the fit.
    data: &'a Dataset<F>,
    /// The parameter values that maximize the likelihood as given by the IRLS regression. If the
    /// dataset was internally standardized, this is transformed back to the external scale.
    pub result: Array1<F>,
    /// The parameter values used internally for the standardized data.
    result_std: Array1<F>,
    /// The predicted y-values for the training data.
    y_hat: Array1<F>,
    /// The options used for this fit.
    pub options: FitOptions<F>,
    /// The value of the likelihood function for the fit result.
    pub model_like: F,
    /// The regularizer of the fit.
    reg: Box<dyn IrlsReg<F>>,
    /// The number of overall iterations taken in the IRLS.
    pub n_iter: usize,
    /// The history of guesses and likelihoods over the IRLS iterations.
    pub history: Vec<IrlsStep<F>>,
    /// The number of parameters.
    n_par: usize,
    // The remaining fields hold lazily-computed cached results and matrices.
    /// The estimated dispersion parameter, which is used in many places. For some families this
    /// is identically one.
    phi: OnceCell<F>,
    /// The Pearson residuals, a common statistic that is re-used in several other quantities.
    resid_pear: OnceCell<Array1<F>>,
    /// The covariance matrix, using the sandwich approach.
    covariance: OnceCell<Array2<F>>,
    /// The inverse of the regularized Fisher information matrix in the external non-standardized
    /// parameter basis. Used as a component of the sandwich covariance and for influence
    /// calculations. Cached on first access.
    fisher_inv: OnceCell<Array2<F>>,
    /// The inverse of the fisher matrix in standardized internal parameters.
    fisher_std_inv: OnceCell<Array2<F>>,
    /// The hat matrix of the data and fit. Since the calculation requires a matrix inversion of
    /// the fisher information, it is computed only when needed and the value is cached. Access
    /// through the `hat()` function.
    hat: OnceCell<Array2<F>>,
    /// The likelihood and parameters for the null model.
    null_model: OnceCell<(F, Array1<F>)>,
}
75
76impl<'a, M, F> Fit<'a, M, F>
77where
78    M: Glm,
79    F: 'static + Float,
80{
81    /// Returns the Akaike information criterion for the model fit.
82    ///
83    /// ```math
84    /// \text{AIC} = D + 2K - 2\sum_{i} \ln w_{i}
85    /// ```
86    ///
87    /// where $`D`$ is the deviance, $`K`$ is the number of parameters, and $`w_i`$ are the
88    /// variance weights.
89    /// This is unique only to an additive constant, so only differences in AIC are meaningful.
90    pub fn aic(&self) -> F {
91        let log_weights = self.data.get_variance_weights().mapv(num_traits::Float::ln);
92        let sum_log_weights = self.data.freq_sum(&log_weights);
93        // NOTE: This is now the unregularized deviance.
94        self.deviance() + F::two() * self.rank() - F::two() * sum_log_weights
95    }
96
97    /// Returns the Bayesian information criterion for the model fit.
98    ///
99    /// ```math
100    /// \text{BIC} = K \ln(n) - 2l - 2\sum_{i} \ln w_{i}
101    /// ```
102    ///
103    /// where $`K`$ is the number of parameters, $`n`$ is the number of observations, and $`l`$ is
104    /// the log-likelihood (including the variance weight normalization terms).
105    // TODO: Also consider the effect of regularization on this statistic.
106    // TODO: Wikipedia suggests that the variance should included in the number
107    // of parameters for multiple linear regression. Should an additional
108    // parameter be included for the dispersion parameter? This question does
109    // not affect the difference between two models fit with the methodology in
110    // this package.
111    pub fn bic(&self) -> F {
112        let log_weights = self.data.get_variance_weights().mapv(num_traits::Float::ln);
113        let sum_log_weights = self.data.freq_sum(&log_weights);
114        let logn = num_traits::Float::ln(self.data.n_obs());
115        logn * self.rank() - F::two() * self.model_like - F::two() * sum_log_weights
116    }
117
118    /// The Cook's distance for each observation, which measures how much the predicted values
119    /// change when leaving out each observation.
120    ///
121    /// ```math
122    /// C_i = \frac{r_i^2 \, h_i}{K \, \hat\phi \, (1 - h_i)^2}
123    /// ```
124    ///
125    /// where $`r_i`$ is the Pearson residual, $`h_i`$ is the leverage, $`K`$ is the rank
126    /// (number of parameters), and $`\hat\phi`$ is the estimated dispersion.
127    pub fn cooks(&self) -> RegressionResult<Array1<F>, F> {
128        let hat = self.leverage()?;
129        let pear_sq = self.resid_pear().mapv(|r| r * r);
130        let h_terms: Array1<F> = hat.mapv_into(|h| {
131            let omh = F::one() - h;
132            h / (omh * omh)
133        });
134        let denom: F = self.rank() * self.dispersion();
135        Ok(pear_sq * h_terms / denom)
136    }
137
138    /// The covariance matrix of the parameter estimates. When no regularization is used, this is:
139    ///
140    /// ```math
141    /// \text{Cov}[\hat{\boldsymbol\beta}] = \hat\phi \, (\mathbf{X}^\mathsf{T}\mathbf{WSX})^{-1}
142    /// ```
143    ///
144    /// When regularization is active, the sandwich form is used to correctly account for the bias
145    /// introduced by the penalty:
146    ///
147    /// ```math
148    /// \text{Cov}[\hat{\boldsymbol\beta}] = \hat\phi \, \mathcal{I}_\text{reg}^{-1} \, \mathcal{I}_\text{data} \, \mathcal{I}_\text{reg}^{-1}
149    /// ```
150    ///
151    /// where $`\mathcal{I}_\text{reg}`$ is the regularized Fisher information and
152    /// $`\mathcal{I}_\text{data}`$ is the unregularized (data-only) Fisher information.
153    /// When unregularized, $`\mathcal{I}_\text{reg} = \mathcal{I}_\text{data}`$ and this reduces
154    /// to the standard form. The result is cached on first access.
155    pub fn covariance(&self) -> RegressionResult<&Array2<F>, F> {
156        self.covariance.get_or_try_init(|| {
157            // The covariance must be multiplied by the dispersion parameter.
158            // For logistic/poisson regression, this is identically 1.
159            // For linear/gamma regression it is estimated from the data.
160            let phi: F = self.dispersion();
161            // NOTE: invh()/invh_into() are bugged and give incorrect values!
162            let f_reg_inv: Array2<F> = self.fisher_inv()?.to_owned();
163            // Use the sandwich form so that regularization doesn't artificially deflate uncertainty.
164            // When unregularized, F_reg = F_data and this reduces to F_data^{-1}.
165            // The unregularized fisher matrix is most easily acquired in terms of the standardized
166            // variables, so apply the external transformation to it.
167            let f_data = self
168                .data
169                .inverse_transform_fisher(self.fisher_data_std(&self.result_std));
170            Ok(f_reg_inv.dot(&f_data).dot(&f_reg_inv) * phi)
171        })
172    }
173
174    /// Returns the deviance of the fit:
175    ///
176    /// ```math
177    /// D = -2 \left[ l(\hat{\boldsymbol\beta}) - l_\text{sat} \right]
178    /// ```
179    ///
180    /// Asymptotically $`\chi^2`$-distributed with [`ndf()`](Self::ndf) degrees of freedom.
181    /// The unregularized likelihood is used.
182    pub fn deviance(&self) -> F {
183        let terms = self.deviance_terms();
184        self.data.freq_sum(&terms)
185    }
186
187    /// Returns the contribution to the deviance from each observation. The total deviance should
188    /// be the sum of all of these. Variance weights are already included, but not frequency
189    /// weights.
190    fn deviance_terms(&self) -> Array1<F> {
191        let ll_terms: Array1<F> = M::log_like_terms(self.data, &self.result_std);
192        let ll_sat: Array1<F> = self.data.y.mapv(M::log_like_sat);
193        let terms = (ll_sat - ll_terms) * F::two();
194        self.data.apply_var_weights(terms)
195    }
196
197    /// Returns the self-excluded deviance terms, i.e. the deviance of an observation as if the
198    /// model was fit without it. This is a one-step approximation.
199    fn deviance_terms_loo(&self) -> RegressionResult<Array1<F>, F> {
200        let dev_terms = self.deviance_terms();
201        let pear_sq = self.resid_pear().mapv(|r| r * r);
202        let hat_rat = self.leverage()?.mapv(|h| h / (F::one() - h));
203        let result = dev_terms + &hat_rat * (&hat_rat + F::two()) * pear_sq;
204        Ok(result)
205    }
206
207    /// The dispersion parameter $`\hat\phi`$ relating the variance to the variance function:
208    /// $`\text{Var}[y] = \phi \, V(\mu)`$.
209    ///
210    /// Identically one for logistic, binomial, and Poisson regression.
211    /// For families with a free dispersion (linear, gamma), estimated as:
212    ///
213    /// ```math
214    /// \hat\phi = \frac{D}{\left(1 - \frac{K}{n_\text{eff}}\right) \sum_i w_i}
215    /// ```
216    ///
217    /// which reduces to $`D / (N - K)`$ without variance weights.
218    pub fn dispersion(&self) -> F {
219        *self.phi.get_or_init(|| {
220            use DispersionType::*;
221            match M::DISPERSED {
222                FreeDispersion => {
223                    let dev = self.deviance();
224                    let p = self.rank();
225                    let n_eff = self.data.n_eff();
226                    let scaling = if p >= n_eff {
227                        // This is the overparameterized regime, which is checked directly instead of
228                        // allowing negative values. It's not clear what conditions result in this when
229                        // p < N.
230                        F::zero()
231                    } else {
232                        (F::one() - p / n_eff) * self.data.sum_weights()
233                    };
234                    dev / scaling
235                }
236                NoDispersion => F::one(),
237            }
238        })
239    }
240
    /// Return the dispersion terms with the observation(s) at each point excluded from the fit.
    /// This is a one-step approximation built from the full-fit deviance and leverages.
    fn dispersion_loo(&self) -> RegressionResult<Array1<F>, F> {
        use DispersionType::*;
        match M::DISPERSED {
            FreeDispersion => {
                let pear_sq = self.resid_pear().mapv(|r| r * r);
                let hat_rat = self.leverage()?.mapv(|h| h / (F::one() - h));
                let terms = self.deviance_terms() + hat_rat * pear_sq;
                // Don't apply total weights since the variance weights are already
                // included in the residual terms. However, we do need the frequency weights.
                let terms = self.data.apply_freq_weights(terms);
                // Leave-one-out deviance totals: D - D_i for each observation i.
                let total: Array1<F> = -terms + self.deviance();
                let scaled_total: Array1<F> = match &self.data.weights {
                    None => match &self.data.freqs {
                        // Excluding observation i removes f_i effective observations, so the
                        // residual degrees of freedom become ndf - f_i (= -(f - ndf)).
                        Some(f) => total / -(f - self.ndf()),
                        None => total / (self.ndf() - F::one()),
                    },
                    Some(w) => {
                        // v1 and v2 are the (frequency-weighted) sums of weights and squared
                        // weights over all observations.
                        let v1 = self.data.freq_sum(w);
                        let w2 = w * w;
                        let v2 = self.data.freq_sum(&w2);
                        // The subtracted out terms need the frequency terms as well
                        let f_w = self.data.apply_freq_weights(w.clone());
                        let f_w2 = self.data.apply_freq_weights(w2);
                        // the modified sums from leaving out the ith observation
                        let v1p = -f_w + v1;
                        let v2p = -f_w2 + v2;
                        let p = self.rank();
                        // Effective degrees of freedom: v1' - p * v2'/v1' per observation.
                        let scale = &v1p - v2p / &v1p * p;
                        total / scale
                    }
                };
                Ok(scaled_total)
            }
            // Families with fixed dispersion have phi = 1 for every observation.
            NoDispersion => Ok(Array1::<F>::ones(self.data.y.len())),
        }
    }
278
279    /// Returns the Fisher information (the negative Hessian of the log-likelihood) at the
280    /// parameter values given:
281    ///
282    /// ```math
283    /// \mathcal{I}(\boldsymbol\beta) = \mathbf{X}^\mathsf{T}\mathbf{W}\eta'^2\mathbf{S}\mathbf{X}
284    /// ```
285    ///
286    /// where $`\mathbf{S} = \text{diag}(V(\mu_i))`$ and $`\eta'`$ is the derivative of the natural
287    /// parameter in terms of the linear predictor ($`\eta(\omega) = g_0(g^{-1}(\omega))`$ where
288    /// $`g_0`$ is the canonical link function).
289    /// The regularization is included.
290    pub fn fisher(&self, params: &Array1<F>) -> Array2<F> {
291        // Note that fisher() is a public function so it should take the external parameters.
292        let params = self.data.transform_beta(params.clone());
293        // We actually need to futher apply the transformation beyond this, so that *arrays
294        // multiplying this resulting matrix* are hit the right way.
295        let fish_std = self.fisher_std(&params);
296        self.data.inverse_transform_fisher(fish_std)
297    }
298
299    /// The inverse of the (regularized) fisher information matrix, in the external parameter
300    /// basis. Used for the influence calculations, and as a component of the sandwich covariance.
301    /// Cached on first access.
302    fn fisher_inv(&self) -> RegressionResult<&Array2<F>, F> {
303        self.fisher_inv.get_or_try_init(|| {
304            let fisher_reg = self.fisher(&self.result);
305            // NOTE: invh/invh_into() are bugged and incorrect!
306            let fish_inv = fisher_reg.inv_into()?;
307            Ok(fish_inv)
308        })
309    }
310
311    /// Compute the data-only (unregularized) Fisher information in the internal standardized basis.
312    fn fisher_data_std(&self, params: &Array1<F>) -> Array2<F> {
313        let lin_pred: Array1<F> = self.data.linear_predictor_std(params);
314        let adj_var: Array1<F> = M::adjusted_variance_diag(&lin_pred);
315        (self.data.x_conj() * &adj_var).dot(&self.data.x)
316    }
317
318    /// Compute the fisher information in terms of the internal (likely standardized)
319    /// coefficients. The regularization is included here.
320    fn fisher_std(&self, params: &Array1<F>) -> Array2<F> {
321        let fisher = self.fisher_data_std(params);
322        self.reg.as_ref().irls_mat(fisher, params)
323    }
324
325    /// The inverse of the (regularized) fisher information matrix, in the internal standardized
326    /// parameter basis.
327    fn fisher_std_inv(&self) -> RegressionResult<&Array2<F>, F> {
328        self.fisher_std_inv.get_or_try_init(|| {
329            let fisher_reg = self.fisher_std(&self.result_std);
330            // NOTE: invh/invh_into() are bugged and incorrect!
331            let fish_inv = fisher_reg.inv_into()?;
332            Ok(fish_inv)
333        })
334    }
335
    /// Returns the hat matrix, also known as the "projection" or "influence" matrix:
    ///
    /// ```math
    /// P_{ij} = \frac{\partial \hat{y}_i}{\partial y_j}
    /// ```
    ///
    /// Orthogonal to the response residuals at the fit result: $`\mathbf{P}(\mathbf{y} -
    /// \hat{\mathbf{y}}) = 0`$.
    /// This version is not symmetric, but the diagonal is invariant to this choice of convention.
    /// Cached on first access.
    pub fn hat(&self) -> RegressionResult<&Array2<F>, F> {
        self.hat.get_or_try_init(|| {
            // Do the full computation in terms of the internal parameters, since this observable
            // is not sensitive to the choice of basis.
            let lin_pred = self.data.linear_predictor_std(&self.result_std);
            // Apply the eta' terms manually instead of calling adjusted_variance_diag(): that
            // method applies 2 powers to the variance, while here we want one power on the
            // variance side and one on the weights side.
            let mu = M::mean(&lin_pred);
            let var = mu.mapv_into(M::variance);
            let eta_d = M::Link::d_nat_param(&lin_pred);

            let fisher_inv = self.fisher_std_inv()?;

            // The GLM variance and the data weights are put on different sides in this
            // convention, which is why the result is not symmetric.
            let left = (var * &eta_d).insert_axis(Axis(1)) * &self.data.x;
            let right = self.data.x_conj() * &eta_d;
            Ok(left.dot(&fisher_inv.dot(&right)))
        })
    }
367
368    /// The one-step approximation to the change in coefficients from excluding each observation:
369    ///
370    /// ```math
371    /// \Delta\boldsymbol\beta^{(-i)} \approx \frac{1}{1-h_i}\,\mathcal{I}^{-1}\mathbf{x}^{(i)} w_i \eta'_i e_i
372    /// ```
373    ///
374    /// Each row $`i`$ should be subtracted from $`\hat{\boldsymbol\beta}`$ to approximate
375    /// the coefficients that would result from excluding observation $`i`$.
376    /// Exact for linear models; a one-step approximation for nonlinear models.
377    pub fn infl_coef(&self) -> RegressionResult<Array2<F>, F> {
378        // The linear predictor can be acquired in terms of the standardized parameters, but the
379        // rest of the computation should use external.
380        let lin_pred = self.data.linear_predictor_std(&self.result_std);
381        let resid_resp = self.resid_resp();
382        let omh = -self.leverage()? + F::one();
383        let resid_adj = M::Link::adjust_errors(resid_resp, &lin_pred) / omh;
384        let xte = self.data.x_conj_ext() * resid_adj;
385        let fisher_inv = self.fisher_inv()?;
386        let delta_b = xte.t().dot(fisher_inv);
387        Ok(delta_b)
388    }
389
390    /// Returns the leverage $`h_i = P_{ii}`$ for each observation: the diagonal of the hat matrix.
391    /// Indicates the sensitivity of each prediction to its corresponding observation.
392    pub fn leverage(&self) -> RegressionResult<Array1<F>, F> {
393        let hat = self.hat()?;
394        Ok(hat.diag().to_owned())
395    }
396
    /// Returns exact coefficients from leaving each observation out, one-at-a-time.
    /// This is a much more expensive operation than the original regression because a new one is
    /// performed for each observation. Each row `i` of the result holds the coefficients of the
    /// fit excluding observation `i`, on the external scale.
    pub fn loo_exact(&self) -> RegressionResult<Array2<F>, F> {
        // Use the one-step approximation to get good starting points for each exclusion.
        let loo_coef: Array2<F> = self.infl_coef()?;
        // NOTE: These coefficients need to be in terms of external parameters
        let loo_initial = &self.result - loo_coef;
        let mut loo_result = loo_initial.clone();
        let n_obs = self.data.y.len();
        for i in 0..n_obs {
            // Build a copy of the dataset with observation i effectively removed.
            let data_i: Dataset<F> = {
                // Get the proper X data matrix as it existed before standardization and
                // intercept-padding.
                let x_i = self.data.x_orig();
                // Leave the observation out by setting the frequency weight to zero.
                let mut freqs_i: Array1<F> =
                    self.data.freqs.clone().unwrap_or(Array1::<F>::ones(n_obs));
                freqs_i[i] = F::zero();
                let mut data_i = Dataset {
                    y: self.data.y.clone(),
                    x: x_i,
                    linear_offset: self.data.linear_offset.clone(),
                    weights: self.data.weights.clone(),
                    freqs: Some(freqs_i),
                    // These fields must be set this way, as they are in the ModelBuilder, before
                    // finalize_design_matrix() is called.
                    has_intercept: false,
                    standardizer: None,
                };
                // Re-apply the same standardization/intercept treatment as the original data.
                data_i.finalize_design_matrix(
                    self.data.standardizer.is_some(),
                    self.data.has_intercept,
                );
                data_i
            };
            let model_i = Model {
                model: PhantomData::<M>,
                data: data_i,
            };
            let options = {
                let mut options = self.options.clone();
                // The one-step approximation should be a good starting point.
                // Use the external result as it should be re-standardized to the new dataset
                // internally before being passed to IRLS.
                options.init_guess = Some(loo_initial.row(i).to_owned());
                options
            };
            let fit_i = model_i.with_options(options).fit()?;
            // The internal re-fit transforms back to the external scale.
            loo_result.row_mut(i).assign(&fit_i.result);
        }
        Ok(loo_result)
    }
451
452    /// Perform a likelihood-ratio test, returning the statistic:
453    ///
454    /// ```math
455    /// \Lambda = -2 \ln \frac{L_0}{L} = -2(l_0 - l)
456    /// ```
457    ///
458    /// where $`L_0`$ is the null model likelihood (intercept only) and $`L`$ is the fit likelihood.
459    /// By Wilks' theorem, asymptotically $`\chi^2`$-distributed with
460    /// [`test_ndf()`](Self::test_ndf) degrees of freedom.
461    // TODO: Should the effective number of degrees of freedom due to
462    // regularization be taken into account?
463    pub fn lr_test(&self) -> F {
464        // The model likelihood should include regularization terms and there
465        // shouldn't be any in the null model with all non-intercept parameters
466        // set to zero.
467        let null_like = self.null_like();
468        F::from(-2.).unwrap() * (null_like - self.model_like)
469    }
470
471    /// Perform a likelihood-ratio test against a general alternative model, not
472    /// necessarily a null model. The alternative model is regularized the same
473    /// way that the regression resulting in this fit was. The degrees of
474    /// freedom cannot be generally inferred.
475    pub fn lr_test_against(&self, alternative: &Array1<F>) -> F {
476        let alt_std = self.data.transform_beta(alternative.clone());
477        let alt_like = M::log_like(self.data, &alt_std);
478        let alt_like_reg = alt_like + self.reg.likelihood(&alt_std);
479        F::two() * (self.model_like - alt_like_reg)
480    }
481
482    /// Returns the residual degrees of freedom in the model, i.e. the number
483    /// of data points minus the number of parameters. Not to be confused with
484    /// `test_ndf()`, the degrees of freedom in the statistical tests of the
485    /// fit parameters.
486    pub fn ndf(&self) -> F {
487        self.data.n_obs() - self.rank()
488    }
489
490    pub(crate) fn new(data: &'a Dataset<F>, optimum: IrlsStep<F>, irls: Irls<M, F>) -> Self {
491        let IrlsStep {
492            guess: result_std,
493            like: model_like,
494        } = optimum;
495        let Irls {
496            options,
497            reg,
498            n_iter,
499            history,
500            ..
501        } = irls;
502        // Cache some of these variables that will be used often.
503        let n_par = result_std.len();
504        // NOTE: This necessarily uses the coefficients directly from the standardized data.
505        // Store these predictions as they are commonly used.
506        let y_hat = M::mean(&data.linear_predictor_std(&result_std));
507        // The public result must be transformed back to the external scale for compatability with
508        // the input data.
509        let result_ext = data.inverse_transform_beta(result_std.clone());
510        // The history also needs to be transformed back to the external scale, since it is public.
511        // It shouldn't be used directly by the Fit's methods.
512        let history = history
513            .into_iter()
514            .map(|IrlsStep { guess, like }| IrlsStep {
515                guess: data.inverse_transform_beta(guess),
516                like,
517            })
518            .collect();
519        Self {
520            model: PhantomData,
521            data,
522            result: result_ext,
523            result_std,
524            y_hat,
525            options,
526            model_like,
527            reg,
528            n_iter,
529            history,
530            n_par,
531            phi: OnceCell::new(),
532            resid_pear: OnceCell::new(),
533            covariance: OnceCell::new(),
534            fisher_inv: OnceCell::new(),
535            fisher_std_inv: OnceCell::new(),
536            hat: OnceCell::new(),
537            null_model: OnceCell::new(),
538        }
539    }
540
541    /// Returns the likelihood given the null model, which fixes all parameters
542    /// to zero except the intercept (if it is used). A total of `test_ndf()`
543    /// parameters are constrained.
544    pub fn null_like(&self) -> F {
545        let (null_like, _) = self.null_model_fit();
546        *null_like
547    }
548
    /// Return the likelihood and null model parameters, which will be zero with the possible
    /// exception of the intercept term. Since this can require an additional regression, the
    /// values are cached.
    ///
    /// Three cases are handled: no linear offset (closed form from the mean of y), an offset
    /// with a free intercept (requires a one-parameter re-fit), and an offset with the
    /// intercept fixed to zero (closed form per observation).
    fn null_model_fit(&self) -> &(F, Array1<F>) {
        self.null_model
            .get_or_init(|| match &self.data.linear_offset {
                None => {
                    // If there is no linear offset, the natural parameter is
                    // identical for all observations so it is sufficient to
                    // calculate the null likelihood for a single point with y equal
                    // to the average.
                    // The average y
                    let y_bar: F = self.data.weighted_sum(&self.data.y) / self.data.sum_weights();
                    // This approach assumes that the likelihood is in the natural
                    // exponential form as calculated by Glm::log_like_natural(). If that
                    // function is overridden and the values differ significantly, this
                    // approach will give incorrect results. If the likelihood has terms
                    // non-linear in y, then the likelihood must be calculated for every
                    // point rather than averaged.
                    // If the intercept is allowed to maximize the likelihood, the natural
                    // parameter is equal to the link of the expectation. Otherwise it is
                    // the transformation function of zero.
                    let intercept: F = if self.data.has_intercept {
                        M::Link::func(y_bar)
                    } else {
                        F::zero()
                    };
                    // this is a length-one array. This works because the
                    // likelihood contribution is the same for all observations.
                    let nat_par = M::Link::nat_param(array![intercept]);
                    // The null likelihood per observation
                    let null_like_one: F = M::log_like_natural(y_bar, nat_par[0]);
                    // Multiply the per-observation likelihood by the total weight, since every
                    // term is the same.
                    let null_like_total = self.data.sum_weights() * null_like_one;
                    let null_params: Array1<F> = {
                        let mut par = Array1::<F>::zeros(self.n_par);
                        par[0] = intercept;
                        par
                    };
                    (null_like_total, null_params)
                }
                Some(off) => {
                    if self.data.has_intercept {
                        // If there are linear offsets and the intercept is allowed
                        // to be free, there is not a major simplification and the
                        // model needs to be re-fit.
                        // the X data is a single column of ones. Since this model
                        // isn't being created by the ModelBuilder, the X data
                        // has to be automatically padded with ones.
                        let data_x_null = Array2::<F>::ones((self.data.y.len(), 1));
                        let null_model = Model {
                            model: std::marker::PhantomData::<M>,
                            data: Dataset::<F> {
                                y: self.data.y.clone(),
                                x: data_x_null,
                                linear_offset: Some(off.clone()),
                                weights: self.data.weights.clone(),
                                freqs: self.data.freqs.clone(),
                                // If we are in this branch it is because an intercept is needed.
                                has_intercept: true,
                                // We don't use standardization for the null model.
                                standardizer: None,
                            },
                        };
                        // TODO: Make this function return an error, although it's
                        // difficult to imagine this case happening.
                        // TODO: Should the tolerance of this fit be stricter?
                        // The intercept should not be regularized
                        let null_fit = null_model
                            .fit_options()
                            // There shouldn't be too much trouble fitting this
                            // single-parameter fit, but there shouldn't be harm in
                            // using the same maximum as in the original model.
                            .max_iter(self.options.max_iter)
                            .fit()
                            .expect("Could not fit null model!");
                        let null_params: Array1<F> = {
                            let mut par = Array1::<F>::zeros(self.n_par);
                            // there is only one parameter in this fit. It should be the same as
                            // the external result since this model doesn't have standardization.
                            par[0] = null_fit.result_std[0];
                            par
                        };
                        (null_fit.model_like, null_params)
                    } else {
                        // If the intercept is fixed to zero, then no minimization is
                        // required. The natural parameters are directly known in terms
                        // of the linear offset. The likelihood must still be summed
                        // over all observations, since they have different offsets.
                        let nat_par = M::Link::nat_param(off.clone());
                        let null_like_terms = ndarray::Zip::from(&self.data.y)
                            .and(&nat_par)
                            .map_collect(|&y, &eta| M::log_like_natural(y, eta));
                        let null_like = self.data.weighted_sum(&null_like_terms);
                        let null_params = Array1::<F>::zeros(self.n_par);
                        (null_like, null_params)
                    }
                }
            })
    }
649
650    /// Returns $`\hat{\mathbf{y}} = g^{-1}(\mathbf{X}\hat{\boldsymbol\beta} + \boldsymbol\omega_0)`$
651    /// given input data $`\mathbf{X}`$ and an optional linear offset
652    /// $`\boldsymbol\omega_0`$. The data need not be the training data.
653    pub fn predict<S>(&self, data_x: &ArrayBase<S, Ix2>, lin_off: Option<&Array1<F>>) -> Array1<F>
654    where
655        S: Data<Elem = F>,
656    {
657        let lin_pred = if self.data.has_intercept {
658            one_pad(data_x.view())
659        } else {
660            data_x.to_owned()
661        }
662        .dot(&self.result);
663        let lin_pred: Array1<F> = if let Some(off) = &lin_off {
664            lin_pred + *off
665        } else {
666            lin_pred
667        };
668        M::mean(&lin_pred)
669    }
670
    /// Returns the rank $`K`$ of the model (i.e. the number of parameters)
    /// converted to the fit's floating-point type.
    fn rank(&self) -> F {
        // n_par is a small parameter count, so the usize -> F conversion should never fail.
        F::from(self.n_par).unwrap()
    }
675
676    /// Return the deviance residuals for each point in the training data:
677    ///
678    /// ```math
679    /// d_i = \text{sign}(y_i - \hat\mu_i)\sqrt{D_i}
680    /// ```
681    ///
682    /// where $`D_i`$ is the per-observation deviance contribution.
683    /// This is usually a better choice than Pearson residuals for non-linear models.
684    pub fn resid_dev(&self) -> Array1<F> {
685        let signs = self.resid_resp().mapv_into(F::signum);
686        let ll_diff = self.deviance_terms();
687        let dev: Array1<F> = ll_diff.mapv_into(num_traits::Float::sqrt);
688        signs * dev
689    }
690
691    /// Return the standardized deviance residuals (internally studentized):
692    ///
693    /// ```math
694    /// d_i^* = \frac{d_i}{\sqrt{\hat\phi(1 - h_i)}}
695    /// ```
696    ///
697    /// where $`d_i`$ is the deviance residual, $`\hat\phi`$ is the dispersion, and $`h_i`$ is
698    /// the leverage. Generally applicable for outlier detection.
699    pub fn resid_dev_std(&self) -> RegressionResult<Array1<F>, F> {
700        let dev = self.resid_dev();
701        let phi = self.dispersion();
702        let hat: Array1<F> = self.leverage()?;
703        let omh: Array1<F> = -hat + F::one();
704        let denom: Array1<F> = (omh * phi).mapv_into(num_traits::Float::sqrt);
705        Ok(dev / denom)
706    }
707
    /// Return the partial residuals as an n_obs × n_predictors matrix. Each column $`j`$
    /// contains working residuals plus the centered contribution of predictor $`j`$:
    ///
    /// ```math
    /// r^{(j)}_i = r^w_i + (x_{ij} - \bar x_j) \beta_j
    /// ```
    ///
    /// The bar denotes the **fully weighted** column mean (combining both variance and frequency
    /// weights), which is the WLS-consistent choice: it ensures that $`\sum_i w_i r^{(j)}_i = 0`$
    /// for each predictor. R's `residuals(model, type = "partial")` uses only frequency weights in
    /// its centering (excluding variance weights), so results will differ when variance weights are
    /// present. For models without an intercept, no centering is applied. The intercept term is
    /// always excluded from the output.
    pub fn resid_part(&self) -> Array2<F> {
        // Working residuals reshaped to an (n, 1) column so they broadcast across predictors.
        let resid_work_i = self.resid_work().insert_axis(Axis(1));
        let n = resid_work_i.len();

        // Use external (unstandardized) feature columns and coefficients.
        let x_orig = self.data.x_orig();
        // The intercept term is not part of the partial residuals.
        // beta_j becomes a (1, p) row to broadcast against the (n, p) design matrix.
        let beta_j = if self.data.has_intercept {
            self.result.slice(s![1..]).to_owned()
        } else {
            self.result.clone()
        }
        .insert_axis(Axis(0));

        // Center by fully-weighted (variance × frequency) column means. This is the WLS-consistent
        // choice: sum_i w_i r^(j)_i = 0 for each predictor. R excludes variance weights from
        // centering; results will differ from R when variance weights are present with an intercept.
        // No centering for no-intercept models.
        let x_centered: Array2<F> = if self.data.has_intercept {
            // Total weight per observation, obtained by applying all weights to a vector of ones.
            let wt_factors = self
                .data
                .apply_total_weights(Array1::ones(n))
                .insert_axis(Axis(1));
            let x_mean: Array1<F> = (&x_orig * &wt_factors).sum_axis(Axis(0)) / wt_factors.sum();
            x_orig - x_mean.insert_axis(Axis(0))
        } else {
            x_orig
        };
        // (n, 1) column + elementwise (n, p) * (1, p) broadcasts to the (n, p) result.
        resid_work_i + x_centered * beta_j
    }
751
752    /// Return the Pearson residuals for each point in the training data:
753    ///
754    /// ```math
755    /// r_i = \sqrt{w_i} \, \frac{y_i - \hat\mu_i}{\sqrt{V(\hat\mu_i)}}
756    /// ```
757    ///
758    /// where $`V`$ is the variance function and $`w_i`$ are the variance weights.
759    /// Not scaled by the dispersion for families with a free dispersion parameter.
760    pub fn resid_pear(&self) -> &Array1<F> {
761        self.resid_pear.get_or_init(|| {
762            let residuals = self.resid_resp();
763            let inv_var_diag: Array1<F> = self
764                .y_hat
765                .clone()
766                .mapv_into(M::variance)
767                .mapv_into(F::recip);
768            // the variance weights are the reciprocal of the corresponding variance
769            let scales = self
770                .data
771                .apply_var_weights(inv_var_diag)
772                .mapv_into(num_traits::Float::sqrt);
773            scales * residuals
774        })
775    }
776
777    /// Return the standardized Pearson residuals (internally studentized):
778    ///
779    /// ```math
780    /// r_i^* = \frac{r_i}{\sqrt{\hat\phi(1 - h_i)}}
781    /// ```
782    ///
783    /// where $`r_i`$ is the Pearson residual and $`h_i`$ is the leverage. These are expected
784    /// to have unit variance.
785    pub fn resid_pear_std(&self) -> RegressionResult<Array1<F>, F> {
786        let pearson = self.resid_pear();
787        let phi = self.dispersion();
788        let hat = self.leverage()?;
789        let omh = -hat + F::one();
790        let denom: Array1<F> = (omh * phi).mapv_into(num_traits::Float::sqrt);
791        Ok(pearson / denom)
792    }
793
    /// Return the response residuals: $`e_i^\text{resp} = y_i - \hat\mu_i`$.
    pub fn resid_resp(&self) -> Array1<F> {
        // Raw difference between the observed responses and the fitted means.
        &self.data.y - &self.y_hat
    }
798
799    /// Return the externally studentized residuals:
800    ///
801    /// ```math
802    /// \tilde{t}_i = \text{sign}(e_i) \sqrt{\frac{D_i^{(-i)}}{\hat\phi^{(-i)}}}
803    /// ```
804    ///
805    /// where $`D_i^{(-i)}`$ and $`\hat\phi^{(-i)}`$ are the LOO deviance and dispersion
806    /// approximated via one-step deletion. Under normality, $`t`$-distributed with
807    /// $`N - K - 1`$ degrees of freedom. This is a robust and general method for outlier
808    /// detection.
809    pub fn resid_student(&self) -> RegressionResult<Array1<F>, F> {
810        let signs = self.resid_resp().mapv(F::signum);
811        let dev_terms_loo: Array1<F> = self.deviance_terms_loo()?;
812        // NOTE: This match could also be handled internally in dispersion_loo()
813        let dev_terms_scaled = match M::DISPERSED {
814            // The dispersion is corrected for the contribution from each current point.
815            // This is an approximation; the exact solution would perform a fit at each point.
816            DispersionType::FreeDispersion => dev_terms_loo / self.dispersion_loo()?,
817            DispersionType::NoDispersion => dev_terms_loo,
818        };
819        Ok(signs * dev_terms_scaled.mapv_into(num_traits::Float::sqrt))
820    }
821
822    /// Returns the working residuals:
823    ///
824    /// ```math
825    /// e_i^\text{work} = g'(\hat\mu_i)\,(y_i - \hat\mu_i) = \frac{y_i - \hat\mu_i}{\eta'(\omega_i)\,V(\hat\mu_i)}
826    /// ```
827    ///
828    /// where $`g'(\mu)`$ is the derivative of the link function and $`\eta'(\omega)`$ is the
829    /// derivative of the natural parameter with respect to the linear predictor. For canonical
830    /// links $`\eta'(\omega) = 1`$, reducing this to $`(y_i - \hat\mu_i)/V(\hat\mu_i)`$.
831    ///
832    /// These can be interpreted as the residual differences mapped into the linear predictor space
833    /// of $`\omega = \mathbf{x}\cdot\boldsymbol{\beta}`$.
834    pub fn resid_work(&self) -> Array1<F> {
835        let lin_pred: Array1<F> = self.data.linear_predictor_std(&self.result_std);
836        let mu: Array1<F> = self.y_hat.clone();
837        let resid_response: Array1<F> = &self.data.y - &mu;
838        let var: Array1<F> = mu.mapv(M::variance);
839        // adjust for non-canonical link functions; we want a total factor of 1/eta'
840        let (adj_response, adj_var) =
841            M::Link::adjust_errors_variance(resid_response, var, &lin_pred);
842        adj_response / adj_var
843    }
844
845    /// Returns the score function $`\nabla_{\boldsymbol\beta} l`$ (the gradient of the
846    /// regularized log-likelihood) at the parameter values given. Should be zero at the MLE.
847    /// The input and output are in the external (unstandardized) parameter space.
848    pub fn score(&self, params: Array1<F>) -> Array1<F> {
849        // Compute in the internal (standardized) basis so the regularization gradient is applied
850        // to the correct parameters, then transform the result back to external coordinates via
851        // score_ext = J^T score_int where J = d(beta_std)/d(beta_ext).
852        let params_std = self.data.transform_beta(params);
853        let lin_pred: Array1<F> = self.data.linear_predictor_std(&params_std);
854        let mu: Array1<F> = M::mean(&lin_pred);
855        let resid_response = &self.data.y - mu;
856        let resid_working = M::Link::adjust_errors(resid_response, &lin_pred);
857        let score_int = self.data.x_conj().dot(&resid_working);
858        let score_int_reg = self.reg.as_ref().gradient(score_int, &params_std);
859        self.data.inverse_transform_score(score_int_reg)
860    }
861
    /// Returns the score test statistic:
    ///
    /// ```math
    /// S = \mathbf{J}(\boldsymbol\beta_0)^\mathsf{T} \, \mathcal{I}(\boldsymbol\beta_0)^{-1} \, \mathbf{J}(\boldsymbol\beta_0)
    /// ```
    ///
    /// where $`\mathbf{J}`$ is the score and $`\mathcal{I}`$ is the Fisher information, both
    /// evaluated at the null parameters. Asymptotically $`\chi^2`$-distributed with
    /// [`test_ndf()`](Self::test_ndf) degrees of freedom.
    pub fn score_test(&self) -> RegressionResult<F, F> {
        // null_model_fit() returns a cached reference, so the clone here is required to pass
        // an owned array to score_test_against().
        let (_, null_params) = self.null_model_fit();
        self.score_test_against(null_params.clone())
    }
875
    /// Returns the score test statistic compared to another set of model
    /// parameters, not necessarily a null model. The degrees of freedom cannot
    /// be generally inferred.
    pub fn score_test_against(&self, alternative: Array1<F>) -> RegressionResult<F, F> {
        let fisher_alt = self.fisher(&alternative);
        let score_alt = self.score(alternative);
        // This is not the same as the cached covariance matrix since it is
        // evaluated at the alternative parameters rather than at the MLE.
        // NOTE: invh/invh_into() are bugged and incorrect!
        let inv_fisher_alt = fisher_alt.inv_into()?;
        Ok(score_alt.t().dot(&inv_fisher_alt.dot(&score_alt)))
    }
888
889    /// The degrees of freedom for the likelihood ratio test, the score test,
890    /// and the Wald test. Not to be confused with `ndf()`, the degrees of
891    /// freedom in the model fit.
892    pub fn test_ndf(&self) -> usize {
893        if self.data.has_intercept {
894            self.n_par - 1
895        } else {
896            self.n_par
897        }
898    }
899
900    /// Returns the Wald test statistic:
901    ///
902    /// ```math
903    /// W = (\hat{\boldsymbol\beta} - \boldsymbol\beta_0)^\mathsf{T} \, \mathcal{I}(\boldsymbol\beta_0) \, (\hat{\boldsymbol\beta} - \boldsymbol\beta_0)
904    /// ```
905    ///
906    /// Compared to a null model with only an intercept (if one is used). Asymptotically
907    /// $`\chi^2`$-distributed with [`test_ndf()`](Self::test_ndf) degrees of freedom.
908    pub fn wald_test(&self) -> F {
909        // The null parameters are all zero except for a possible intercept term
910        // which optimizes the null model.
911        let (_, null_params) = self.null_model_fit();
912        // NOTE: The null model is agnostic to standardization, since it discards all features
913        // except perhaps the intercept.
914        self.wald_test_against(null_params)
915    }
916
917    /// Returns the Wald test statistic compared to another specified model fit
918    /// instead of the null model. The degrees of freedom cannot be generally
919    /// inferred.
920    pub fn wald_test_against(&self, alternative: &Array1<F>) -> F {
921        // This could be computed in either the internal or external basis. This implementation
922        // will use internal, so first we just need to transform the input.
923        let alt_std = self.data.transform_beta(alternative.clone());
924        let d_params_std = &self.result_std - &alt_std;
925        // Get the fisher matrix at the *other* result, since this is a test of this fit's
926        // parameters under a model given by the other.
927        let fisher_std: Array2<F> = self.fisher_std(&alt_std);
928        d_params_std.t().dot(&fisher_std.dot(&d_params_std))
929    }
930
931    /// Returns the per-parameter Wald $`z`$-statistic:
932    ///
933    /// ```math
934    /// z_k = \frac{\hat\beta_k}{\sqrt{\text{Cov}[\hat{\boldsymbol\beta}]_{kk}}}
935    /// ```
936    ///
937    /// Since it does not account for covariance between parameters it may not be accurate.
938    pub fn wald_z(&self) -> RegressionResult<Array1<F>, F> {
939        let par_cov = self.covariance()?;
940        let par_variances: ArrayView1<F> = par_cov.diag();
941        Ok(&self.result / &par_variances.mapv(num_traits::Float::sqrt))
942    }
943}
944
945/// Specialized functions for OLS.
946impl<'a, F> Fit<'a, Linear, F>
947where
948    F: 'static + Float,
949{
950    /// Returns the coefficient of determination $`R^2`$:
951    ///
952    /// ```math
953    /// R^2 = 1 - \frac{D}{D_0}
954    /// ```
955    ///
956    /// where $`D`$ is the deviance of the fitted model and $`D_0`$ is the null deviance
957    /// (deviance of the intercept-only model). For Gaussian with no weights this reduces to
958    /// $`1 - \text{RSS}/\text{TSS}`$. Using the deviance ratio correctly handles variance and
959    /// frequency weights.
960    // NOTE: that this implementation would be valid as a pseudo-$`R^2`$ for all GLMs, but it's
961    // mostly expected in the OLS context and probably wouldn't add much.
962    pub fn r_sq(&self) -> F {
963        let lr = self.lr_test();
964        lr / (lr + self.deviance())
965    }
966
967    /// Returns the weighted residual sum of squares (RSS):
968    /// $`\text{RSS} = \sum_i w_i (y_i - \hat\mu_i)^2`$
969    /// where $`w_i`$ combines variance and frequency weights.
970    // NOTE: this implementation would be well-defined for any GLM, but there are typically better
971    // tools for the job so it's not exposed for non-linear families.
972    pub fn rss(&self) -> F {
973        let resid_sq = self.resid_resp().mapv(|r| r * r);
974        self.data.apply_total_weights(resid_sq).sum()
975    }
976}
977
/// Methods that require the `stats` feature for distribution CDF evaluation.
#[cfg(feature = "stats")]
impl<'a, M, F> Fit<'a, M, F>
where
    M: Glm,
    F: 'static + Float,
{
    /// Perform a full re-fit of the model excluding the ith data column.
    /// NOTE: Ideally we could return a full Fit object, but as it stands the source data must
    /// outlive the Fit result. It would be nice if we could get around this with some COW
    /// shenanigans but this will probably require a big change. To get around this for now, just
    /// return the deviance, which is all we're using this function for at the moment.
    fn dev_without_covariate(&self, i: usize) -> RegressionResult<F, F> {
        use ndarray::{concatenate, s};

        let is_intercept = self.data.has_intercept && (i == 0);
        let x_reduced: Array2<F> = if is_intercept {
            // If this is the intercept term, we need to use the original unscaled data to avoid
            // mixing
            self.data.x_ext().slice(s![.., 1..]).to_owned()
        } else {
            concatenate![
                Axis(1),
                self.data.x.slice(s![.., ..i]),
                self.data.x.slice(s![.., i + 1..])
            ]
        };
        // NOTE: This data has been standardized already. Since we are fully dropping a covariate,
        // each data column should still be standardized if the full external set is.
        let data_reduced = Dataset::<F> {
            y: self.data.y.clone(),
            x: x_reduced,
            linear_offset: self.data.linear_offset.clone(),
            weights: self.data.weights.clone(),
            freqs: self.data.freqs.clone(),
            has_intercept: !is_intercept,
            // It shouldn't be necessary to standardize again. Each individual column should
            // already be standardized, if the full dataset was.
            standardizer: None,
        };
        let model_reduced = Model {
            model: PhantomData::<M>,
            data: data_reduced,
        };
        // Start with the non-excluded parameters at the values from the main fit.
        let init_guess = if is_intercept {
            // For the intercept term, we need to use the original unscaled data, or else the
            // transformation mixes the intercept with the other terms. This number is supposed to
            // represent the model deviance from excluding the *external* intercept term.
            self.result.slice(s![1..]).to_owned()
        } else {
            // The data is already standardized in the non-intercept case, so we should start with
            // the standardized parameters, removing the covariate of interest.
            concatenate![
                Axis(0),
                self.result_std.slice(s![..i]),
                self.result_std.slice(s![i + 1..])
            ]
        };
        let fit_options = {
            let mut fit_options = self.options.clone();
            fit_options.init_guess = Some(init_guess);
            fit_options
        };
        let fit_reduced = model_reduced.with_options(fit_options).fit()?;
        Ok(fit_reduced.deviance())
    }

    /// Returns the p-value for the omnibus likelihood-ratio test (full model vs. null/intercept-only
    /// model).
    ///
    /// The LR statistic is asymptotically $`\chi^2`$-distributed with
    /// [`test_ndf()`](Self::test_ndf) degrees of freedom, so the p-value is the upper-tail
    /// probability:
    ///
    /// ```math
    /// p = 1 - F_{\chi^2}(\Lambda;\, \text{test\_ndf})
    /// ```
    pub fn pvalue_lr_test(&self) -> F {
        use statrs::distribution::{ChiSquared, ContinuousCDF};
        // Check the degrees of freedom as an integer rather than comparing floats; with zero
        // test degrees of freedom the test is vacuous, so report p = 1 without fitting.
        let ndf = self.test_ndf();
        if ndf == 0 {
            return F::one();
        }
        let stat = self.lr_test().to_f64().unwrap();
        let chi2 = ChiSquared::new(ndf as f64).unwrap();
        // sf() is the survival function 1 - CDF, accurate in the upper tail.
        F::from(chi2.sf(stat)).unwrap()
    }

    /// Returns per-parameter p-values from the Wald $`z`$-statistics.
    ///
    /// The reference distribution depends on whether the family has a free dispersion parameter:
    ///
    /// - **No dispersion** (logistic, Poisson): standard normal — two-tailed
    ///   $`p_k = 2\bigl[1 - \Phi(|z_k|)\bigr]`$
    /// - **Free dispersion** (linear): Student-$`t`$ with [`ndf()`](Self::ndf) degrees of
    ///   freedom — two-tailed $`p_k = 2\bigl[1 - F_t(|z_k|;\, \text{ndf})\bigr]`$
    ///
    /// IMPORTANT: Note that this test is an approximation for non-linear models and is known to
    /// sometimes yield misleading values compared to an exact test. It is not hard to find it
    /// give p-values that may imply significantly different conclusions for your analysis (e.g.
    /// p<0.07 vs. p<0.02 in one of our tests).
    pub fn pvalue_wald(&self) -> RegressionResult<Array1<F>, F> {
        let z = self.wald_z()?;
        let pvals = match M::DISPERSED {
            DispersionType::NoDispersion => {
                use statrs::distribution::Normal;
                let norm = Normal::standard();
                z.mapv(|zi| {
                    let abs_z = num_traits::Float::abs(zi).to_f64().unwrap();
                    F::from(2.0 * norm.sf(abs_z)).unwrap()
                })
            }
            DispersionType::FreeDispersion => {
                use statrs::distribution::StudentsT;
                let df = self.ndf().to_f64().unwrap();
                let t_dist = StudentsT::new(0.0, 1.0, df).unwrap();
                z.mapv(|zi| {
                    let abs_z = num_traits::Float::abs(zi).to_f64().unwrap();
                    F::from(2.0 * t_dist.sf(abs_z)).unwrap()
                })
            }
        };
        Ok(pvals)
    }

    /// Returns per-parameter p-values from drop-one analysis of deviance (exact, expensive).
    ///
    /// For each parameter $`k`$, a reduced model is fit with that parameter removed and the
    /// deviance difference $`\Delta D = D_{\text{reduced}} - D_{\text{full}}`$ is used:
    ///
    /// - **No dispersion**: $`p = 1 - F_{\chi^2}(\Delta D;\, 1)`$
    /// - **Free dispersion**: $`F = \Delta D / \hat\phi`$, $`p = 1 - F_F(F;\, 1,\,
    ///   \text{ndf})`$
    ///
    /// For the intercept (if present), the reduced model is fit without an intercept. For all
    /// other parameters, the reduced model is fit with that column removed from the design matrix.
    pub fn pvalue_exact(&self) -> RegressionResult<Array1<F>, F> {
        let n_par = self.n_par;
        let dev_full = self.deviance();
        let phi_full = self.dispersion();
        let ndf_f64 = self.ndf().to_f64().unwrap();
        let mut pvals = Array1::<F>::zeros(n_par);
        for k in 0..n_par {
            let dev_reduced = self.dev_without_covariate(k)?;
            let delta_d = dev_reduced - dev_full;

            let p = match M::DISPERSED {
                DispersionType::NoDispersion => {
                    use statrs::distribution::ChiSquared;
                    let chi2 = ChiSquared::new(1.0).unwrap();
                    // Use the survival function directly: it is more accurate in the upper
                    // tail than `1 - cdf` and consistent with the FreeDispersion branch.
                    chi2.sf(delta_d.to_f64().unwrap())
                }
                DispersionType::FreeDispersion => {
                    use statrs::distribution::FisherSnedecor;
                    let f_stat = delta_d / phi_full;
                    let f_dist = FisherSnedecor::new(1.0, ndf_f64).unwrap();
                    f_dist.sf(f_stat.to_f64().unwrap())
                }
            };
            pvals[k] = F::from(p).unwrap();
        }

        Ok(pvals)
    }
}
1144
#[cfg(feature = "stats")]
impl<'a, M, F> Fit<'a, M, F>
where
    M: Glm + Response,
    F: 'static + Float,
{
    /// Returns the quantile residuals, which are standard-normal distributed by construction if
    /// the residuals are distributed according to the GLM family's response function.
    ///
    /// ```math
    /// q_i = \Phi^{-1}(F(y_i | \mathbf{x}_i \cdot \boldsymbol{\beta}))
    /// ```
    /// where $`\Phi`$ is the standard normal quantile function and $`F`$ is the CDF of the
    /// response distribution, conditioned on the predicted mean.
    ///
    /// Only implemented for families with continuous response distributions. The variance weights
    /// are used to scale the spread for families that use free dispersion. For the Linear model,
    /// this is an expensive way of getting $`\frac{(y_i - \hat \mu_i)}{\sqrt{\hat\phi / w_i}} =
    /// \frac{(y_i - \hat \mu_i)}{\hat\sigma_i}`$.
    ///
    /// These residuals are not standardized/studentized in any way, meaning the impact of each
    /// observation is present in the corresponding response distribution for that point.
    pub fn resid_quantile(&self) -> Array1<F>
    where
        <M as Response>::DistributionType: ContinuousCDF<f64, f64>,
    {
        use ndarray::Zip;
        use statrs::distribution::Normal;

        let std_norm = Normal::standard();
        // Fitted means converted to f64 since the statrs distributions operate on f64.
        let mu = self.y_hat.mapv(|mu| mu.to_f64().unwrap());
        let phi: f64 = self.dispersion().to_f64().unwrap();
        // The effective dispersion for each point should be scaled by the variance weight.
        // NOTE: The variance weights will not impact the families with no dispersion, as those do
        // not use the phi parameter in get_distribution().
        let phi_i = self
            .data
            .get_variance_weights()
            .mapv(|w| phi / w.to_f64().unwrap());
        Zip::from(&self.data.y)
            .and(&mu)
            .and(&phi_i)
            .map_collect(|y, &mu, &phi_i| {
                let dist = M::get_distribution(mu, phi_i);
                // This variable should be standard-uniform distributed under data following the model
                // assumption
                let f = dist.cdf(y.to_f64().unwrap());
                // Map through the standard-normal quantile function to get a normal deviate.
                F::from_f64(std_norm.inverse_cdf(f)).unwrap()
            })
    }
}
1196
1197#[cfg(test)]
1198mod tests {
1199    use super::*;
1200    use crate::{Linear, Logistic, model::ModelBuilder};
1201    use anyhow::Result;
1202    use approx::assert_abs_diff_eq;
1203    use ndarray::Axis;
1204    use ndarray::{concatenate, s};
1205
    /// Checks if the test statistics are invariant based upon whether the data is standardized.
    #[test]
    fn standardization_invariance() -> Result<()> {
        let data_y = array![true, false, false, true, true, true, true, false, true];
        let data_x = array![-0.5, 0.3, -0.6, 0.2, 0.3, 1.2, 0.8, 0.6, -0.2].insert_axis(Axis(1));
        let lin_off = array![0.1, 0.0, -0.1, 0.2, 0.1, 0.3, 0.4, -0.1, 0.1];
        // Fit once on the raw data with standardization disabled...
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .no_standardize()
            .linear_offset(lin_off.clone())
            .build()?;
        let fit = model.fit()?;
        // ...and once with internal standardization enabled (the default).
        let model_std = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .linear_offset(lin_off)
            .build()?;
        let fit_std = model_std.fit()?;
        assert_abs_diff_eq!(&fit.result, &fit_std.result, epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(
            fit.covariance()?,
            fit_std.covariance()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );
        let lr = fit.lr_test();
        let lr_std = fit_std.lr_test();
        assert_abs_diff_eq!(lr, lr_std, epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(
            fit.score_test()?,
            fit_std.score_test()?,
            epsilon = f32::EPSILON as f64
        );
        assert_abs_diff_eq!(
            fit.wald_test(),
            fit_std.wald_test(),
            epsilon = 4.0 * f64::EPSILON
        );
        assert_abs_diff_eq!(fit.aic(), fit_std.aic(), epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(fit.bic(), fit_std.bic(), epsilon = 4.0 * f64::EPSILON);
        assert_abs_diff_eq!(
            fit.deviance(),
            fit_std.deviance(),
            epsilon = 4.0 * f64::EPSILON
        );
        // The Wald Z-score of the intercept term is also invariant, with the new scaling
        // approach, so we can compare the full vectors.
        assert_abs_diff_eq!(
            fit.wald_z()?,
            fit_std.wald_z()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );
        // try p-values
        assert_abs_diff_eq!(
            fit.pvalue_lr_test(),
            fit_std.pvalue_lr_test(),
            epsilon = 0.01 * f32::EPSILON as f64
        );
        assert_abs_diff_eq!(
            fit.pvalue_wald()?,
            fit_std.pvalue_wald()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );
        assert_abs_diff_eq!(
            fit.pvalue_exact()?,
            fit_std.pvalue_exact()?,
            epsilon = 0.01 * f32::EPSILON as f64
        );

        // Ensure that the score and fisher functions are identical even when evaluated at another
        // point. The fit results are near [0.5, 0.5], so pick somewhere not too close.
        let other = array![-0.5, 2.0];
        assert_abs_diff_eq!(fit.score(other.clone()), fit_std.score(other.clone()));
        assert_abs_diff_eq!(fit.fisher(&other), fit_std.fisher(&other.clone()));

        Ok(())
    }
1279
1280    #[test]
1281    fn null_model() -> Result<()> {
1282        let data_y = array![true, false, false, true, true];
1283        let data_x: Array2<f64> = array![[], [], [], [], []];
1284        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1285        let fit = model.fit()?;
1286        dbg!(fit.n_iter);
1287        dbg!(&fit.result);
1288        // with no offsets, the result should be the link function of the mean.
1289        assert_abs_diff_eq!(
1290            fit.result[0],
1291            <Logistic as Glm>::Link::func(0.6),
1292            epsilon = 4.0 * f64::EPSILON
1293        );
1294        let empty_null_like = fit.null_like();
1295        assert_eq!(fit.test_ndf(), 0);
1296        dbg!(&fit.model_like);
1297        let lr = fit.lr_test();
1298        // Since there is no data, the null likelihood should be identical to
1299        // the fit likelihood, so the likelihood ratio test should yield zero.
1300        assert_abs_diff_eq!(lr, 0., epsilon = 4. * f64::EPSILON);
1301
1302        // Check that the assertions still hold if linear offsets are included.
1303        let lin_off: Array1<f64> = array![0.2, -0.1, 0.1, 0.0, 0.1];
1304        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
1305            .linear_offset(lin_off)
1306            .build()?;
1307        let fit_off = model.fit()?;
1308        let empty_model_like_off = fit_off.model_like;
1309        let empty_null_like_off = fit_off.null_like();
1310        // these two assertions should be equivalent
1311        assert_abs_diff_eq!(fit_off.lr_test(), 0.);
1312        assert_abs_diff_eq!(empty_model_like_off, empty_null_like_off);
1313
1314        // check consistency with data provided
1315        let data_x_with = array![[0.5], [-0.2], [0.3], [0.4], [-0.1]];
1316        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x_with).build()?;
1317        let fit_with = model.fit()?;
1318        dbg!(&fit_with.result);
1319        // The null likelihood of the model with parameters should be the same
1320        // as the likelihood of the model with only the intercept.
1321        assert_abs_diff_eq!(empty_null_like, fit_with.null_like());
1322
1323        Ok(())
1324    }
1325
1326    #[test]
1327    fn null_like_logistic() -> Result<()> {
1328        // 6 true and 4 false for y_bar = 0.6.
1329        let data_y = array![
1330            true, true, true, true, true, true, false, false, false, false
1331        ];
1332        let ybar: f64 = 0.6;
1333        let data_x = array![0.4, 0.2, 0.5, 0.1, 0.6, 0.7, 0.3, 0.8, -0.1, 0.1].insert_axis(Axis(1));
1334        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1335        let fit = model.fit()?;
1336        let target_null_like = fit
1337            .data
1338            .y
1339            .mapv(|y| {
1340                let eta = (ybar / (1. - ybar)).ln();
1341                y * eta - eta.exp().ln_1p()
1342            })
1343            .sum();
1344        assert_abs_diff_eq!(fit.null_like(), target_null_like);
1345        Ok(())
1346    }
1347
1348    // Check that the deviance is equal to the sum of square deviations for a linear model
1349    #[test]
1350    fn deviance_linear() -> Result<()> {
1351        let data_y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4, 1.1, 0.2];
1352        let data_x = array![0.6, 2.1, 0.4, -3.2, 0.7, 0.1, -0.3, 0.5].insert_axis(Axis(1));
1353        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1354        let fit = model.fit()?;
1355        // The predicted values of Y given the model.
1356        let pred_y = fit.predict(&data_x, None);
1357        let target_dev = (data_y - pred_y).mapv(|dy| dy * dy).sum();
1358        assert_abs_diff_eq!(fit.deviance(), target_dev, epsilon = 4. * f64::EPSILON);
1359        Ok(())
1360    }
1361
1362    // Check that the deviance and dispersion parameter are equal up to the number of degrees of
1363    // freedom for a linea model.
1364    #[test]
1365    fn deviance_dispersion_eq_linear() -> Result<()> {
1366        let data_y = array![0.2, -0.1, 0.4, 1.3, 0.2, -0.6, 0.9];
1367        let data_x = array![
1368            [0.4, 0.2],
1369            [0.1, 0.4],
1370            [-0.1, 0.3],
1371            [0.5, 0.7],
1372            [0.4, 0.1],
1373            [-0.2, -0.3],
1374            [0.4, -0.1]
1375        ];
1376        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1377        let fit = model.fit()?;
1378        let dev = fit.deviance();
1379        let disp = fit.dispersion();
1380        let ndf = fit.ndf();
1381        assert_abs_diff_eq!(dev, disp * ndf, epsilon = 4. * f64::EPSILON);
1382        Ok(())
1383    }
1384
    // Check that the residuals for a linear model are all consistent.
    // For weighted OLS the response residuals scaled by the implied standard
    // deviations should coincide with the Pearson and deviance residuals, and
    // the one-step leave-one-out statistics should match literal refits.
    #[test]
    fn residuals_linear() -> Result<()> {
        let data_y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
        let weights = array![0.8, 1.2, 0.9, 0.8, 1.1, 0.9];
        // the implied variances from the weights
        let wgt_sigmas = weights.map(|w: &f64| 1. / w.sqrt());
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
            .var_weights(weights.clone())
            .build()?;
        let fit = model.fit()?;
        // Scaled response residuals should equal Pearson and deviance residuals.
        let response = fit.resid_resp();
        let resp_scaled = &response / wgt_sigmas;
        let pearson = fit.resid_pear();
        let deviance = fit.resid_dev();
        assert_abs_diff_eq!(resp_scaled, pearson);
        assert_abs_diff_eq!(resp_scaled, deviance);
        // The internally-standardized Pearson and deviance residuals should agree too.
        let pearson_std = fit.resid_pear_std()?;
        let deviance_std = fit.resid_dev_std()?;
        assert_abs_diff_eq!(pearson_std, deviance_std, epsilon = 8. * f64::EPSILON);
        // The externally-studentized residuals aren't expected to match the internally-studentized
        // ones.
        let dev_terms_loo = fit.deviance_terms_loo()?;
        let disp_terms_loo = fit.dispersion_loo()?;
        let student = fit.resid_student()?;

        // NOTE: Studentization can't be checked directly in general because the method used is a
        // one-step approximation, however it should be exact in the linear OLS case.
        let n_data = data_y.len();
        // Check that the leave-one-out stats hold literally
        let mut loo_diff: Vec<f64> = Vec::new();
        let mut loo_dev_res: Vec<f64> = Vec::new();
        let mut loo_disp: Vec<f64> = Vec::new();
        for i in 0..n_data {
            // Rebuild the dataset with observation i removed by concatenating
            // the slices on either side of it, then refit.
            let ya = data_y.slice(s![0..i]);
            let yb = data_y.slice(s![i + 1..]);
            let xa = data_x.slice(s![0..i, ..]);
            let xb = data_x.slice(s![i + 1.., ..]);
            let wa = weights.slice(s![0..i]);
            let wb = weights.slice(s![i + 1..]);
            let y_loo = concatenate![Axis(0), ya, yb];
            let x_loo = concatenate![Axis(0), xa, xb];
            let w_loo = concatenate![Axis(0), wa, wb];
            let model_i = ModelBuilder::<Linear>::data(&y_loo, &x_loo)
                .var_weights(w_loo)
                .build()?;
            let fit_i = model_i.fit()?;
            // Predict the held-out observation from the leave-one-out fit.
            let yi = data_y[i];
            let xi = data_x.slice(s![i..i + 1, ..]);
            let wi = weights[i];
            let yi_pred: f64 = fit_i.predict(&xi, None)[0];
            let disp_i = fit_i.dispersion();
            // Variance of the held-out point implied by the LOO dispersion and its weight.
            let var_i = disp_i / wi;
            let diff_i = yi - yi_pred;
            let res_dev_i = diff_i / var_i.sqrt();
            loo_diff.push(wi * diff_i * diff_i);
            loo_disp.push(disp_i);
            loo_dev_res.push(res_dev_i);
        }
        let loo_diff: Array1<f64> = loo_diff.into();
        let loo_disp: Array1<f64> = loo_disp.into();
        let loo_dev_res: Array1<f64> = loo_dev_res.into();
        // The one-step approximations should reproduce the literal LOO refits.
        assert_abs_diff_eq!(dev_terms_loo, loo_diff, epsilon = 8. * f64::EPSILON);
        assert_abs_diff_eq!(disp_terms_loo, loo_disp, epsilon = 8. * f64::EPSILON);
        assert_abs_diff_eq!(student, loo_dev_res, epsilon = 8. * f64::EPSILON);
        Ok(())
    }
1453
    // check the null likelihood for the case where it can be counted exactly.
    #[test]
    fn null_like_linear() -> Result<()> {
        let data_y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4, 1.1, 0.2];
        let data_x = array![0.6, 2.1, 0.4, -3.2, 0.7, 0.1, -0.3, 0.5].insert_axis(Axis(1));
        let ybar: f64 = data_y.mean().unwrap();
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        // The per-observation null log-likelihood terms evaluated at the mean
        // response ybar (constant terms omitted, matching the fit convention).
        let target_null_like = data_y.mapv(|y| y * ybar - 0.5 * ybar * ybar).sum();
        let fit_null_like = fit.null_like();
        // The LR statistic is twice the log-likelihood difference by definition.
        assert_abs_diff_eq!(2. * (fit.model_like - fit_null_like), fit.lr_test());
        // A single covariate column gives one degree of freedom for the test.
        assert_eq!(fit.test_ndf(), 1);
        assert_abs_diff_eq!(
            fit_null_like,
            target_null_like,
            epsilon = 4.0 * f64::EPSILON
        );
        Ok(())
    }
1478
    // check the leave-one-out one-step for the linear model
    // The one-step approximation is exact for linear OLS, so both the exact LOO
    // routine and the influence-based approximation must match literal refits.
    #[test]
    fn loo_linear() -> Result<()> {
        let data_y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
        let weights = array![1.0, 1.2, 0.8, 1.1, 1.0, 0.7];
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
            .var_weights(weights.clone())
            .build()?;
        let fit = model.fit()?;

        // One row of leave-one-out parameter estimates per held-out observation.
        let loo_exact = fit.loo_exact()?;

        // One-step LOO estimates: full-fit result minus each observation's
        // influence on the coefficients.
        let loo_coef: Array2<f64> = fit.infl_coef()?;
        let loo_results = &fit.result - loo_coef;
        let n_data = data_y.len();
        for i in 0..n_data {
            // Rebuild the dataset without observation i and refit literally.
            let ya = data_y.slice(s![0..i]);
            let yb = data_y.slice(s![i + 1..]);
            let xa = data_x.slice(s![0..i, ..]);
            let xb = data_x.slice(s![i + 1.., ..]);
            let wa = weights.slice(s![0..i]);
            let wb = weights.slice(s![i + 1..]);
            let y_loo = concatenate![Axis(0), ya, yb];
            let x_loo = concatenate![Axis(0), xa, xb];
            let w_loo = concatenate![Axis(0), wa, wb];
            let model_i = ModelBuilder::<Linear>::data(&y_loo, &x_loo)
                .var_weights(w_loo)
                .build()?;
            let fit_i = model_i.fit()?;
            // Both LOO variants should reproduce the refit coefficients.
            assert_abs_diff_eq!(
                loo_exact.row(i),
                &fit_i.result,
                epsilon = f32::EPSILON as f64
            );
            assert_abs_diff_eq!(
                loo_results.row(i),
                &fit_i.result,
                epsilon = f32::EPSILON as f64
            );
        }
        Ok(())
    }
1522
    // check the leave-one-out one-step for the logistic model
    // Only the exact LOO routine is checked here, both with and without
    // L2 regularization, by comparing against literal refits.
    #[test]
    fn loo_logistic() -> Result<()> {
        let data_y = array![false, false, true, true, true, false];
        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
        let weights = array![1.0, 1.2, 0.8, 1.1, 1.0, 0.7];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .var_weights(weights.clone())
            .build()?;
        let fit = model.fit()?;
        // A second fit of the same model with an L2 penalty.
        let fit_reg = model.fit_options().l2_reg(0.5).fit()?;

        // NOTE: The one-step approximation fails for non-linear response functions, so we
        // should only test the exact case.
        let loo_exact = fit.loo_exact()?;
        let loo_exact_reg = fit_reg.loo_exact()?;
        let n_data = data_y.len();
        for i in 0..n_data {
            // Rebuild the dataset without observation i and refit literally,
            // once unregularized and once with the same L2 penalty.
            let ya = data_y.slice(s![0..i]);
            let yb = data_y.slice(s![i + 1..]);
            let xa = data_x.slice(s![0..i, ..]);
            let xb = data_x.slice(s![i + 1.., ..]);
            let wa = weights.slice(s![0..i]);
            let wb = weights.slice(s![i + 1..]);
            let y_loo = concatenate![Axis(0), ya, yb];
            let x_loo = concatenate![Axis(0), xa, xb];
            let w_loo = concatenate![Axis(0), wa, wb];
            let model_i = ModelBuilder::<Logistic>::data(&y_loo, &x_loo)
                .var_weights(w_loo)
                .build()?;
            let fit_i = model_i.fit()?;
            let fit_i_reg = model_i.fit_options().l2_reg(0.5).fit()?;
            // The exact LOO rows should match the corresponding refits.
            assert_abs_diff_eq!(
                loo_exact.row(i),
                &fit_i.result,
                epsilon = f32::EPSILON as f64
            );
            assert_abs_diff_eq!(
                loo_exact_reg.row(i),
                &fit_i_reg.result,
                epsilon = f32::EPSILON as f64
            );
        }
        Ok(())
    }
1568
1569    // check the null likelihood where there is no dependence on the X data.
1570    #[test]
1571    fn null_like_logistic_nodep() -> Result<()> {
1572        let data_y = array![true, true, false, false, true, false, false, true];
1573        let data_x = array![0.4, 0.2, 0.4, 0.2, 0.7, 0.7, -0.1, -0.1].insert_axis(Axis(1));
1574        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1575        let fit = model.fit()?;
1576        let lr = fit.lr_test();
1577        assert_abs_diff_eq!(lr, 0.);
1578        Ok(())
1579    }
1580    // TODO: Test that the statistics behave sensibly under regularization. The
1581    // likelihood ratio test should yield a smaller value.
1582
1583    // Test the basic caching funcions.
1584    #[test]
1585    fn cached_computations() -> Result<()> {
1586        let data_y = array![true, true, false, true, true, false, false, false, true];
1587        let data_x = array![0.4, 0.1, -0.3, 0.7, -0.5, -0.1, 0.8, 1.0, 0.4].insert_axis(Axis(1));
1588        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
1589        let fit = model.fit()?;
1590        let _null_like = fit.null_like();
1591        let _null_like = fit.null_like();
1592        let _cov = fit.covariance()?;
1593        let _wald = fit.wald_z();
1594        Ok(())
1595    }
1596
1597    // Check the consistency of the various statistical tests for linear
1598    // regression, where they should be the most comparable.
1599    #[test]
1600    fn linear_stat_tests() -> Result<()> {
1601        let data_y = array![-0.3, -0.1, 0.0, 0.2, 0.4, 0.5, 0.8, 0.8, 1.1];
1602        let data_x = array![-0.5, -0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.3].insert_axis(Axis(1));
1603        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1604        let fit = model.fit()?;
1605        let lr = fit.lr_test();
1606        let wald = fit.wald_test();
1607        let score = fit.score_test()?;
1608        let eps = 32.0 * f64::EPSILON;
1609        assert_abs_diff_eq!(lr, wald, epsilon = eps);
1610        assert_abs_diff_eq!(lr, score, epsilon = eps);
1611        // The score vector should be zero at the minimum
1612        assert_abs_diff_eq!(fit.score(fit.result.clone()), array![0., 0.], epsilon = eps,);
1613        Ok(())
1614    }
1615
1616    // The score should be zero at the MLE even with L2 regularization and internal standardization.
1617    #[test]
1618    fn score_zero_at_mle_regularized() -> Result<()> {
1619        let data_y = array![-0.3, -0.1, 0.0, 0.2, 0.4, 0.5, 0.8, 0.8, 1.1];
1620        let data_x = array![-0.5, -0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.3].insert_axis(Axis(1));
1621        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
1622        let fit = model.fit_options().l2_reg(0.1).fit()?;
1623        let eps = 1e-8;
1624        assert_abs_diff_eq!(fit.score(fit.result.clone()), array![0., 0.], epsilon = eps);
1625        // Also check with standardization disabled
1626        let model_nostd = ModelBuilder::<Linear>::data(&data_y, &data_x)
1627            .no_standardize()
1628            .build()?;
1629        let fit_nostd = model_nostd.fit_options().l2_reg(0.1).fit()?;
1630        assert_abs_diff_eq!(
1631            fit_nostd.score(fit_nostd.result.clone()),
1632            array![0., 0.],
1633            epsilon = eps
1634        );
1635        // NOTE: Without regularization, the results themselves will not be exactly identical
1636        // through standardization.
1637        Ok(())
1638    }
1639
1640    #[cfg(feature = "stats")]
1641    #[test]
1642    fn linear_quantile_deviance_equivalence() -> Result<()> {
1643        let data_y = array![-0.5, 0.2, -0.2, 1.2, 0.4, 0.6];
1644        let data_x = array![
1645            [-0.3, 1.0],
1646            [0.2, 0.4],
1647            [-0.5, -0.1],
1648            [-0.2, 0.8],
1649            [0.6, 0.1],
1650            [0.1, -0.4]
1651        ];
1652        let var_weights = array![1.0, 0.2, 2.0, 1.5, 0.7, 1.2];
1653        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
1654            .var_weights(var_weights.clone())
1655            .build()?;
1656        let fit = model.fit_options().l2_reg(0.25).fit()?;
1657        let phi = fit.dispersion();
1658        let std_devs = var_weights.mapv_into(|w| num_traits::Float::sqrt(phi / w));
1659        let scaled_resid = fit.resid_resp() / std_devs;
1660        let res_qua = fit.resid_quantile();
1661        assert_abs_diff_eq!(scaled_resid, res_qua, epsilon = 0.01 * f32::EPSILON as f64);
1662
1663        Ok(())
1664    }
1665
    /// Test that `pvalue_exact` computes a valid intercept p-value by comparing against R's
    /// `anova(glm(y ~ x - 1), glm(y ~ x), test=...)`.
    ///
    /// R reference (linear):
    /// ```r
    /// y  <- c(0.3, 1.5, 0.8, 2.1, 1.7, 3.2, 2.5, 0.9)
    /// x1 <- c(0.1, 0.5, 0.2, 0.8, 0.6, 1.1, 0.9, 0.3)
    /// x2 <- c(0.4, 0.1, 0.3, 0.7, 0.2, 0.5, 0.8, 0.6)
    /// anova(glm(y ~ x1 + x2 - 1), glm(y ~ x1 + x2), test = "F")
    /// # Pr(>F) = 0.1203156
    /// ```
    ///
    /// R reference (logistic):
    /// ```r
    /// y <- c(1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1)
    /// x <- c(0.5, -0.3, 0.2, 0.8, 0.1, 0.6, -0.1, -0.4, 0.3, 0.4, 0.7, -0.2)
    /// anova(glm(y ~ x - 1, family=binomial()), glm(y ~ x, family=binomial()), test = "Chisq")
    /// # Pr(>Chi) = 0.6042003
    /// ```
    ///
    /// NOTE: This test was generated by claude code, hence the ugly hard-coded values.
    #[cfg(feature = "stats")]
    #[test]
    fn pvalue_exact_intercept() -> Result<()> {
        use crate::Logistic;

        // Linear: intercept F-test should match R's anova() result.
        // For Gaussian the exact (F-test) intercept p-value equals the Wald t-test p-value.
        let y = array![0.3_f64, 1.5, 0.8, 2.1, 1.7, 3.2, 2.5, 0.9];
        let x = array![
            [0.1_f64, 0.4],
            [0.5, 0.1],
            [0.2, 0.3],
            [0.8, 0.7],
            [0.6, 0.2],
            [1.1, 0.5],
            [0.9, 0.8],
            [0.3, 0.6]
        ];
        let model = ModelBuilder::<Linear>::data(&y, &x).build()?;
        let fit = model.fit()?;
        let exact_p = fit.pvalue_exact()?;
        // Sanity check before comparing against the hard-coded reference value.
        assert!(
            exact_p[0].is_finite() && exact_p[0] >= 0.0 && exact_p[0] <= 1.0,
            "intercept p-value must be in [0, 1]"
        );
        // R: anova(glm(y ~ x1+x2-1), glm(y ~ x1+x2), test="F") Pr(>F) = 0.1203156
        assert_abs_diff_eq!(exact_p[0], 0.1203156, epsilon = 1e-5);

        // Logistic: intercept chi-squared test.
        let y_bin = array![
            true, false, true, true, false, true, false, false, true, false, true, true
        ];
        let x_bin = array![
            0.5_f64, -0.3, 0.2, 0.8, 0.1, 0.6, -0.1, -0.4, 0.3, 0.4, 0.7, -0.2
        ]
        .insert_axis(Axis(1));
        let model_bin = ModelBuilder::<Logistic>::data(&y_bin, &x_bin).build()?;
        let fit_bin = model_bin.fit()?;
        let exact_p_bin = fit_bin.pvalue_exact()?;
        // R: anova(..., test="Chisq") Pr(>Chi) = 0.6042003
        assert_abs_diff_eq!(exact_p_bin[0], 0.6042003, epsilon = 1e-4);

        Ok(())
    }
1731}