// ndarray_glm/fit.rs

//! Stores the fit results of the IRLS regression and provides functions that
//! depend on the MLE estimate. These include statistical tests for goodness-of-fit.

pub mod options;
use crate::{
    error::RegressionResult,
    glm::{DispersionType, Glm},
    irls::{Irls, IrlsStep},
    link::{Link, Transform},
    model::{Dataset, Model},
    num::Float,
    regularization::IrlsReg,
    Linear,
};
use ndarray::{array, Array1, Array2, ArrayBase, ArrayView1, Axis, Data, Ix2};
use ndarray_linalg::InverseInto;
use options::FitOptions;
use std::{
    cell::{Ref, RefCell},
    marker::PhantomData,
};

/// the result of a successful GLM fit
pub struct Fit<'a, M, F>
where
    M: Glm,
    F: Float,
{
    model: PhantomData<M>,
    /// The data and model specification used in the fit.
    data: &'a Dataset<F>,
    /// Whether the intercept covariate is used
    use_intercept: bool,
    /// The parameter values that maximize the likelihood as given by the IRLS regression.
    pub result: Array1<F>,
    /// The options used for this fit.
    pub options: FitOptions<F>,
    /// The value of the likelihood function for the fit result.
    pub model_like: F,
    /// The regularizer of the fit
    reg: Box<dyn IrlsReg<F>>,
    /// The number of overall iterations taken in the IRLS.
    pub n_iter: usize,
    /// The history of guesses and likelihoods over the IRLS iterations.
    pub history: Vec<IrlsStep<F>>,
    /// The number of parameters
    n_par: usize,
    /// The unscaled covariance matrix of the parameters, i.e. the inverse of the Fisher
    /// information. Since the calculation requires a matrix inversion, it is computed only when
    /// needed and the value is cached.
    cov_unscaled: RefCell<Option<Array2<F>>>,
    /// The hat matrix of the data and fit. Since the calculation requires a matrix inversion of
    /// the fisher information, it is computed only when needed and the value is cached. Access
    /// through the `hat()` function.
    hat: RefCell<Option<Array2<F>>>,
    /// The likelihood and parameters for the null model.
    null_model: RefCell<Option<(F, Array1<F>)>>,
}

impl<'a, M, F> Fit<'a, M, F>
where
    M: Glm,
    F: 'static + Float,
{
    /// Returns the Akaike information criterion for the model fit. It is defined only up to an
    /// additive constant, so only differences in AIC are meaningful.
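    ///
    /// A minimal sketch of comparing information criteria between fits (hypothetical data;
    /// assuming `ModelBuilder` and `Linear` are re-exported at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.3, -0.2, 0.5, 0.7, 0.2];
    /// let x = array![[0.6], [2.1], [0.4], [-3.2], [0.7]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // Only differences in AIC (or BIC) between fits to the same data are meaningful.
    /// let aic = fit.aic();
    /// let bic = fit.bic();
    /// ```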
    // TODO: Should an effective number of parameters that takes regularization
    // into account be considered?
    pub fn aic(&self) -> F {
        let log_weights = self.data.get_variance_weights().mapv(num_traits::Float::ln);
        let sum_log_weights = self.data.freq_sum(&log_weights);
        // NOTE: This is now the unregularized deviance.
        self.deviance() + F::two() * self.rank() - F::two() * sum_log_weights
    }

    /// Returns the Bayesian information criterion for the model fit.
    // TODO: Also consider the effect of regularization on this statistic.
    // TODO: Wikipedia suggests that the variance should be included in the number
    // of parameters for multiple linear regression. Should an additional
    // parameter be included for the dispersion parameter? This question does
    // not affect the difference between two models fit with the methodology in
    // this package.
    pub fn bic(&self) -> F {
        let logn = num_traits::Float::ln(self.data.n_obs());
        logn * self.rank() - F::two() * self.model_like
    }

    /// The covariance matrix estimated by the Fisher information and the dispersion parameter (for
    /// families with a free scale). The inverse Fisher matrix is cached to avoid repeating the
    /// potentially expensive matrix inversion.
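    ///
    /// A sketch of extracting standard errors from the covariance diagonal (hypothetical data;
    /// assuming `ModelBuilder` and `Logistic` are re-exported at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Logistic, ModelBuilder};
    ///
    /// let y = array![true, false, false, true, true];
    /// let x = array![[0.5], [-0.2], [0.3], [0.4], [-0.1]];
    /// let model = ModelBuilder::<Logistic>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // Standard errors are the square roots of the diagonal of the covariance matrix.
    /// let cov = fit.covariance().unwrap();
    /// let std_errs = cov.diag().mapv(f64::sqrt);
    /// ```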
    pub fn covariance(&self) -> RegressionResult<Array2<F>> {
        // The covariance must be multiplied by the dispersion parameter.
        // For logistic/poisson regression, this is identically 1.
        // For linear/gamma regression it is estimated from the data.
        let phi: F = self.dispersion();
        // NOTE: invh/invh_into() are bugged and incorrect!
        let unscaled_cov: Array2<F> = self.fisher_inv()?.to_owned();
        let cov = unscaled_cov * phi;
        Ok(cov)
    }

    /// Returns the deviance of the fit: twice the difference between the
    /// saturated likelihood and the model likelihood. Asymptotically this follows
    /// a chi-squared distribution with `self.ndf()` degrees of freedom.
    /// Note that the unregularized likelihood is used here.
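    ///
    /// A sketch of the ordinary linear special case, where the deviance reduces to the residual
    /// sum of squares (hypothetical data; assuming `ModelBuilder` and `Linear` are re-exported
    /// at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4];
    /// let x = array![[0.6], [2.1], [0.4], [-3.2], [0.7], [0.1]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // For an unweighted linear fit the deviance equals the residual sum of squares.
    /// assert!((fit.deviance() - fit.resid_sum_sq()).abs() < 1e-10);
    /// ```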
    pub fn deviance(&self) -> F {
        let terms = self.deviance_terms();
        self.data.freq_sum(&terms)
    }

    /// Returns the contribution to the deviance from each observation. The total deviance should
    /// be the sum of all of these. Variance weights are already included, but not frequency
    /// weights.
    fn deviance_terms(&self) -> Array1<F> {
        let ll_terms: Array1<F> = M::log_like_terms(self.data, &self.result);
        let ll_sat: Array1<F> = self.data.y.mapv(M::log_like_sat);
        let terms = (ll_sat - ll_terms) * F::two();
        self.data.apply_var_weights(terms)
    }

    /// Returns the self-excluded deviance terms, i.e. the deviance of an observation as if the
    /// model was fit without it. This is a one-step approximation.
    fn deviance_terms_loo(&self) -> RegressionResult<Array1<F>> {
        let dev_terms = self.deviance_terms();
        let pear_sq = self.resid_pear().mapv(|r| r * r);
        let hat_rat = self.leverage()?.mapv(|h| h / (F::one() - h));
        let result = dev_terms + &hat_rat * (&hat_rat + F::two()) * pear_sq;
        Ok(result)
    }

    /// The dispersion parameter (typically denoted `phi`), which relates the variance of the `y`
    /// values with the variance of the response distribution: `Var[y] = phi * V[mu]`.
    /// Identically one for logistic, binomial, and Poisson regression.
    /// For others (linear, gamma) the dispersion parameter is estimated from the data.
    /// This is equal to the total deviance divided by the degrees of freedom. For OLS linear
    /// regression this is equal to the sum of `(y_i - mu_i)^2 / (n-p)`, an estimate of `sigma^2`;
    /// with no covariates it is equal to the sample variance.
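    ///
    /// A sketch of the relation to the deviance (hypothetical data; assuming `ModelBuilder` and
    /// `Linear` are re-exported at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.2, -0.1, 0.4, 1.3, 0.2, -0.6, 0.9];
    /// let x = array![[0.4], [0.1], [-0.1], [0.5], [0.4], [-0.2], [0.4]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // For an unweighted linear fit, dispersion * ndf recovers the total deviance.
    /// assert!((fit.dispersion() * fit.ndf() - fit.deviance()).abs() < 1e-10);
    /// ```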
    pub fn dispersion(&self) -> F {
        use DispersionType::*;
        match M::DISPERSED {
            FreeDispersion => {
                let dev = self.deviance();
                let p = self.rank();
                let n_eff = self.data.n_eff();
                let scaling = if p >= n_eff {
                    // This is the overparameterized regime, which is checked directly instead of
                    // allowing negative values. It's not clear what conditions result in this when
                    // p < N.
                    F::zero()
                } else {
                    (F::one() - p / n_eff) * self.data.sum_weights()
                };
                dev / scaling
            }
            NoDispersion => F::one(),
        }
    }

    /// Return the dispersion terms with the observation(s) at each point excluded from the fit.
    fn dispersion_loo(&self) -> RegressionResult<Array1<F>> {
        use DispersionType::*;
        match M::DISPERSED {
            FreeDispersion => {
                let pear_sq = self.resid_pear().mapv(|r| r * r);
                let hat_rat = self.leverage()?.mapv(|h| h / (F::one() - h));
                let terms = self.deviance_terms() + hat_rat * pear_sq;
                // Don't apply total weights since the variance weights are already
                // included in the residual terms. However, we do need the frequency weights.
                let terms = self.data.apply_freq_weights(terms);
                let total: Array1<F> = -terms + self.deviance();
                let scaled_total: Array1<F> = match &self.data.weights {
                    None => match &self.data.freqs {
                        Some(f) => total / -(f - self.ndf()),
                        None => total / (self.ndf() - F::one()),
                    },
                    Some(w) => {
                        let v1 = self.data.freq_sum(w);
                        let w2 = w * w;
                        let v2 = self.data.freq_sum(&w2);
                        // The subtracted out terms need the frequency terms as well
                        let f_w = self.data.apply_freq_weights(w.clone());
                        let f_w2 = self.data.apply_freq_weights(w2);
                        // the modified sums from leaving out the ith observation
                        let v1p = -f_w + v1;
                        let v2p = -f_w2 + v2;
                        let p = self.rank();
                        let scale = &v1p - v2p / &v1p * p;
                        total / scale
                    }
                };
                Ok(scaled_total)
            }
            NoDispersion => Ok(Array1::<F>::ones(self.data.y.len())),
        }
    }

    /// Returns the errors in the response variables for the data passed as an
    /// argument given the current model fit.
    fn errors(&self, data: &Dataset<F>) -> Array1<F> {
        &data.y - &self.predict(&data.x, data.linear_offset.as_ref())
    }

    #[deprecated(since = "0.0.10", note = "use predict() instead")]
    pub fn expectation<S>(
        &self,
        data_x: &ArrayBase<S, Ix2>,
        lin_off: Option<&Array1<F>>,
    ) -> Array1<F>
    where
        S: Data<Elem = F>,
    {
        self.predict(data_x, lin_off)
    }

    /// Returns the Fisher information (the negative Hessian of the log-likelihood)
    /// at the given parameter values. The regularization is included.
    pub fn fisher(&self, params: &Array1<F>) -> Array2<F> {
        let lin_pred: Array1<F> = self.data.linear_predictor(params);
        let adj_var: Array1<F> = M::adjusted_variance_diag(&lin_pred);
        // calculate the fisher matrix
        let fisher: Array2<F> = (self.data.x_conj() * &adj_var).dot(&self.data.x);
        // Regularize the fisher matrix
        self.reg.as_ref().irls_mat(fisher, params)
    }

    /// The inverse of the (regularized) fisher information matrix. This is used in some other
    /// calculations (like the covariance and hat matrices) so it is cached.
    fn fisher_inv(&self) -> RegressionResult<Ref<'_, Array2<F>>> {
        if self.cov_unscaled.borrow().is_none() {
            let fisher_reg = self.fisher(&self.result);
            // NOTE: invh/invh_into() are bugged and incorrect!
            let unscaled_cov: Array2<F> = fisher_reg.inv_into()?;
            *self.cov_unscaled.borrow_mut() = Some(unscaled_cov);
        }
        Ok(Ref::map(self.cov_unscaled.borrow(), |x| {
            x.as_ref().unwrap()
        }))
    }

    /// Returns the hat matrix of the fit, also known as the "projection" or "influence" matrix.
    /// The convention used corresponds to H = dE[y]/dy and is orthogonal to the response
    /// residuals. This version is not symmetric.
    pub fn hat(&self) -> RegressionResult<Ref<'_, Array2<F>>> {
        if self.hat.borrow().is_none() {
            let lin_pred = self.data.linear_predictor(&self.result);
            // Apply the eta' terms manually instead of calling adjusted_variance_diag, because the
            // adjusted variance method applies 2 powers to the variance, while we want one power
            // to the variance and one to the weights.
            // let adj_var = M::adjusted_variance_diag(&lin_pred);

            let mu = M::mean(&lin_pred);
            let var = mu.mapv_into(M::variance);
            let eta_d = M::Link::d_nat_param(&lin_pred);

            let fisher_inv = self.fisher_inv()?;

            // the GLM variance and the data weights are put on different sides in this convention
            let left = (var * &eta_d).insert_axis(Axis(1)) * &self.data.x;
            let right = self.data.x_conj() * &eta_d;
            let result = left.dot(&fisher_inv.dot(&right));

            *self.hat.borrow_mut() = Some(result);
        }
        let borrowed: Ref<Option<Array2<F>>> = self.hat.borrow();
        Ok(Ref::map(borrowed, |x| x.as_ref().unwrap()))
    }

    /// A matrix where each row corresponds to the contribution to the coefficients incurred by
    /// including the observation in that row. This is inexact for nonlinear models, as a one-step
    /// approximation is used.
    /// To approximate the coefficients that would result from excluding the ith observation, the
    /// ith row of this matrix should be subtracted from the fit result.
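    ///
    /// A sketch of the one-step leave-one-out coefficients, mirroring this file's `loo_linear`
    /// test (hypothetical data; assuming `ModelBuilder` and `Linear` are re-exported at the
    /// crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
    /// let x = array![[0.4], [0.1], [0.3], [-0.1], [0.5], [0.6]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // Row i approximates the coefficients of a fit with observation i excluded.
    /// let loo_coefs = &fit.result - fit.infl_coef().unwrap();
    /// ```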
    pub fn infl_coef(&self) -> RegressionResult<Array2<F>> {
        let lin_pred = self.data.linear_predictor(&self.result);
        let resid_resp = self.resid_resp();
        let omh = -self.leverage()? + F::one();
        let resid_adj = M::Link::adjust_errors(resid_resp, &lin_pred) / omh;
        let xte = self.data.x_conj() * resid_adj;
        let fisher_inv = self.fisher_inv()?;
        let delta_b = xte.t().dot(&*fisher_inv);
        Ok(delta_b)
    }

    /// Returns the leverage for each observation. This is given by the diagonal of the projection
    /// matrix and indicates the sensitivity of each prediction to its corresponding observation.
    pub fn leverage(&self) -> RegressionResult<Array1<F>> {
        let hat = self.hat()?;
        Ok(hat.diag().to_owned())
    }

    /// Returns exact coefficients from leaving each observation out, one-at-a-time.
    /// This is a much more expensive operation than the original regression because a new one is
    /// performed for each observation.
    pub fn loo_exact(&self) -> RegressionResult<Array2<F>> {
        let loo_coef: Array2<F> = self.infl_coef()?;
        // NOTE: could also use the result itself as the initial guess
        let loo_initial = &self.result - loo_coef;
        let mut loo_result = loo_initial.clone();
        let n_obs = self.data.y.len();
        for i in 0..n_obs {
            let data_i: Dataset<F> = {
                let mut data_i: Dataset<F> = self.data.clone();
                let mut freqs: Array1<F> = data_i.freqs.unwrap_or(Array1::<F>::ones(n_obs));
                freqs[i] = F::zero();
                data_i.freqs = Some(freqs);
                data_i
            };
            let model_i = Model {
                model: PhantomData::<M>,
                data: data_i,
                use_intercept: self.use_intercept,
            };
            let options = {
                let mut options = self.options.clone();
                options.init_guess = Some(self.result.clone());
                options
            };
            let fit_i = model_i.with_options(options).fit()?;
            loo_result.row_mut(i).assign(&fit_i.result);
        }
        Ok(loo_result)
    }

    /// Perform a likelihood-ratio test, returning the statistic -2*ln(L_0/L)
    /// where L_0 is the likelihood of the best-fit null model (with no
    /// parameters but the intercept) and L is the likelihood of the fit result.
    /// The number of degrees of freedom of this statistic, equal to the number
    /// of parameters fixed to zero to form the null model, is `test_ndf()`. By
    /// Wilks' theorem this statistic is asymptotically chi-squared distributed
    /// with this number of degrees of freedom.
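    ///
    /// A sketch of computing the statistic and its degrees of freedom (hypothetical data;
    /// assuming `ModelBuilder` and `Logistic` are re-exported at the crate root; converting
    /// the statistic to a p-value requires a chi-squared CDF from elsewhere):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Logistic, ModelBuilder};
    ///
    /// let y = array![true, true, false, false, true, false, true, true];
    /// let x = array![[0.4], [0.2], [0.4], [0.2], [0.7], [0.7], [-0.1], [-0.1]];
    /// let model = ModelBuilder::<Logistic>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// let stat = fit.lr_test();
    /// let ndf = fit.test_ndf();
    /// ```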
    // TODO: Should the effective number of degrees of freedom due to
    // regularization be taken into account?
    pub fn lr_test(&self) -> F {
        // The model likelihood should include regularization terms and there
        // shouldn't be any in the null model with all non-intercept parameters
        // set to zero.
        let null_like = self.null_like();
        F::from(-2.).unwrap() * (null_like - self.model_like)
    }

    /// Perform a likelihood-ratio test against a general alternative model, not
    /// necessarily a null model. The alternative model is regularized the same
    /// way that the regression resulting in this fit was. The degrees of
    /// freedom cannot be generally inferred.
    pub fn lr_test_against(&self, alternative: &Array1<F>) -> F {
        let alt_like = M::log_like(self.data, alternative);
        let alt_like_reg = alt_like + self.reg.likelihood(alternative);
        F::two() * (self.model_like - alt_like_reg)
    }

    /// Returns the residual degrees of freedom in the model, i.e. the number
    /// of data points minus the number of parameters. Not to be confused with
    /// `test_ndf()`, the degrees of freedom in the statistical tests of the
    /// fit parameters.
    pub fn ndf(&self) -> F {
        self.data.n_obs() - self.rank()
    }

    pub(crate) fn new(
        data: &'a Dataset<F>,
        use_intercept: bool,
        irls: Irls<M, F>,
        history: Vec<IrlsStep<F>>,
    ) -> Self {
        let Irls {
            guess: result,
            options,
            reg,
            n_iter,
            last_like_data: data_like,
            ..
        } = irls;
        assert_eq!(
            data_like,
            M::log_like(data, &result),
            "Unregularized likelihoods should match exactly."
        );
        assert_eq!(
            n_iter,
            history.len(),
            "Number of iterations should match history length"
        );
        assert_eq!(
            result,
            history.last().unwrap().guess,
            "Last guess should be the same"
        );
        // Cache some of these variables that will be used often.
        let n_par = result.len();
        let model_like = data_like + reg.likelihood(&result);
        assert_eq!(
            model_like,
            history.last().unwrap().like,
            "Last data likelihood should be the same"
        );
        Self {
            model: PhantomData,
            data,
            use_intercept,
            result,
            options,
            model_like,
            reg,
            n_iter,
            history,
            n_par,
            cov_unscaled: RefCell::new(None),
            hat: RefCell::new(None),
            null_model: RefCell::new(None),
        }
    }

    /// Returns the likelihood given the null model, which fixes all parameters
    /// to zero except the intercept (if it is used). A total of `test_ndf()`
    /// parameters are constrained.
    pub fn null_like(&self) -> F {
        let (null_like, _) = self.null_model_fit();
        null_like
    }

    /// Return the likelihood and intercept for the null model. Since this can
    /// require an additional regression, the values are cached.
    fn null_model_fit(&self) -> (F, Array1<F>) {
        // TODO: make a result instead of allowing a potential panic in the borrow.
        if self.null_model.borrow().is_none() {
            let (null_like, null_intercept): (F, Array1<F>) = match &self.data.linear_offset {
                None => {
                    // If there is no linear offset, the natural parameter is
                    // identical for all observations so it is sufficient to
                    // calculate the null likelihood for a single point with y equal
                    // to the average.
                    // The average y
                    let y_bar: F = self.data.weighted_sum(&self.data.y) / self.data.sum_weights();
                    // This approach assumes that the likelihood is in the natural
                    // exponential form as calculated by Glm::log_like_natural(). If that
                    // function is overridden and the values differ significantly, this
                    // approach will give incorrect results. If the likelihood has terms
                    // non-linear in y, then the likelihood must be calculated for every
                    // point rather than averaged.
                    // If the intercept is allowed to maximize the likelihood, the natural
                    // parameter is equal to the link of the expectation. Otherwise it is
                    // the transformation function of zero.
                    let intercept: F = if self.use_intercept {
                        M::Link::func(y_bar)
                    } else {
                        F::zero()
                    };
                    // this is a length-one array. This works because the
                    // likelihood contribution is the same for all observations.
                    let nat_par = M::Link::nat_param(array![intercept]);
                    // The null likelihood per observation
                    let null_like_one: F = M::log_like_natural(y_bar, nat_par[0]);
                    // Scale the per-observation likelihood by the total weight, since every term is the same.
                    let null_like_total = self.data.sum_weights() * null_like_one;
                    let null_params: Array1<F> = {
                        let mut par = Array1::<F>::zeros(self.n_par);
                        par[0] = intercept;
                        par
                    };
                    (null_like_total, null_params)
                }
                Some(off) => {
                    if self.use_intercept {
                        // If there are linear offsets and the intercept is allowed
                        // to be free, there is not a major simplification and the
                        // model needs to be re-fit.
                        // The X data is a single column of ones. Since this model
                        // isn't being created by the ModelBuilder, the X data
                        // has to be padded with ones manually here.
                        let data_x_null = Array2::<F>::ones((self.data.y.len(), 1));
                        let null_model = Model {
                            model: std::marker::PhantomData::<M>,
                            data: Dataset::<F> {
                                y: self.data.y.clone(),
                                x: data_x_null,
                                linear_offset: Some(off.clone()),
                                weights: self.data.weights.clone(),
                                freqs: self.data.freqs.clone(),
                            },
                            // If we are in this branch it is because an intercept is needed.
                            use_intercept: true,
                        };
                        // TODO: Make this function return an error, although it's
                        // difficult to imagine this case happening.
                        // TODO: Should the tolerance of this fit be stricter?
                        // The intercept should not be regularized
                        let null_fit = null_model
                            .fit_options()
                            // There shouldn't be too much trouble fitting this
                            // single-parameter fit, but there shouldn't be harm in
                            // using the same maximum as in the original model.
                            .max_iter(self.options.max_iter)
                            .fit()
                            .expect("Could not fit null model!");
                        let null_params: Array1<F> = {
                            let mut par = Array1::<F>::zeros(self.n_par);
                            // there is only one parameter in this fit.
                            par[0] = null_fit.result[0];
                            par
                        };
                        (null_fit.model_like, null_params)
                    } else {
                        // If the intercept is fixed to zero, then no minimization is
                        // required. The natural parameters are directly known in terms
                        // of the linear offset. The likelihood must still be summed
                        // over all observations, since they have different offsets.
                        let nat_par = M::Link::nat_param(off.clone());
                        let null_like_terms = ndarray::Zip::from(&self.data.y)
                            .and(&nat_par)
                            .map_collect(|&y, &eta| M::log_like_natural(y, eta));
                        let null_like = self.data.weighted_sum(&null_like_terms);
                        let null_params = Array1::<F>::zeros(self.n_par);
                        (null_like, null_params)
                    }
                }
            };
            *self.null_model.borrow_mut() = Some((null_like, null_intercept));
        }
        self.null_model
            .borrow()
            .as_ref()
            .expect("the null model should be cached now")
            .clone()
    }

    /// Returns the expected value of Y given the input data X. This data need
    /// not be the training data, so an option for linear offsets is provided.
    /// Panics if the number of covariates in the data matrix is not consistent
    /// with the training set. The data matrix may need to be padded by ones if
    /// it is not part of a Model. The `utility::one_pad()` function facilitates
    /// this.
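    ///
    /// A sketch of predicting on new covariates (hypothetical data; assuming the crate-root
    /// re-exports of `ModelBuilder` and `Linear`, and the `utility::one_pad` helper referenced
    /// above):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{utility::one_pad, Linear, ModelBuilder};
    ///
    /// let y = array![0.3, -0.2, 0.5, 0.7];
    /// let x = array![[0.6], [2.1], [0.4], [-3.2]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // New data must carry the intercept column, so pad it with ones first.
    /// let x_new = array![[0.0], [1.0]];
    /// let y_pred = fit.predict(&one_pad(x_new.view()), None);
    /// ```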
    pub fn predict<S>(&self, data_x: &ArrayBase<S, Ix2>, lin_off: Option<&Array1<F>>) -> Array1<F>
    where
        S: Data<Elem = F>,
    {
        let lin_pred: Array1<F> = data_x.dot(&self.result);
        let lin_pred: Array1<F> = if let Some(off) = &lin_off {
            lin_pred + *off
        } else {
            lin_pred
        };
        lin_pred.mapv_into(M::Link::func_inv)
    }

    /// Returns the rank of the model (i.e. the number of parameters)
    fn rank(&self) -> F {
        F::from(self.n_par).unwrap()
    }

    /// Return the deviance residuals for each point in the training data.
    /// Equal to `sign(y-E[y|x])*sqrt(-2*(L[y|x] - L_sat[y]))`.
    /// This is usually a better choice for non-linear models.
    /// NaNs might be possible if L[y|x] > L_sat[y] due to floating-point operations. These are
    /// not checked or clipped right now.
    pub fn resid_dev(&self) -> Array1<F> {
        let signs = self.resid_resp().mapv_into(F::signum);
        let ll_diff = self.deviance_terms();
        let dev: Array1<F> = ll_diff.mapv_into(num_traits::Float::sqrt);
        signs * dev
    }

    /// Return the standardized deviance residuals, also known as the "internally studentized
    /// deviance residuals". This is generally applicable for outlier detection, although the
    /// influence of each point on the fit is only approximately accounted for.
    /// `d / sqrt(phi * (1 - h))` where `d` is the deviance residual, phi is the dispersion (e.g.
    /// sigma^2 for linear regression, 1 for logistic regression), and h is the leverage.
    pub fn resid_dev_std(&self) -> RegressionResult<Array1<F>> {
        let dev = self.resid_dev();
        let phi = self.dispersion();
        let hat: Array1<F> = self.leverage()?;
        let omh: Array1<F> = -hat + F::one();
        let denom: Array1<F> = (omh * phi).mapv_into(num_traits::Float::sqrt);
        Ok(dev / denom)
    }

    /// Return the partial residuals.
    pub fn resid_part(&self) -> Array1<F> {
        let x_mean = self.data.x.mean_axis(Axis(0)).expect("empty dataset");
        let x_centered = &self.data.x - x_mean.insert_axis(Axis(0));
        self.resid_work() + x_centered.dot(&self.result)
    }

    /// Return the Pearson residuals for each point in the training data.
    /// This is equal to `(y - E[y])/sqrt(V(E[y]))`, where V is the variance function.
    /// These are not scaled by the sample standard deviation for families with a free dispersion
    /// parameter like linear regression.
    pub fn resid_pear(&self) -> Array1<F> {
        let mu: Array1<F> = self.predict(&self.data.x, self.data.linear_offset.as_ref());
        let residuals = &self.data.y - &mu;
        let inv_var_diag: Array1<F> = mu.mapv_into(M::variance).mapv_into(F::recip);
        // the variance weights are the reciprocal of the corresponding variance
        let scales = self
            .data
            .apply_var_weights(inv_var_diag)
            .mapv_into(num_traits::Float::sqrt);
        scales * residuals
    }

    /// Return the standardized Pearson residuals for every observation.
    /// Also known as the "internally studentized Pearson residuals".
    /// (y - E[y]) / (sqrt(Var[y] * (1 - h))) where h is a vector representing the leverage for
    /// each observation. These are meant to have a variance of 1.
    pub fn resid_pear_std(&self) -> RegressionResult<Array1<F>> {
        let pearson = self.resid_pear();
        let phi = self.dispersion();
        let hat = self.leverage()?;
        let omh = -hat + F::one();
        let denom: Array1<F> = (omh * phi).mapv_into(num_traits::Float::sqrt);
        Ok(pearson / denom)
    }

    /// Return the response residuals, or fitting deviation, for each data point in the fit; that
    /// is, the difference y - E[y|x] where the expectation value is the y value predicted by the
    /// model given x.
    pub fn resid_resp(&self) -> Array1<F> {
        self.errors(self.data)
    }

    /// Return the studentized residuals, which are the deviance residuals at each point computed
    /// as if the fit is leaving out the corresponding observation. For families with a free
    /// dispersion parameter, the deviance is normalized by the one-step approximation to the
    /// dispersion.
    /// This is a robust and general method for outlier detection, although a one-step
    /// approximation is used to avoid re-fitting the model completely for each observation.
    /// If the linear errors are standard normally distributed then this statistic should follow a
    /// t-distribution with `self.ndf() - 1` degrees of freedom.
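    ///
    /// A sketch of flagging outlier candidates (hypothetical data; assuming `ModelBuilder` and
    /// `Linear` are re-exported at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
    /// let x = array![[0.4], [0.1], [0.3], [-0.1], [0.5], [0.6]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // Observations with large absolute studentized residuals are outlier candidates.
    /// let student = fit.resid_student().unwrap();
    /// ```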
    pub fn resid_student(&self) -> RegressionResult<Array1<F>> {
        let signs = self.resid_resp().mapv(F::signum);
        let dev_terms_loo: Array1<F> = self.deviance_terms_loo()?;
        // NOTE: This match could also be handled internally in dispersion_loo()
        let dev_terms_scaled = match M::DISPERSED {
            // The dispersion is corrected for the contribution from each current point.
            // This is an approximation; the exact solution would perform a fit at each point.
            DispersionType::FreeDispersion => dev_terms_loo / self.dispersion_loo()?,
            DispersionType::NoDispersion => dev_terms_loo,
        };
        Ok(signs * dev_terms_scaled.mapv_into(num_traits::Float::sqrt))
    }

    /// Returns the working residuals `dg(\mu)/d\mu * (y - E[y|x])`.
    /// This should be equal to the response residuals divided by the variance function (as
    /// opposed to the square root of the variance as in the Pearson residuals).
    pub fn resid_work(&self) -> Array1<F> {
        let lin_pred: Array1<F> = self.data.linear_predictor(&self.result);
        let mu: Array1<F> = lin_pred.mapv(M::Link::func_inv);
        let resid_response: Array1<F> = &self.data.y - &mu;
        let var: Array1<F> = mu.mapv(M::variance);
        // adjust for non-canonical link functions; we want a total factor of 1/eta'
        let (adj_response, adj_var) =
            M::Link::adjust_errors_variance(resid_response, var, &lin_pred);
        adj_response / adj_var
    }

    /// Returns the score function (the gradient of the log-likelihood) at the
    /// parameter values given. It should be zero to within floating-point error
    /// at the fit result.
    pub fn score(&self, params: &Array1<F>) -> Array1<F> {
        // This represents the predictions given the input parameters, not the
        // fit parameters.
        let lin_pred: Array1<F> = self.data.linear_predictor(params);
        let mu: Array1<F> = M::mean(&lin_pred);
        let resid_response = &self.data.y - mu;
        let resid_working = M::Link::adjust_errors(resid_response, &lin_pred);
        let score_unreg = self.data.x_conj().dot(&resid_working);
        self.reg.as_ref().gradient(score_unreg, params)
    }

    /// Returns the score test statistic. This statistic is asymptotically
    /// chi-squared distributed with `test_ndf()` degrees of freedom.
    pub fn score_test(&self) -> RegressionResult<F> {
        let (_, null_params) = self.null_model_fit();
        self.score_test_against(null_params)
    }

    /// Returns the score test statistic compared to another set of model
    /// parameters, not necessarily a null model. The degrees of freedom cannot
    /// be generally inferred.
    pub fn score_test_against(&self, alternative: Array1<F>) -> RegressionResult<F> {
        let score_alt = self.score(&alternative);
        let fisher_alt = self.fisher(&alternative);
        // This is not the same as the cached covariance matrix since it is
        // evaluated at the alternative parameters.
        // NOTE: invh/invh_into() are bugged and incorrect!
        let inv_fisher_alt = fisher_alt.inv_into()?;
        Ok(score_alt.t().dot(&inv_fisher_alt.dot(&score_alt)))
    }

    /// The degrees of freedom for the likelihood ratio test, the score test,
    /// and the Wald test. Not to be confused with `ndf()`, the degrees of
    /// freedom in the model fit.
    pub fn test_ndf(&self) -> usize {
        if self.use_intercept {
            self.n_par - 1
        } else {
            self.n_par
        }
    }

    /// Returns the Wald test statistic compared to a null model with only an
    /// intercept (if one is used). This statistic is asymptotically chi-squared
    /// distributed with `test_ndf()` degrees of freedom.
    pub fn wald_test(&self) -> F {
        // The null parameters are all zero except for a possible intercept term
        // which optimizes the null model.
        let (_, null_params) = self.null_model_fit();
        self.wald_test_against(&null_params)
    }

    /// Returns the Wald test statistic compared to another specified model fit
    /// instead of the null model. The degrees of freedom cannot be generally
    /// inferred.
    pub fn wald_test_against(&self, alternative: &Array1<F>) -> F {
        let d_params: Array1<F> = &self.result - alternative;
        let fisher_alt: Array2<F> = self.fisher(alternative);
        d_params.t().dot(&fisher_alt.dot(&d_params))
    }

    /// Returns the signed square root of the Wald test statistic for each
    /// parameter. Since it does not account for covariance between the
    /// parameters it may not be accurate.
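    ///
    /// A sketch of inspecting per-parameter z-scores (hypothetical data; assuming `ModelBuilder`
    /// and `Logistic` are re-exported at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Logistic, ModelBuilder};
    ///
    /// let y = array![true, true, false, true, false, false, true, false];
    /// let x = array![[0.4], [0.1], [-0.3], [0.7], [-0.5], [-0.1], [0.8], [1.0]];
    /// let model = ModelBuilder::<Logistic>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // One z-score per parameter; index 0 is the intercept when one is used.
    /// let z = fit.wald_z().unwrap();
    /// ```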
    pub fn wald_z(&self) -> RegressionResult<Array1<F>> {
        let par_cov = self.covariance()?;
        let par_variances: ArrayView1<F> = par_cov.diag();
        Ok(&self.result / &par_variances.mapv(num_traits::Float::sqrt))
    }
}

/// Specialized functions for OLS.
impl<'a, F> Fit<'a, Linear, F>
where
    F: 'static + Float,
{
    /// Returns the coefficient of determination, R^2.
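    ///
    /// A sketch for an ordinary linear fit (hypothetical data; assuming `ModelBuilder` and
    /// `Linear` are re-exported at the crate root):
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![-0.3, -0.1, 0.0, 0.2, 0.4, 0.5, 0.8];
    /// let x = array![[-0.5], [-0.2], [0.1], [0.2], [0.5], [0.6], [0.7]];
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// // The fraction of the variance in y accounted for by the fit; at most 1.
    /// let r2 = fit.r_sq();
    /// ```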
    pub fn r_sq(&self) -> F {
        let y_avg: F = self.data.y.mean().expect("Data should be non-empty");
        let total_sum_sq: F = self.data.y.mapv(|y| y - y_avg).mapv(|dy| dy * dy).sum();
        (total_sum_sq - self.resid_sum_sq()) / total_sum_sq
    }

    /// Returns the residual sum of squares, i.e. the sum of the squared residuals.
    pub fn resid_sum_sq(&self) -> F {
        self.resid_resp().mapv_into(|r| r * r).sum()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        model::ModelBuilder,
        utility::{one_pad, standardize},
        Linear, Logistic,
    };
    use anyhow::Result;
    use approx::assert_abs_diff_eq;
    use ndarray::Axis;
    use ndarray::{concatenate, s};

    /// Checks that the test statistics are invariant under standardization of the data.
    #[test]
    fn standardization_invariance() -> Result<()> {
        let data_y = array![true, false, false, true, true, true, true, false, true];
        let data_x = array![-0.5, 0.3, -0.6, 0.2, 0.3, 1.2, 0.8, 0.6, -0.2].insert_axis(Axis(1));
        let lin_off = array![0.1, 0.0, -0.1, 0.2, 0.1, 0.3, 0.4, -0.1, 0.1];
        let data_x_std = standardize(data_x.clone());
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .linear_offset(lin_off.clone())
            .build()?;
        let fit = model.fit()?;
        let model_std = ModelBuilder::<Logistic>::data(&data_y, &data_x_std)
            .linear_offset(lin_off)
            .build()?;
        let fit_std = model_std.fit()?;
        let lr = fit.lr_test();
        let lr_std = fit_std.lr_test();
        assert_abs_diff_eq!(lr, lr_std);
        eprintln!("about to try score test");
        assert_abs_diff_eq!(
            fit.score_test()?,
            fit_std.score_test()?,
            epsilon = f32::EPSILON as f64
        );
        eprintln!("about to try wald test");
        assert_abs_diff_eq!(
            fit.wald_test(),
            fit_std.wald_test(),
            epsilon = 4.0 * f64::EPSILON
        );
        assert_abs_diff_eq!(fit.aic(), fit_std.aic());
        assert_abs_diff_eq!(fit.bic(), fit_std.bic());
        eprintln!("about to try deviance");
        assert_abs_diff_eq!(fit.deviance(), fit_std.deviance());
        // The Wald Z-score of the intercept term is not invariant under a
        // linear transformation of the data, but the parameter part seems to
        // be, at least for single-component data.
        assert_abs_diff_eq!(
            fit.wald_z()?[1],
            fit_std.wald_z()?[1],
            epsilon = 0.01 * f32::EPSILON as f64
        );

        Ok(())
    }

    #[test]
    fn null_model() -> Result<()> {
        let data_y = array![true, false, false, true, true];
        let data_x: Array2<f64> = array![[], [], [], [], []];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        dbg!(fit.n_iter);
        dbg!(&fit.result);
        // with no offsets, the result should be the link function of the mean.
        assert_abs_diff_eq!(
            fit.result[0],
            <Logistic as Glm>::Link::func(0.6),
            epsilon = 4.0 * f64::EPSILON
        );
        let empty_null_like = fit.null_like();
        assert_eq!(fit.test_ndf(), 0);
        dbg!(&fit.model_like);
        let lr = fit.lr_test();
        // Since there are no covariates, the null likelihood should be identical to
        // the fit likelihood, so the likelihood ratio test should yield zero.
        assert_abs_diff_eq!(lr, 0., epsilon = 4. * f64::EPSILON);

        // Check that the assertions still hold if linear offsets are included.
        let lin_off: Array1<f64> = array![0.2, -0.1, 0.1, 0.0, 0.1];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x)
            .linear_offset(lin_off)
            .build()?;
        let fit_off = model.fit()?;
        let empty_model_like_off = fit_off.model_like;
        let empty_null_like_off = fit_off.null_like();
        // these two assertions should be equivalent
        assert_abs_diff_eq!(fit_off.lr_test(), 0.);
        assert_abs_diff_eq!(empty_model_like_off, empty_null_like_off);

        // check consistency with data provided
        let data_x_with = array![[0.5], [-0.2], [0.3], [0.4], [-0.1]];
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x_with).build()?;
        let fit_with = model.fit()?;
        dbg!(&fit_with.result);
        // The null likelihood of the model with parameters should be the same
        // as the likelihood of the model with only the intercept.
        assert_abs_diff_eq!(empty_null_like, fit_with.null_like());

        Ok(())
    }

    #[test]
    fn null_like_logistic() -> Result<()> {
        // 6 true and 4 false for y_bar = 0.6.
        let data_y = array![true, true, true, true, true, true, false, false, false, false];
        let ybar: f64 = 0.6;
        let data_x = array![0.4, 0.2, 0.5, 0.1, 0.6, 0.7, 0.3, 0.8, -0.1, 0.1].insert_axis(Axis(1));
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        let target_null_like = fit
            .data
            .y
            .mapv(|y| {
                let eta = (ybar / (1. - ybar)).ln();
                y * eta - eta.exp().ln_1p()
            })
            .sum();
        assert_abs_diff_eq!(fit.null_like(), target_null_like);
        Ok(())
    }

    // Check that the deviance is equal to the sum of squared deviations for a linear model
    #[test]
    fn deviance_linear() -> Result<()> {
        let data_y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4, 1.1, 0.2];
        let data_x = array![0.6, 2.1, 0.4, -3.2, 0.7, 0.1, -0.3, 0.5].insert_axis(Axis(1));
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        // The predicted values of Y given the model.
        let pred_y = fit.predict(&one_pad(data_x.view()), None);
        let target_dev = (data_y - pred_y).mapv(|dy| dy * dy).sum();
        assert_abs_diff_eq!(fit.deviance(), target_dev, epsilon = 4. * f64::EPSILON);
        Ok(())
    }

    // Check that the deviance and dispersion parameter are equal up to the number of degrees of
    // freedom for a linear model.
    #[test]
    fn deviance_dispersion_eq_linear() -> Result<()> {
        let data_y = array![0.2, -0.1, 0.4, 1.3, 0.2, -0.6, 0.9];
        let data_x = array![
            [0.4, 0.2],
            [0.1, 0.4],
            [-0.1, 0.3],
            [0.5, 0.7],
            [0.4, 0.1],
            [-0.2, -0.3],
            [0.4, -0.1]
        ];
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        let dev = fit.deviance();
        let disp = fit.dispersion();
        let ndf = fit.ndf();
        assert_abs_diff_eq!(dev, disp * ndf, epsilon = 4. * f64::EPSILON);
        Ok(())
    }

    // Check that the residuals for a linear model are all consistent.
    #[test]
    fn residuals_linear() -> Result<()> {
        let data_y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
        let weights = array![0.8, 1.2, 0.9, 0.8, 1.1, 0.9];
        // the implied standard deviations from the weights
        let wgt_sigmas = weights.map(|w: &f64| 1. / w.sqrt());
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
            .var_weights(weights.clone())
            .build()?;
        let fit = model.fit()?;
        let response = fit.resid_resp();
        let resp_scaled = &response / wgt_sigmas;
        let pearson = fit.resid_pear();
        let deviance = fit.resid_dev();
        assert_abs_diff_eq!(resp_scaled, pearson);
        assert_abs_diff_eq!(resp_scaled, deviance);
        let pearson_std = fit.resid_pear_std()?;
        let deviance_std = fit.resid_dev_std()?;
        assert_abs_diff_eq!(pearson_std, deviance_std, epsilon = 8. * f64::EPSILON);
        // The externally-studentized residuals aren't expected to match the internally-studentized
        // ones.
        let dev_terms_loo = fit.deviance_terms_loo()?;
        let disp_terms_loo = fit.dispersion_loo()?;
        let student = fit.resid_student()?;

        // NOTE: Studentization can't be checked directly in general because the method used is a
        // one-step approximation; however, it should be exact in the linear OLS case.
        let n_data = data_y.len();
        // Check that the leave-one-out stats hold exactly
        let mut loo_diff: Vec<f64> = Vec::new();
        let mut loo_dev_res: Vec<f64> = Vec::new();
        let mut loo_disp: Vec<f64> = Vec::new();
        for i in 0..n_data {
            let ya = data_y.slice(s![0..i]);
            let yb = data_y.slice(s![i + 1..]);
            let xa = data_x.slice(s![0..i, ..]);
            let xb = data_x.slice(s![i + 1.., ..]);
            let wa = weights.slice(s![0..i]);
            let wb = weights.slice(s![i + 1..]);
            let y_loo = concatenate![Axis(0), ya, yb];
            let x_loo = concatenate![Axis(0), xa, xb];
            let w_loo = concatenate![Axis(0), wa, wb];
            let model_i = ModelBuilder::<Linear>::data(&y_loo, &x_loo)
                .var_weights(w_loo)
                .build()?;
            let fit_i = model_i.fit()?;
            let yi = data_y[i];
            let xi = data_x.slice(s![i..i + 1, ..]);
            let xi = crate::utility::one_pad(xi);
            let wi = weights[i];
            let yi_pred: f64 = fit_i.predict(&xi, None)[0];
            let disp_i = fit_i.dispersion();
            let var_i = disp_i / wi;
            let diff_i = yi - yi_pred;
            let res_dev_i = diff_i / var_i.sqrt();
            loo_diff.push(wi * diff_i * diff_i);
            loo_disp.push(disp_i);
            loo_dev_res.push(res_dev_i);
        }
        let loo_diff: Array1<f64> = loo_diff.into();
        let loo_disp: Array1<f64> = loo_disp.into();
        let loo_dev_res: Array1<f64> = loo_dev_res.into();
        assert_abs_diff_eq!(dev_terms_loo, loo_diff, epsilon = 8. * f64::EPSILON);
        assert_abs_diff_eq!(disp_terms_loo, loo_disp, epsilon = 8. * f64::EPSILON);
        assert_abs_diff_eq!(student, loo_dev_res, epsilon = 8. * f64::EPSILON);
        Ok(())
    }

    // check the null likelihood for the case where it can be calculated exactly.
    #[test]
    fn null_like_linear() -> Result<()> {
        let data_y = array![0.3, -0.2, 0.5, 0.7, 0.2, 1.4, 1.1, 0.2];
        let data_x = array![0.6, 2.1, 0.4, -3.2, 0.7, 0.1, -0.3, 0.5].insert_axis(Axis(1));
        let ybar: f64 = data_y.mean().unwrap();
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        // let target_null_like = data_y.mapv(|y| -0.5 * (y - ybar) * (y - ybar)).sum();
        let target_null_like = data_y.mapv(|y| y * ybar - 0.5 * ybar * ybar).sum();
        // With the saturated likelihood subtracted the null likelihood should
        // just be the sum of squared differences from the mean.
        // let target_null_like = 0.;
        // dbg!(target_null_like);
        let fit_null_like = fit.null_like();
        assert_abs_diff_eq!(2. * (fit.model_like - fit_null_like), fit.lr_test());
        assert_eq!(fit.test_ndf(), 1);
        assert_abs_diff_eq!(
            fit_null_like,
            target_null_like,
            epsilon = 4.0 * f64::EPSILON
        );
        Ok(())
    }

    // check the leave-one-out one-step approximation for the linear model
    #[test]
    fn loo_linear() -> Result<()> {
        let data_y = array![0.1, -0.3, 0.7, 0.2, 1.2, -0.4];
        let data_x = array![0.4, 0.1, 0.3, -0.1, 0.5, 0.6].insert_axis(Axis(1));
        let weights = array![1.0, 1.2, 0.8, 1.1, 1.0, 0.7];
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x)
            .var_weights(weights.clone())
            .build()?;
        let fit = model.fit()?;

        let loo_coef: Array2<f64> = fit.infl_coef()?;
        let loo_results = &fit.result - loo_coef;
        let n_data = data_y.len();
        for i in 0..n_data {
            let ya = data_y.slice(s![0..i]);
            let yb = data_y.slice(s![i + 1..]);
            let xa = data_x.slice(s![0..i, ..]);
            let xb = data_x.slice(s![i + 1.., ..]);
            let wa = weights.slice(s![0..i]);
            let wb = weights.slice(s![i + 1..]);
            let y_loo = concatenate![Axis(0), ya, yb];
            let x_loo = concatenate![Axis(0), xa, xb];
            let w_loo = concatenate![Axis(0), wa, wb];
            let model_i = ModelBuilder::<Linear>::data(&y_loo, &x_loo)
                .var_weights(w_loo)
                .build()?;
            let fit_i = model_i.fit()?;
            assert_abs_diff_eq!(
                loo_results.row(i),
                &fit_i.result,
                epsilon = f32::EPSILON as f64
            );
        }
        Ok(())
    }

    // check the null likelihood where there is no dependence on the X data.
    #[test]
    fn null_like_logistic_nodep() -> Result<()> {
        let data_y = array![true, true, false, false, true, false, false, true];
        let data_x = array![0.4, 0.2, 0.4, 0.2, 0.7, 0.7, -0.1, -0.1].insert_axis(Axis(1));
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        let lr = fit.lr_test();
        assert_abs_diff_eq!(lr, 0.);
        Ok(())
    }
    // TODO: Test that the statistics behave sensibly under regularization. The
    // likelihood ratio test should yield a smaller value.

    // Test the basic caching functions.
    #[test]
    fn cached_computations() -> Result<()> {
        let data_y = array![true, true, false, true, true, false, false, false, true];
        let data_x = array![0.4, 0.1, -0.3, 0.7, -0.5, -0.1, 0.8, 1.0, 0.4].insert_axis(Axis(1));
        let model = ModelBuilder::<Logistic>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        let _null_like = fit.null_like();
        let _null_like = fit.null_like();
        let _cov = fit.covariance()?;
        let _wald = fit.wald_z();
        Ok(())
    }

    // Check the consistency of the various statistical tests for linear
    // regression, where they should be the most comparable.
    #[test]
    fn linear_stat_tests() -> Result<()> {
        let data_y = array![-0.3, -0.1, 0.0, 0.2, 0.4, 0.5, 0.8, 0.8, 1.1];
        let data_x = array![-0.5, -0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.3].insert_axis(Axis(1));
        let model = ModelBuilder::<Linear>::data(&data_y, &data_x).build()?;
        let fit = model.fit()?;
        let lr = fit.lr_test();
        let wald = fit.wald_test();
        let score = fit.score_test()?;
        assert_abs_diff_eq!(lr, wald, epsilon = 32.0 * f64::EPSILON);
        assert_abs_diff_eq!(lr, score, epsilon = 32.0 * f64::EPSILON);
        Ok(())
    }
}