Skip to main content

ndarray_glm/
irls.rs

//! Iteratively re-weighted least squares algorithm
2use crate::{
3    data::Dataset,
4    error::{RegressionError, RegressionResult},
5    fit::options::FitOptions,
6    glm::Glm,
7    link::Transform,
8    model::Model,
9    num::Float,
10    regularization::*,
11};
12use ndarray::{Array1, Array2, ArrayRef2};
13use ndarray_linalg::SolveH;
14use std::marker::PhantomData;
15
/// Iterate over updates via iteratively re-weighted least-squares until
/// reaching a specified tolerance.
pub(crate) struct Irls<'a, M, F>
where
    M: Glm,
    F: Float,
    ArrayRef2<F>: SolveH<F>,
{
    /// Zero-sized marker tying this solver to the GLM family `M` without storing an instance.
    model: PhantomData<M>,
    /// The dataset (X, y, and optional linear offsets) being fit, borrowed for the
    /// duration of the iteration.
    data: &'a Dataset<F>,
    /// The current parameter guess. No longer public because the full history is passed and this
    /// may not end up being the optimal one. make_last_step() can be used to build an IrlsStep
    /// from the current values, which is primarily used as a fallback when no iterations occur.
    guess: Array1<F>,
    /// The options for the fit (tolerance, max iterations, regularization strengths, ...)
    pub(crate) options: FitOptions<F>,
    /// The regularizer object, which may be stateful (e.g. ADMM dual variables for L1)
    pub(crate) reg: Box<dyn IrlsReg<F>>,
    /// The number of iterations taken so far
    pub n_iter: usize,
    /// The data likelihood for the previous iteration, unregularized and unaugmented.
    /// This is cached separately from the guess because it demands expensive matrix
    /// multiplications. The augmented and/or regularized terms are relatively cheap, so they
    /// aren't stored.
    last_like_data: F,
    /// Sometimes the next guess is better than the previous but within
    /// tolerance, so we want to return the current guess but exit immediately
    /// in the next iteration.
    done: bool,
    /// Internally track the fit history, and allow the Fit to expose it. Note that this history is
    /// stored on the internal standardized scale as it can know nothing about any external
    /// transformations.
    pub(crate) history: Vec<IrlsStep<F>>,
}
50
impl<'a, M, F> Irls<'a, M, F>
where
    M: Glm,
    F: Float,
    ArrayRef2<F>: SolveH<F>,
{
    /// Initialize the IRLS iteration for `model` starting at `initial`.
    /// The initial (unregularized) data log-likelihood is computed once here so
    /// the first call to `next()` can compare against it without repeating the
    /// expensive matrix work.
    pub(crate) fn new(model: &'a Model<M, F>, initial: Array1<F>, options: FitOptions<F>) -> Self {
        let data = &model.data;
        // Build the regularizer from the L1/L2 options; get_reg() excludes the
        // intercept term from the penalty when one is present.
        let reg = get_reg(&options, data.x.ncols(), data.has_intercept);
        let initial_like_data: F = M::log_like(data, &initial);
        Self {
            model: PhantomData,
            data,
            guess: initial,
            options,
            reg,
            n_iter: 0,
            last_like_data: initial_like_data,
            done: false,
            history: Vec::new(),
        }
    }

    /// A helper function to step to a new guess, while incrementing the number
    /// of iterations and checking that it is not over the maximum.
    /// Updates the cached guess and data likelihood, pushes the (regularized)
    /// step onto the history, and returns the step or a `MaxIter` error.
    fn step_with(&mut self, next_guess: Array1<F>, next_like_data: F) -> <Self as Iterator>::Item {
        self.guess.assign(&next_guess);
        self.last_like_data = next_like_data;
        // The reported likelihood is the data likelihood plus the (unaugmented)
        // regularization term; this must be applied the same way as in
        // make_last_step() below.
        let model_like = next_like_data + self.reg.likelihood(&next_guess);
        let step = IrlsStep {
            guess: next_guess,
            like: model_like,
        };
        self.history.push(step.clone());
        self.n_iter += 1;
        if self.n_iter > self.options.max_iter {
            // The full history is attached to the error so callers can still
            // inspect the partial fit.
            return Err(RegressionError::MaxIter {
                n_iter: self.options.max_iter,
                history: self.history.clone(),
            });
        }
        Ok(step)
    }

    /// Returns the (LHS, RHS) of the IRLS update matrix equation. This is a bit
    /// faster than computing the Fisher matrix and the Jacobian separately.
    /// The returned matrix and vector are not regularized.
    // TODO: re-factor to have the distributions compute the fisher information,
    // as that is useful in the score test as well.
    fn irls_mat_vec(&self) -> (Array2<F>, Array1<F>) {
        // The linear predictor without control terms
        let linear_predictor_no_control: Array1<F> = self.data.x.dot(&self.guess);
        // the linear predictor given the model, including offsets if present
        let linear_predictor = match &self.data.linear_offset {
            Some(off) => &linear_predictor_no_control + off,
            None => linear_predictor_no_control.clone(),
        };
        // The data.linear_predictor() function is not used above because we will use
        // both versions, with and without the linear offset, and we don't want
        // to repeat the matrix multiplication.

        // Similarly, we don't use M::get_adjusted_variance(linear_predictor) because intermediate
        // results are used as well, and we need to correct the error term too.

        // The prediction of y given the current model.
        // This does cause an unnecessary clone with an identity link, but we
        // need the linear predictor around for the future.
        let predictor: Array1<F> = M::mean(&linear_predictor);

        // The variances predicted by the model. This should have weights with
        // it and must be non-zero.
        // This could become a full covariance with weights.
        let var_diag: Array1<F> = predictor.mapv(M::variance);

        // The errors represent the difference between observed and predicted.
        let errors = &self.data.y - &predictor;

        // Adjust the errors and variance using the appropriate derivatives of
        // the link function. With the canonical link function, this is a no-op.
        let (errors, var_diag) =
            M::Link::adjust_errors_variance(errors, var_diag, &linear_predictor);

        // condition after the adjustment in case the derivatives are zero. Or
        // should the Hessian itself be conditioned?
        // TODO: allow the variance conditioning to be a configurable parameter.
        let var_diag: Array1<F> = var_diag.mapv_into(|v| v + F::epsilon());

        // X weighted by the model variance for each observation
        // This is really the negative Hessian of the likelihood.
        // NOTE(review): x_conj() appears to be the (conjugate) transpose of X, so this
        // broadcast-multiply plus dot forms X^H * W * X — confirm against Dataset.
        let neg_hessian: Array2<F> = (self.data.x_conj() * &var_diag).dot(&self.data.x);

        // This isn't quite the jacobian because the H*beta_old term is subtracted out.
        let rhs: Array1<F> = {
            // NOTE: This w*X should not include the linear offset, because it
            // comes from the Hessian times the last guess.
            let target: Array1<F> = (var_diag * linear_predictor_no_control) + errors;
            let target: Array1<F> = self.data.x_conj().dot(&target);
            target
        };
        (neg_hessian, rhs)
    }

    /// Get the most recent step as a new object. This is most useful as a fallback when there
    /// are no iterations because the iteration started at the maximum, and we need to recover the
    /// default with proper regularization. It could also be used within step_with() itself, after
    /// setting the guess and likelihood, though that could be a little opaque.
    pub(crate) fn make_last_step(&self) -> IrlsStep<F> {
        // It's crucial that this regularization is applied the same way here as it is in
        // step_with().
        let model_like = self.last_like_data + self.reg.likelihood(&self.guess);
        IrlsStep {
            guess: self.guess.clone(),
            like: model_like,
        }
    }
}
167
/// Represents a step in the IRLS. Holds the current guess and likelihood.
#[derive(Clone, Debug)]
pub struct IrlsStep<F> {
    /// The current parameter guess (on the internal standardized scale).
    pub guess: Array1<F>,
    /// The regularized log-likelihood of the current guess.
    pub like: F,
    // TODO: Consider tracking data likelihood, regularized likelihood, and augmented likelihood
    // separately.
}
178
impl<'a, M, F> Iterator for Irls<'a, M, F>
where
    M: Glm,
    F: Float,
    ArrayRef2<F>: SolveH<F>,
{
    /// Each iteration yields either the accepted step or a regression error
    /// (e.g. exceeding the iteration limit or a failed linear solve).
    type Item = RegressionResult<IrlsStep<F>, F>;

    /// Acquire the next IRLS step based on the previous one.
    /// Returns `None` when converged (or when no strict improvement is possible).
    fn next(&mut self) -> Option<Self::Item> {
        // if the last step was an improvement but within tolerance, this step
        // has been flagged to terminate early.
        if self.done {
            return None;
        }

        // Solve the (regularized) weighted least-squares update for the next guess.
        let (irls_mat, irls_vec) = self.irls_mat_vec();
        let next_guess: Array1<F> = match self.reg.next_guess(&self.guess, irls_vec, irls_mat) {
            Ok(solution) => solution,
            Err(err) => return Some(Err(err)),
        };

        // This is the raw, unregularized and unaugmented
        let next_like_data = M::log_like(self.data, &next_guess);

        // The augmented likelihood to maximize may not be the same as the regularized model
        // likelihood.
        // NOTE: This must be computed after self.reg.next_guess() is called, because that step can
        // change the penalty parameter in ADMM. last_like_obj does not represent the previous
        // objective; it represents the current version of the objective function using the
        // previous guess. These may be different because the augmentation parameter and dual
        // variables for the regularization can change.
        let last_like_obj = self.last_like_data + self.reg.irls_like(&self.guess);
        let next_like_obj = next_like_data + self.reg.irls_like(&next_guess);

        // If the regularization isn't convergent (which should only happen with ADMM in
        // L1/ElasticNet), precision tolerance checks don't matter and we should continue on with
        // the iteration.
        if !self.reg.terminate_ok(self.options.tol) {
            return Some(self.step_with(next_guess, next_like_data));
        }

        // If the next guess is literally equal to the last, then additional IRLS or step-halving
        // procedures won't get use anywhere further.
        // This should be true even under ADMM, given that it's converged per the previous check.
        // This will fire if the optimum is passed in as the initial guess, in which case the
        // iteration will have length zero and the GLM regression function will query this IRLS
        // object for its initial step object.
        if next_guess == self.guess {
            return None;
        }

        // Terminate when both the likelihood change and the parameter step are within tolerance.
        // This check comes before the strict-improvement shortcut so that problems where every
        // step is a tiny improvement (e.g. logistic with y = p_true) converge rather than
        // exhausting max_iter.
        // If ADMM hasn't converged, we've already iterated to the next step, so we don't need to
        // check self.reg.terminate_ok() again.
        // The likelihood tolerance is scaled by the number of observations since the
        // log-likelihood is a sum over observations.
        let n_obs = F::from(self.data.y.len()).unwrap();
        if small_delta(next_like_obj, last_like_obj, self.options.tol * n_obs)
            && small_delta_vec(&next_guess, &self.guess, self.options.tol)
        {
            self.done = true;
        }

        // If this guess is a strict improvement, continue. If we're within convergence tolerance
        // at this stage, this will be the last step.
        if next_like_obj > last_like_obj {
            return Some(self.step_with(next_guess, next_like_data));
        }

        // apply step halving if the new likelihood is the same or worse as the previous guess.
        // NOTE: It's difficult to engage the step halving because it's rarely necessary, so this
        // part of the algorithm is undertested. It may be more common using L1 regularization.
        // next_guess != self.guess because we've already checked that case above.
        // f_step(x) evaluates the objective at the convex combination
        // x * next_guess + (1 - x) * guess.
        let f_step = |x: F| {
            let b = &next_guess * x + &self.guess * (F::one() - x);
            // The augmented and unaugmented checks should be close to equivalent at this point
            // because the regularization has reported that the internals have converged via
            // `terminate_ok()`. Since we are potentially finding the final best guess, look for
            // the best model likelihood in the step search.
            // NOTE: It's possible there are some edge cases with an inconsistency, given that the
            // checks above use the augmented likelihood and this checks directly.
            M::log_like(self.data, &b) + self.reg.likelihood(&b) // unaugmented
            // M::log_like(self.data, &b) + self.reg.irls_like(&b) // augmented
        };
        // Scale the step-search tolerance by ~|beta| so it behaves sensibly for both
        // small and large coefficient magnitudes.
        let beta_tol_factor = num_traits::Float::sqrt(self.guess.mapv(|b| F::one() + b * b).sum());
        let step_mult: F = step_scale(&f_step, beta_tol_factor * self.options.tol);
        if step_mult.is_zero() {
            // can't find a strictly better minimum if the step multiplier returns zero
            return None;
        }

        // If the step multiplier is not zero, it found a better guess
        let next_guess = &next_guess * step_mult + &self.guess * (F::one() - step_mult);
        let next_like_data = M::log_like(self.data, &next_guess);

        Some(self.step_with(next_guess, next_like_data))
    }
}
279
280fn small_delta<F>(new: F, old: F, tol: F) -> bool
281where
282    F: Float,
283{
284    num_traits::Float::abs(new - old) <= tol
285}
286
287fn small_delta_vec<F>(new: &Array1<F>, old: &Array1<F>, tol: F) -> bool
288where
289    F: Float,
290{
291    // this method interpolates between relative and absolute differences
292    let delta = new - old;
293    let n = F::from(delta.len()).unwrap();
294
295    let new2: F = new.mapv(|d| d * d).sum();
296    let delta2: F = delta.mapv(|d| d * d).sum();
297
298    // use sum of absolute values to indicate magnitude of beta
299    // sum of squares might be better
300    delta2 <= (n + new2) * tol * tol
301}
302
303/// Zero the first element of the array `l` if `use_intercept == true`
304fn zero_first_maybe<F>(mut l: Array1<F>, use_intercept: bool) -> Array1<F>
305where
306    F: Float,
307{
308    // if an intercept term is included it should not be subject to
309    // regularization.
310    if use_intercept {
311        l[0] = F::zero();
312    }
313    l
314}
315
316/// Generate a regularizer from the set of options
317fn get_reg<F: Float>(
318    options: &FitOptions<F>,
319    n: usize,
320    use_intercept: bool,
321) -> Box<dyn IrlsReg<F>> {
322    if options.l1 < F::zero() || options.l2 < F::zero() {
323        eprintln!("WARNING: regularization parameters should not be negative.");
324    }
325    let use_l1 = options.l1 > F::zero();
326    let use_l2 = options.l2 > F::zero();
327
328    if use_l1 && use_l2 {
329        let l1_diag: Array1<F> = Array1::<F>::from_elem(n, options.l1);
330        let l1_diag: Array1<F> = zero_first_maybe(l1_diag, use_intercept);
331        let l2_diag: Array1<F> = Array1::<F>::from_elem(n, options.l2);
332        let l2_diag: Array1<F> = zero_first_maybe(l2_diag, use_intercept);
333        Box::new(ElasticNet::from_diag(l1_diag, l2_diag))
334    } else if use_l2 {
335        let l2_diag: Array1<F> = Array1::<F>::from_elem(n, options.l2);
336        let l2_diag: Array1<F> = zero_first_maybe(l2_diag, use_intercept);
337        Box::new(Ridge::from_diag(l2_diag))
338    } else if use_l1 {
339        let l1_diag: Array1<F> = Array1::<F>::from_elem(n, options.l1);
340        let l1_diag: Array1<F> = zero_first_maybe(l1_diag, use_intercept);
341        Box::new(Lasso::from_diag(l1_diag))
342    } else {
343        Box::new(Null {})
344    }
345}
346
347/// Find a better step scale to optimize and objective function.
348/// Looks for a new solution better than x = 1 looking first at 0 < x < 1 and returning any value
349/// found to be a strict improvement.
350/// If none are found, it will check a single negative step.
351fn step_scale<F: Float>(f: &dyn Fn(F) -> F, tol: F) -> F {
352    let tol = num_traits::Float::abs(tol);
353    // TODO: Add list of values to explicitly try (for instance with zeroed parameters)
354
355    let zero: F = F::zero();
356    let one: F = F::one();
357    // `scale = 0.5` should also work, but using the golden ratio is prettier and might be less
358    // likely to fail in pathological cases.
359    let scale = F::from(0.618033988749894).unwrap();
360    let mut x: F = one;
361    let f0: F = f(zero);
362
363    // start at scale < 1, since 1 has already been checked.
364    while x > tol {
365        x *= scale;
366        let fx = f(x);
367        if fx > f0 {
368            return x;
369        }
370    }
371
372    // If f(1) > f(0), then an improvement has already been found. However, if the optimization is
373    // languishing, it could be useful to try x > 1. It's pretty rare to get to this state,
374    // however.
375
376    // If we're here a strict improvement hasn't been found, but it's possible that the likelihoods
377    // are equal.
378    // check a single step in the negative direction, in case this is an improvement.
379    if f(-scale) > f0 {
380        return -scale;
381    }
382
383    // Nothing checked has been an improvement, so return zero.
384    F::zero()
385}