ndarray_glm/irls.rs
//! Iteratively re-weighted least squares algorithm
2use crate::{
3 data::Dataset,
4 error::{RegressionError, RegressionResult},
5 fit::options::FitOptions,
6 glm::Glm,
7 link::Transform,
8 model::Model,
9 num::Float,
10 regularization::*,
11};
12use ndarray::{Array1, Array2, ArrayRef2};
13use ndarray_linalg::SolveH;
14use std::marker::PhantomData;
15
/// Iterate over updates via iteratively re-weighted least-squares until
/// reaching a specified tolerance.
///
/// Each call to `next()` yields one IRLS step (or an error); the caller drives
/// the iteration and can inspect the accumulated `history`.
pub(crate) struct Irls<'a, M, F>
where
    M: Glm,
    F: Float,
    ArrayRef2<F>: SolveH<F>,
{
    /// Zero-sized marker tying this solver to the GLM family `M`; no runtime data.
    model: PhantomData<M>,
    /// The borrowed dataset: design matrix `x`, response `y`, optional
    /// `linear_offset`, and intercept flag are all read during iteration.
    data: &'a Dataset<F>,
    /// The current parameter guess. No longer public because the full history is passed and this
    /// may not end up being the optimal one. make_last_step() can be used to build an IrlsStep
    /// from the current values, which is primarily used as a fallback when no iterations occur.
    guess: Array1<F>,
    /// The options for the fit
    pub(crate) options: FitOptions<F>,
    /// The regularizer object, which may be stateful
    pub(crate) reg: Box<dyn IrlsReg<F>>,
    /// The number of iterations taken so far
    pub n_iter: usize,
    /// The data likelihood for the previous iteration, unregularized and unaugmented.
    /// This is cached separately from the guess because it demands expensive matrix
    /// multiplications. The augmented and/or regularized terms are relatively cheap, so they
    /// aren't stored.
    last_like_data: F,
    /// Sometimes the next guess is better than the previous but within
    /// tolerance, so we want to return the current guess but exit immediately
    /// in the next iteration.
    done: bool,
    /// Internally track the fit history, and allow the Fit to expose it. Note that this history is
    /// stored on the internal standardized scale as it can know nothing about any external
    /// transformations.
    pub(crate) history: Vec<IrlsStep<F>>,
}
50
impl<'a, M, F> Irls<'a, M, F>
where
    M: Glm,
    F: Float,
    ArrayRef2<F>: SolveH<F>,
{
    /// Construct the IRLS iterator for `model`, starting at the parameter
    /// vector `initial` with the given fit options. The data likelihood at the
    /// starting point is computed up front so the first iteration has a
    /// cached baseline to compare against.
    pub(crate) fn new(model: &'a Model<M, F>, initial: Array1<F>, options: FitOptions<F>) -> Self {
        let data = &model.data;
        // The regularizer depends on the options, the number of parameters
        // (columns of X), and whether the first parameter is an unpenalized
        // intercept.
        let reg = get_reg(&options, data.x.ncols(), data.has_intercept);
        let initial_like_data: F = M::log_like(data, &initial);
        Self {
            model: PhantomData,
            data,
            guess: initial,
            options,
            reg,
            n_iter: 0,
            last_like_data: initial_like_data,
            done: false,
            history: Vec::new(),
        }
    }

    /// A helper function to step to a new guess, while incrementing the number
    /// of iterations and checking that it is not over the maximum.
    ///
    /// Updates the cached guess and data likelihood, records the step in
    /// `history`, and returns an error carrying the full history when the
    /// iteration limit is exceeded.
    fn step_with(&mut self, next_guess: Array1<F>, next_like_data: F) -> <Self as Iterator>::Item {
        self.guess.assign(&next_guess);
        self.last_like_data = next_like_data;
        // The reported likelihood is the regularized model likelihood, not
        // the (possibly augmented) internal objective used for convergence.
        let model_like = next_like_data + self.reg.likelihood(&next_guess);
        let step = IrlsStep {
            guess: next_guess,
            like: model_like,
        };
        self.history.push(step.clone());
        self.n_iter += 1;
        if self.n_iter > self.options.max_iter {
            return Err(RegressionError::MaxIter {
                n_iter: self.options.max_iter,
                history: self.history.clone(),
            });
        }
        Ok(step)
    }

    /// Returns the (LHS, RHS) of the IRLS update matrix equation. This is a bit
    /// faster than computing the Fisher matrix and the Jacobian separately.
    /// The returned matrix and vector are not regularized.
    // TODO: re-factor to have the distributions compute the fisher information,
    // as that is useful in the score test as well.
    fn irls_mat_vec(&self) -> (Array2<F>, Array1<F>) {
        // The linear predictor without control terms
        let linear_predictor_no_control: Array1<F> = self.data.x.dot(&self.guess);
        // the linear predictor given the model, including offsets if present
        let linear_predictor = match &self.data.linear_offset {
            Some(off) => &linear_predictor_no_control + off,
            None => linear_predictor_no_control.clone(),
        };
        // The data.linear_predictor() function is not used above because we will use
        // both versions, with and without the linear offset, and we don't want
        // to repeat the matrix multiplication.

        // Similarly, we don't use M::get_adjusted_variance(linear_predictor) because intermediate
        // results are used as well, and we need to correct the error term too.

        // The prediction of y given the current model.
        // This does cause an unnecessary clone with an identity link, but we
        // need the linear predictor around for the future.
        let predictor: Array1<F> = M::mean(&linear_predictor);

        // The variances predicted by the model. This should have weights with
        // it and must be non-zero.
        // This could become a full covariance with weights.
        let var_diag: Array1<F> = predictor.mapv(M::variance);

        // The errors represent the difference between observed and predicted.
        let errors = &self.data.y - &predictor;

        // Adjust the errors and variance using the appropriate derivatives of
        // the link function. With the canonical link function, this is a no-op.
        let (errors, var_diag) =
            M::Link::adjust_errors_variance(errors, var_diag, &linear_predictor);

        // condition after the adjustment in case the derivatives are zero. Or
        // should the Hessian itself be conditioned?
        // TODO: allow the variance conditioning to be a configurable parameter.
        let var_diag: Array1<F> = var_diag.mapv_into(|v| v + F::epsilon());

        // X weighted by the model variance for each observation
        // This is really the negative Hessian of the likelihood.
        let neg_hessian: Array2<F> = (self.data.x_conj() * &var_diag).dot(&self.data.x);

        // This isn't quite the jacobian because the H*beta_old term is subtracted out.
        let rhs: Array1<F> = {
            // NOTE: This w*X should not include the linear offset, because it
            // comes from the Hessian times the last guess.
            let target: Array1<F> = (var_diag * linear_predictor_no_control) + errors;
            let target: Array1<F> = self.data.x_conj().dot(&target);
            target
        };
        (neg_hessian, rhs)
    }

    /// Get the most recent step as a new object. This is most useful as a fallback when there
    /// are no iterations because the iteration started at the maximum, and we need to recover the
    /// default with proper regularization. It could also be used within step_with() itself, after
    /// setting the guess and likelihood, though that could be a little opaque.
    pub(crate) fn make_last_step(&self) -> IrlsStep<F> {
        // It's crucial that this regularization is applied the same way here as it is in
        // step_with().
        let model_like = self.last_like_data + self.reg.likelihood(&self.guess);
        IrlsStep {
            guess: self.guess.clone(),
            like: model_like,
        }
    }
}
167
/// Represents a step in the IRLS. Holds the current guess and likelihood.
/// Steps are accumulated in the solver's internal history and may be returned
/// to the caller on failure (e.g. when the iteration limit is reached).
#[derive(Clone, Debug)]
pub struct IrlsStep<F> {
    /// The current parameter guess.
    pub guess: Array1<F>,
    /// The regularized log-likelihood of the current guess.
    pub like: F,
    // TODO: Consider tracking data likelihood, regularized likelihood, and augmented likelihood
    // separately.
}
178
impl<'a, M, F> Iterator for Irls<'a, M, F>
where
    M: Glm,
    F: Float,
    ArrayRef2<F>: SolveH<F>,
{
    /// Each item is either a successful IRLS step or a regression error
    /// (e.g. a solver failure or exceeding the iteration limit).
    type Item = RegressionResult<IrlsStep<F>, F>;

    /// Acquire the next IRLS step based on the previous one.
    fn next(&mut self) -> Option<Self::Item> {
        // if the last step was an improvement but within tolerance, this step
        // has been flagged to terminate early.
        if self.done {
            return None;
        }

        // Solve the (regularized) normal equations for the candidate guess.
        let (irls_mat, irls_vec) = self.irls_mat_vec();
        let next_guess: Array1<F> = match self.reg.next_guess(&self.guess, irls_vec, irls_mat) {
            Ok(solution) => solution,
            Err(err) => return Some(Err(err)),
        };

        // This is the raw, unregularized and unaugmented data likelihood.
        let next_like_data = M::log_like(self.data, &next_guess);

        // The augmented likelihood to maximize may not be the same as the regularized model
        // likelihood.
        // NOTE: This must be computed after self.reg.next_guess() is called, because that step can
        // change the penalty parameter in ADMM. last_like_obj does not represent the previous
        // objective; it represents the current version of the objective function using the
        // previous guess. These may be different because the augmentation parameter and dual
        // variables for the regularization can change.
        let last_like_obj = self.last_like_data + self.reg.irls_like(&self.guess);
        let next_like_obj = next_like_data + self.reg.irls_like(&next_guess);

        // If the regularization isn't convergent (which should only happen with ADMM in
        // L1/ElasticNet), precision tolerance checks don't matter and we should continue on with
        // the iteration.
        if !self.reg.terminate_ok(self.options.tol) {
            return Some(self.step_with(next_guess, next_like_data));
        }

        // If the next guess is literally equal to the last, then additional IRLS or step-halving
        // procedures won't get us anywhere further.
        // This should be true even under ADMM, given that it's converged per the previous check.
        // This will fire if the optimum is passed in as the initial guess, in which case the
        // iteration will have length zero and the GLM regression function will query this IRLS
        // object for its initial step object.
        if next_guess == self.guess {
            return None;
        }

        // Terminate when both the likelihood change and the parameter step are within tolerance.
        // This check comes before the strict-improvement shortcut so that problems where every
        // step is a tiny improvement (e.g. logistic with y = p_true) converge rather than
        // exhausting max_iter.
        // If ADMM hasn't converged, we've already iterated to the next step, so we don't need to
        // check self.reg.terminate_ok() again.
        let n_obs = F::from(self.data.y.len()).unwrap();
        if small_delta(next_like_obj, last_like_obj, self.options.tol * n_obs)
            && small_delta_vec(&next_guess, &self.guess, self.options.tol)
        {
            self.done = true;
        }

        // If this guess is a strict improvement, continue. If we're within convergence tolerance
        // at this stage, this will be the last step.
        if next_like_obj > last_like_obj {
            return Some(self.step_with(next_guess, next_like_data));
        }

        // apply step halving if the new likelihood is the same or worse as the previous guess.
        // NOTE: It's difficult to engage the step halving because it's rarely necessary, so this
        // part of the algorithm is undertested. It may be more common using L1 regularization.
        // next_guess != self.guess because we've already checked that case above.
        let f_step = |x: F| {
            // Objective along the line segment between the current guess (x = 0)
            // and the candidate guess (x = 1).
            let b = &next_guess * x + &self.guess * (F::one() - x);
            // The augmented and unaugmented checks should be close to equivalent at this point
            // because the regularization has reported that the internals have converged via
            // `terminate_ok()`. Since we are potentially finding the final best guess, look for
            // the best model likelihood in the step search.
            // NOTE: It's possible there are some edge cases with an inconsistency, given that the
            // checks above use the augmented likelihood and this checks directly.
            M::log_like(self.data, &b) + self.reg.likelihood(&b) // unaugmented
            // M::log_like(self.data, &b) + self.reg.irls_like(&b) // augmented
        };
        // Scale the step-size tolerance with the magnitude of the current guess.
        let beta_tol_factor = num_traits::Float::sqrt(self.guess.mapv(|b| F::one() + b * b).sum());
        let step_mult: F = step_scale(&f_step, beta_tol_factor * self.options.tol);
        if step_mult.is_zero() {
            // can't find a strictly better minimum if the step multiplier returns zero
            return None;
        }

        // If the step multiplier is not zero, it found a better guess
        let next_guess = &next_guess * step_mult + &self.guess * (F::one() - step_mult);
        let next_like_data = M::log_like(self.data, &next_guess);

        Some(self.step_with(next_guess, next_like_data))
    }
}
279
280fn small_delta<F>(new: F, old: F, tol: F) -> bool
281where
282 F: Float,
283{
284 num_traits::Float::abs(new - old) <= tol
285}
286
287fn small_delta_vec<F>(new: &Array1<F>, old: &Array1<F>, tol: F) -> bool
288where
289 F: Float,
290{
291 // this method interpolates between relative and absolute differences
292 let delta = new - old;
293 let n = F::from(delta.len()).unwrap();
294
295 let new2: F = new.mapv(|d| d * d).sum();
296 let delta2: F = delta.mapv(|d| d * d).sum();
297
298 // use sum of absolute values to indicate magnitude of beta
299 // sum of squares might be better
300 delta2 <= (n + new2) * tol * tol
301}
302
303/// Zero the first element of the array `l` if `use_intercept == true`
304fn zero_first_maybe<F>(mut l: Array1<F>, use_intercept: bool) -> Array1<F>
305where
306 F: Float,
307{
308 // if an intercept term is included it should not be subject to
309 // regularization.
310 if use_intercept {
311 l[0] = F::zero();
312 }
313 l
314}
315
316/// Generate a regularizer from the set of options
317fn get_reg<F: Float>(
318 options: &FitOptions<F>,
319 n: usize,
320 use_intercept: bool,
321) -> Box<dyn IrlsReg<F>> {
322 if options.l1 < F::zero() || options.l2 < F::zero() {
323 eprintln!("WARNING: regularization parameters should not be negative.");
324 }
325 let use_l1 = options.l1 > F::zero();
326 let use_l2 = options.l2 > F::zero();
327
328 if use_l1 && use_l2 {
329 let l1_diag: Array1<F> = Array1::<F>::from_elem(n, options.l1);
330 let l1_diag: Array1<F> = zero_first_maybe(l1_diag, use_intercept);
331 let l2_diag: Array1<F> = Array1::<F>::from_elem(n, options.l2);
332 let l2_diag: Array1<F> = zero_first_maybe(l2_diag, use_intercept);
333 Box::new(ElasticNet::from_diag(l1_diag, l2_diag))
334 } else if use_l2 {
335 let l2_diag: Array1<F> = Array1::<F>::from_elem(n, options.l2);
336 let l2_diag: Array1<F> = zero_first_maybe(l2_diag, use_intercept);
337 Box::new(Ridge::from_diag(l2_diag))
338 } else if use_l1 {
339 let l1_diag: Array1<F> = Array1::<F>::from_elem(n, options.l1);
340 let l1_diag: Array1<F> = zero_first_maybe(l1_diag, use_intercept);
341 Box::new(Lasso::from_diag(l1_diag))
342 } else {
343 Box::new(Null {})
344 }
345}
346
347/// Find a better step scale to optimize and objective function.
348/// Looks for a new solution better than x = 1 looking first at 0 < x < 1 and returning any value
349/// found to be a strict improvement.
350/// If none are found, it will check a single negative step.
351fn step_scale<F: Float>(f: &dyn Fn(F) -> F, tol: F) -> F {
352 let tol = num_traits::Float::abs(tol);
353 // TODO: Add list of values to explicitly try (for instance with zeroed parameters)
354
355 let zero: F = F::zero();
356 let one: F = F::one();
357 // `scale = 0.5` should also work, but using the golden ratio is prettier and might be less
358 // likely to fail in pathological cases.
359 let scale = F::from(0.618033988749894).unwrap();
360 let mut x: F = one;
361 let f0: F = f(zero);
362
363 // start at scale < 1, since 1 has already been checked.
364 while x > tol {
365 x *= scale;
366 let fx = f(x);
367 if fx > f0 {
368 return x;
369 }
370 }
371
372 // If f(1) > f(0), then an improvement has already been found. However, if the optimization is
373 // languishing, it could be useful to try x > 1. It's pretty rare to get to this state,
374 // however.
375
376 // If we're here a strict improvement hasn't been found, but it's possible that the likelihoods
377 // are equal.
378 // check a single step in the negative direction, in case this is an improvement.
379 if f(-scale) > f0 {
380 return -scale;
381 }
382
383 // Nothing checked has been an improvement, so return zero.
384 F::zero()
385}