//! Regularization methods and their effect on the likelihood and the matrix and
//! vector components of the IRLS step.
use crate::{Array1, Array2, error::RegressionResult, num::Float};
use ndarray::ArrayViewMut1;
use ndarray_linalg::SolveH;
/// Penalize the likelihood with a smooth function of the regression parameters.
pub(crate) trait IrlsReg<F>
where
    F: Float,
{
    /// Defines the impact of the regularization approach on the likelihood. It
    /// must be zero when the regressors are zero, otherwise some assumptions in
    /// the fitting statistics section may be invalidated.
    fn likelihood(&self, regressors: &Array1<F>) -> F;
    /// Defines the regularization effect on the gradient of the likelihood with respect
    /// to beta. `l` is the unregularized gradient, which this returns adjusted.
    fn gradient(&self, l: Array1<F>, regressors: &Array1<F>) -> Array1<F>;
    /// Processing to do before each step. The default is a no-op, for
    /// penalties that carry no per-iteration state (e.g. ADMM dual updates).
    fn prepare(&mut self, _guess: &Array1<F>) {}
    /// For ADMM, the likelihood in the IRLS step is augmented with a rho term and does not include
    /// the L1 component. Without ADMM this should return the actual un-augmented likelihood.
    fn irls_like(&self, regressors: &Array1<F>) -> F {
        self.likelihood(regressors)
    }
    /// Defines the adjustment to the vector side of the IRLS update equation.
    /// It is the negative gradient of the penalty plus the hessian times the
    /// regressors. A default implementation is provided, but this is zero for
    /// ridge regression so it is allowed to be overridden.
    fn irls_vec(&self, vec: Array1<F>, regressors: &Array1<F>) -> Array1<F>;
    /// Defines the change to the matrix side of the IRLS update equation. It
    /// subtracts the Hessian of the penalty from the matrix. The difference is
    /// typically only on the diagonal.
    fn irls_mat(&self, mat: Array2<F>, regressors: &Array1<F>) -> Array2<F>;
    /// Return the next guess under regularization given the current guess and the RHS and LHS of
    /// the unregularized IRLS matrix solution equation for the next guess.
    fn next_guess(
        &mut self,
        guess: &Array1<F>,
        irls_vec: Array1<F>,
        irls_mat: Array2<F>,
    ) -> RegressionResult<Array1<F>, F> {
        // Update any internal iteration state (no-op for smooth penalties).
        self.prepare(guess);
        // Apply the regularization effects to the Hessian (LHS)
        let lhs = self.irls_mat(irls_mat, guess);
        // Apply regularization effects to the modified Jacobian (RHS)
        let rhs = self.irls_vec(irls_vec, guess);
        // Solve the symmetric system LHS * next = RHS for the next guess.
        let next_guess = lhs.solveh_into(rhs)?;
        Ok(next_guess)
    }
    /// Whether any regularization-specific convergence criteria are satisfied.
    /// Stateless penalties may always terminate; ADMM-based ones check their
    /// primal/dual residuals against the tolerance.
    fn terminate_ok(&self, _tol: F) -> bool {
        true
    }
}
/// Represents a lack of regularization.
pub(crate) struct Null {}
impl<F: Float> IrlsReg<F> for Null {
    /// No penalty: contributes nothing to the likelihood.
    #[inline]
    fn likelihood(&self, _regressors: &Array1<F>) -> F {
        F::zero()
    }
    /// The gradient passes through unchanged.
    #[inline]
    fn gradient(&self, grad: Array1<F>, _regressors: &Array1<F>) -> Array1<F> {
        grad
    }
    /// The vector side of the IRLS step is unmodified.
    #[inline]
    fn irls_vec(&self, rhs: Array1<F>, _regressors: &Array1<F>) -> Array1<F> {
        rhs
    }
    /// The matrix side of the IRLS step is unmodified.
    #[inline]
    fn irls_mat(&self, lhs: Array2<F>, _regressors: &Array1<F>) -> Array2<F> {
        lhs
    }
}
/// Penalizes the regression by lambda/2 * |beta|^2.
pub(crate) struct Ridge<F: Float> {
    l2_vec: Array1<F>,
}
impl<F: Float> Ridge<F> {
    /// Build the penalty from a per-parameter diagonal of L2 strengths. By
    /// taking the full diagonal this leaves to the caller the decision of
    /// whether the first term (usually the intercept) is regularized.
    pub fn from_diag(l2: Array1<F>) -> Self {
        Ridge { l2_vec: l2 }
    }
}
impl<F: Float> IrlsReg<F> for Ridge<F> {
    /// The likelihood is penalized by 0.5 * lambda_2 * |beta|^2.
    fn likelihood(&self, beta: &Array1<F>) -> F {
        let beta_sq = beta.mapv(|b| b * b);
        -F::from(0.5).unwrap() * (&self.l2_vec * &beta_sq).sum()
    }
    /// The gradient is penalized by lambda_2 * beta.
    fn gradient(&self, jac: Array1<F>, beta: &Array1<F>) -> Array1<F> {
        let penalty = &self.l2_vec * beta;
        jac - penalty
    }
    /// Ridge regression leaves the vector side of the IRLS step untouched: the
    /// first- and second-order derivative terms of the penalty cancel exactly.
    #[inline]
    fn irls_vec(&self, vec: Array1<F>, _beta: &Array1<F>) -> Array1<F> {
        vec
    }
    /// The L2 diagonal is added to the diagonal of the information matrix.
    fn irls_mat(&self, mut mat: Array2<F>, _beta: &Array1<F>) -> Array2<F> {
        let mut diag = mat.diag_mut();
        diag += &self.l2_vec;
        mat
    }
}
/// Penalizes the likelihood by the L1-norm of the parameters. The IRLS step is
/// augmented via ADMM, so this carries per-iteration dual/residual state.
pub(crate) struct Lasso<F: Float> {
    /// The L1 parameters for each element
    l1_vec: Array1<F>,
    /// The dual solution (ADMM auxiliary variable; a soft-thresholded copy of beta)
    dual: Array1<F>,
    /// The cumulative sum of residuals for each element (ADMM scaled dual variable)
    cum_res: Array1<F>,
    /// ADMM penalty parameter, adapted between iterations by `update_rho`
    rho: F,
    /// L2-Norm of primal residuals |r|^2 (infinite until the first `prepare`)
    r_sq: F,
    /// L2-Norm of dual residuals |s|^2 (infinite until the first `prepare`)
    s_sq: F,
}
impl<F: Float> Lasso<F> {
    /// Construct the penalty from the diagonal of L1 strengths, outsourcing the
    /// question of whether the first term (commonly the unpenalized intercept)
    /// is included in the diagonal.
    pub fn from_diag(l1: Array1<F>) -> Self {
        let len = l1.len();
        Self {
            dual: Array1::zeros(len),
            cum_res: Array1::zeros(len),
            l1_vec: l1,
            rho: F::one(),
            // Start the residual norms at infinity so that termination checks
            // cannot pass before the first update. (NaN would also work.)
            r_sq: F::infinity(),
            s_sq: F::infinity(),
        }
    }
    /// Adapt rho based on the relative sizes of the primal and dual residuals,
    /// rescaling the scaled cumulative residual to match.
    fn update_rho(&mut self) {
        // Can these be declared const?
        let mu: F = F::from(8.).unwrap();
        let tau: F = F::from(2.).unwrap();
        let mu_sq = mu * mu;
        if self.r_sq > mu_sq * self.s_sq {
            // Primal residual dominates: increase the penalty.
            self.rho *= tau;
            self.cum_res /= tau;
        }
        if self.r_sq * mu_sq < self.s_sq {
            // Dual residual dominates: relax the penalty.
            self.rho /= tau;
            self.cum_res *= tau;
        }
    }
}
impl<F: Float> IrlsReg<F> for Lasso<F> {
    /// The likelihood is penalized by the weighted L1 norm: -sum_i l1_i * |beta_i|.
    fn likelihood(&self, beta: &Array1<F>) -> F {
        -(&self.l1_vec * beta.mapv(num_traits::Float::abs)).sum()
    }
    /// Penalize the gradient with the L1 subgradient, -l1 * sign(beta).
    /// This is used in the fit's score function, for instance. Thus it includes
    /// the regularization terms and not the ADMM-augmented term.
    // Fixed: `&regressors` had been corrupted to a `®` character (an HTML
    // `&reg` entity round-trip), which is not valid Rust.
    fn gradient(&self, jac: Array1<F>, regressors: &Array1<F>) -> Array1<F> {
        jac - &self.l1_vec * regressors.mapv(F::sign)
    }
    /// Update the dual solution and the cumulative residuals before each step.
    fn prepare(&mut self, beta: &Array1<F>) {
        // Apply adaptive penalty term updating
        self.update_rho();
        let old_dual = self.dual.clone();
        // Dual update: soft-threshold the primal estimate plus running residual.
        self.dual = soft_thresh(beta + &self.cum_res, &self.l1_vec / self.rho);
        // the primal residuals
        let r: Array1<F> = beta - &self.dual;
        // the dual residuals
        let s: Array1<F> = (&self.dual - old_dual) * self.rho;
        // Accumulate the running (scaled) residual used in the augmented terms.
        self.cum_res += &r;
        self.r_sq = r.mapv(|r| r * r).sum();
        self.s_sq = s.mapv(|s| s * s).sum();
    }
    /// The IRLS objective replaces the L1 term with the ADMM-augmented
    /// quadratic -rho/2 * |beta - dual + cum_res|^2.
    fn irls_like(&self, regressors: &Array1<F>) -> F {
        -F::from(0.5).unwrap()
            * self.rho
            * (regressors - &self.dual + &self.cum_res)
                .mapv(|x| x * x)
                .sum()
    }
    /// The beta term from the gradient is cancelled by the corresponding term from the Hessian.
    /// The dual and residual terms remain.
    fn irls_vec(&self, vec: Array1<F>, _regressors: &Array1<F>) -> Array1<F> {
        let d: Array1<F> = &self.dual - &self.cum_res;
        vec + d * self.rho
    }
    /// Add the constant rho to all elements of the diagonal of the Hessian.
    fn irls_mat(&self, mut mat: Array2<F>, _: &Array1<F>) -> Array2<F> {
        let mut mat_diag: ArrayViewMut1<F> = mat.diag_mut();
        mat_diag += self.rho;
        mat
    }
    /// ADMM terminates when both squared residual norms fall below
    /// sqrt(n) * tol.
    fn terminate_ok(&self, tol: F) -> bool {
        // NOTE(review): the squared norms are compared against sqrt(n) * tol;
        // it should perhaps instead be an epsilon^2 on that side.
        let n: usize = self.dual.len();
        let sqrt_n = F::from((n as f64).sqrt()).unwrap();
        let r_pass = self.r_sq < sqrt_n * tol;
        let s_pass = self.s_sq < sqrt_n * tol;
        r_pass && s_pass
    }
}
/// Penalizes the likelihood with both an L1-norm and L2-norm. The L1 part is
/// handled with ADMM state like `Lasso`; the L2 part enters the IRLS matrix
/// directly like `Ridge`.
pub(crate) struct ElasticNet<F: Float> {
    /// The L1 parameters for each element
    l1_vec: Array1<F>,
    /// The L2 parameters for each element
    l2_vec: Array1<F>,
    /// The dual solution (ADMM auxiliary variable; a soft-thresholded copy of beta)
    dual: Array1<F>,
    /// The cumulative sum of residuals for each element (ADMM scaled dual variable)
    cum_res: Array1<F>,
    /// ADMM penalty parameter, adapted between iterations by `update_rho`
    rho: F,
    /// L2-Norm of primal residuals |r|^2 (infinite until the first `prepare`)
    r_sq: F,
    /// L2-Norm of dual residuals |s|^2 (infinite until the first `prepare`)
    s_sq: F,
}
impl<F: Float> ElasticNet<F> {
    /// Construct the penalty from the diagonals of L1 and L2 strengths,
    /// outsourcing the question of whether the first term (commonly the
    /// unpenalized intercept) is included in the diagonals.
    pub fn from_diag(l1: Array1<F>, l2: Array1<F>) -> Self {
        let len = l1.len();
        Self {
            dual: Array1::zeros(len),
            cum_res: Array1::zeros(len),
            l1_vec: l1,
            l2_vec: l2,
            rho: F::one(),
            // Start the residual norms at infinity so that termination checks
            // cannot pass before the first update. (NaN would also work.)
            r_sq: F::infinity(),
            s_sq: F::infinity(),
        }
    }
    /// Adapt rho based on the relative sizes of the primal and dual residuals,
    /// rescaling the scaled cumulative residual to match.
    fn update_rho(&mut self) {
        // Can these be declared const?
        let mu: F = F::from(8.).unwrap();
        let tau: F = F::from(2.).unwrap();
        let mu_sq = mu * mu;
        if self.r_sq > mu_sq * self.s_sq {
            // Primal residual dominates: increase the penalty.
            self.rho *= tau;
            self.cum_res /= tau;
        }
        if self.r_sq * mu_sq < self.s_sq {
            // Dual residual dominates: relax the penalty.
            self.rho /= tau;
            self.cum_res *= tau;
        }
    }
}
impl<F: Float> IrlsReg<F> for ElasticNet<F> {
    /// The likelihood is penalized by the weighted L1 norm plus the ridge term:
    /// -sum_i l1_i * |beta_i| - 0.5 * sum_i l2_i * beta_i^2.
    fn likelihood(&self, beta: &Array1<F>) -> F {
        -(&self.l1_vec * beta.mapv(num_traits::Float::abs)).sum()
            - F::from(0.5).unwrap() * (&self.l2_vec * &beta.mapv(|b| b * b)).sum()
    }
    /// Penalize the gradient with the L1 subgradient and the L2 term.
    /// This is used in the fit's score function, for instance. Thus it includes
    /// the regularization terms and not the ADMM-augmented term.
    // Fixed: `&regressors` had been corrupted to a `®` character (an HTML
    // `&reg` entity round-trip), which is not valid Rust.
    fn gradient(&self, jac: Array1<F>, regressors: &Array1<F>) -> Array1<F> {
        jac - &self.l1_vec * regressors.mapv(F::sign) - &self.l2_vec * regressors
    }
    /// Update the dual solution and the cumulative residuals before each step.
    fn prepare(&mut self, beta: &Array1<F>) {
        // Apply adaptive penalty term updating
        self.update_rho();
        let old_dual = self.dual.clone();
        // Dual update: soft-threshold the primal estimate plus running residual.
        self.dual = soft_thresh(beta + &self.cum_res, &self.l1_vec / self.rho);
        // the primal residuals
        let r: Array1<F> = beta - &self.dual;
        // the dual residuals
        let s: Array1<F> = (&self.dual - old_dual) * self.rho;
        // Accumulate the running (scaled) residual used in the augmented terms.
        self.cum_res += &r;
        self.r_sq = r.mapv(|r| r * r).sum();
        self.s_sq = s.mapv(|s| s * s).sum();
    }
    /// The IRLS objective replaces the L1 term with the ADMM-augmented
    /// quadratic; the smooth L2 term is kept as-is.
    // Fixed: the same `®` mojibake appeared in the L2 term here.
    fn irls_like(&self, regressors: &Array1<F>) -> F {
        -F::from(0.5).unwrap()
            * self.rho
            * (regressors - &self.dual + &self.cum_res)
                .mapv(|x| x * x)
                .sum()
            - F::from(0.5).unwrap() * (&self.l2_vec * regressors.mapv(|b| b * b)).sum()
    }
    /// The beta term from the gradient is cancelled by the corresponding term from the Hessian.
    /// The dual and residual terms remain.
    fn irls_vec(&self, vec: Array1<F>, _regressors: &Array1<F>) -> Array1<F> {
        let d: Array1<F> = &self.dual - &self.cum_res;
        vec + d * self.rho
    }
    /// Add the L2 diagonal and the constant rho to the diagonal of the Hessian.
    fn irls_mat(&self, mut mat: Array2<F>, _: &Array1<F>) -> Array2<F> {
        let mut mat_diag: ArrayViewMut1<F> = mat.diag_mut();
        mat_diag += &self.l2_vec;
        mat_diag += self.rho;
        mat
    }
    /// ADMM terminates when both squared residual norms fall below
    /// sqrt(n) * tol.
    fn terminate_ok(&self, tol: F) -> bool {
        // NOTE(review): the squared norms are compared against sqrt(n) * tol;
        // it should perhaps instead be an epsilon^2 on that side.
        let n: usize = self.dual.len();
        let sqrt_n = F::from((n as f64).sqrt()).unwrap();
        let r_pass = self.r_sq < sqrt_n * tol;
        let s_pass = self.s_sq < sqrt_n * tol;
        r_pass && s_pass
    }
}
/// The soft thresholding operator: sign(x) * max(|x| - lambda, 0), elementwise.
fn soft_thresh<F: Float>(x: Array1<F>, lambda: Array1<F>) -> Array1<F> {
    let signs = x.mapv(F::sign);
    // Shrink each magnitude by its threshold, clipping negatives to zero.
    // (Comparison form chosen to pass NaN through like the sign factor does.)
    let shrunk = (x.mapv(<F as num_traits::Float>::abs) - lambda)
        .mapv(|m| if m < F::zero() { F::zero() } else { m });
    signs * shrunk
}
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use ndarray::array;

    /// The ridge penalty should add its diagonal to the IRLS matrix and leave
    /// off-diagonal entries untouched.
    #[test]
    fn ridge_matrix() {
        let lambda = 1e-4;
        let ridge = Ridge::from_diag(array![0., lambda]);
        let mat = array![[0.5, 0.1], [0.1, 0.2]];
        let mut expected = mat.clone();
        expected[[1, 1]] += lambda;
        let beta = array![0., 0.];
        assert_eq!(ridge.irls_mat(mat, &beta), expected);
    }

    /// Spot-check the soft-thresholding operator against hand-computed values,
    /// covering zero thresholds, partial shrinkage, and full truncation.
    #[test]
    fn soft_thresh_correct() {
        let x = array![0.25, -0.1, -0.4, 0.3, 0.5, -0.5];
        let lambda = array![-0., 0.0, 0.1, 0.1, 1.0, 1.0];
        let expected = array![0.25, -0.1, -0.3, 0.2, 0., 0.];
        let shrunk = soft_thresh(x, lambda);
        assert_abs_diff_eq!(expected, shrunk);
    }
}