// ndarray_glm/regularization.rs

//! Regularization methods and their effect on the likelihood and the matrix and
//! vector components of the IRLS step.
use crate::{error::RegressionResult, num::Float, Array1, Array2};
use ndarray::ArrayViewMut1;
use ndarray_linalg::SolveH;

/// Penalize the likelihood with a smooth function of the regression parameters.
pub(crate) trait IrlsReg<F>
where
    F: Float,
{
    /// Defines the impact of the regularization approach on the likelihood. It
    /// must be zero when the regressors are zero, otherwise some assumptions in
    /// the fitting statistics section may be invalidated.
    fn likelihood(&self, regressors: &Array1<F>) -> F;

    /// Defines the regularization effect on the gradient of the likelihood with respect
    /// to beta.
    fn gradient(&self, l: Array1<F>, regressors: &Array1<F>) -> Array1<F>;

    /// Processing to do before each step.
    fn prepare(&mut self, _guess: &Array1<F>) {}

    /// For ADMM, the likelihood in the IRLS step is augmented with a rho term and does not include
    /// the L1 component. Without ADMM this should return the actual un-augmented likelihood.
    fn irls_like(&self, regressors: &Array1<F>) -> F {
        self.likelihood(regressors)
    }

    /// Defines the adjustment to the vector side of the IRLS update equation.
    /// It is the negative gradient of the penalty plus the Hessian of the
    /// penalty times the regressors. For ridge regression these terms cancel
    /// exactly, so the implementation may simply return the vector unchanged.
    fn irls_vec(&self, vec: Array1<F>, regressors: &Array1<F>) -> Array1<F>;

    /// Defines the change to the matrix side of the IRLS update equation. It
    /// subtracts the Hessian of the penalty from the matrix. The difference is
    /// typically only on the diagonal.
    fn irls_mat(&self, mat: Array2<F>, regressors: &Array1<F>) -> Array2<F>;

    /// Return the next guess under regularization, given the current guess and the RHS (vector)
    /// and LHS (matrix) of the unregularized IRLS update equation.
    fn next_guess(
        &mut self,
        guess: &Array1<F>,
        irls_vec: Array1<F>,
        irls_mat: Array2<F>,
    ) -> RegressionResult<Array1<F>> {
        self.prepare(guess);
        // Apply the regularization effects to the Hessian (LHS)
        let lhs = self.irls_mat(irls_mat, guess);
        // Apply regularization effects to the modified Jacobian (RHS)
        let rhs = self.irls_vec(irls_vec, guess);
        let next_guess = lhs.solveh_into(rhs)?;
        Ok(next_guess)
    }

    fn terminate_ok(&self, _tol: F) -> bool {
        true
    }
}
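
// Taken together, these hooks mean `next_guess` solves the penalized system
//     irls_mat(LHS, beta) * beta_next = irls_vec(RHS, beta),
// where LHS and RHS are the matrix and vector sides of the unregularized IRLS
// step; `Null` below leaves both sides untouched.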

/// Represents a lack of regularization.
pub struct Null {}

impl<F: Float> IrlsReg<F> for Null {
    #[inline]
    fn likelihood(&self, _: &Array1<F>) -> F {
        F::zero()
    }
    #[inline]
    fn gradient(&self, jac: Array1<F>, _: &Array1<F>) -> Array1<F> {
        jac
    }
    #[inline]
    fn irls_vec(&self, vec: Array1<F>, _: &Array1<F>) -> Array1<F> {
        vec
    }
    #[inline]
    fn irls_mat(&self, mat: Array2<F>, _: &Array1<F>) -> Array2<F> {
        mat
    }
}

/// Penalizes the regression by lambda/2 * |beta|^2.
pub struct Ridge<F: Float> {
    l2_vec: Array1<F>,
}

impl<F: Float> Ridge<F> {
    /// Create the regularization from the diagonal. This outsources the
    /// question of whether to include the first term (usually the intercept) in
    /// the regularization.
    pub fn from_diag(l2: Array1<F>) -> Self {
        Self { l2_vec: l2 }
    }
}

impl<F: Float> IrlsReg<F> for Ridge<F> {
    /// The likelihood is penalized by 0.5 * lambda_2 * |beta|^2
    fn likelihood(&self, beta: &Array1<F>) -> F {
        -F::from(0.5).unwrap() * (&self.l2_vec * &beta.mapv(|b| b * b)).sum()
    }
    /// The gradient is penalized by lambda_2 * beta.
    fn gradient(&self, jac: Array1<F>, beta: &Array1<F>) -> Array1<F> {
        jac - (&self.l2_vec * beta)
    }
    /// Ridge regression has no effect on the vector side of the IRLS step equation, because the
    /// 1st and 2nd order derivative terms exactly cancel.
    #[inline]
    fn irls_vec(&self, vec: Array1<F>, _: &Array1<F>) -> Array1<F> {
        vec
    }
    /// Add lambda to the diagonals of the information matrix.
    fn irls_mat(&self, mut mat: Array2<F>, _: &Array1<F>) -> Array2<F> {
        let mut mat_diag: ArrayViewMut1<F> = mat.diag_mut();
        mat_diag += &self.l2_vec;
        mat
    }
}

/// Penalizes the likelihood by the L1-norm of the parameters.
pub struct Lasso<F: Float> {
    /// The L1 parameters for each element
    l1_vec: Array1<F>,
    /// The dual solution
    dual: Array1<F>,
    /// The cumulative sum of residuals for each element
    cum_res: Array1<F>,
    /// ADMM penalty parameter
    rho: F,
    /// L2-Norm of primal residuals |r|^2
    r_sq: F,
    /// L2-Norm of dual residuals |s|^2
    s_sq: F,
}

impl<F: Float> Lasso<F> {
    /// Create the regularization from the diagonal, outsourcing the question of whether to include
    /// the first term (commonly the intercept, which is left out) in the diagonal.
    pub fn from_diag(l1: Array1<F>) -> Self {
        let n: usize = l1.len();
        let gamma = Array1::zeros(n);
        let u = Array1::zeros(n);
        Self {
            l1_vec: l1,
            dual: gamma,
            cum_res: u,
            rho: F::one(),
            r_sq: F::infinity(), // or should it be NaN?
            s_sq: F::infinity(),
        }
    }

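    /// Adaptively rescale the ADMM penalty parameter: when the primal residual
    /// norm exceeds mu times the dual residual norm, rho is increased by a factor
    /// tau (and the scaled residuals shrunk to match), and vice versa. This is
    /// the usual residual-balancing heuristic for ADMM.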
    fn update_rho(&mut self) {
        // Can these be declared const?
        let mu: F = F::from(8.).unwrap();
        let tau: F = F::from(2.).unwrap();
        if self.r_sq > mu * mu * self.s_sq {
            self.rho *= tau;
            self.cum_res /= tau;
        }
        if self.r_sq * mu * mu < self.s_sq {
            self.rho /= tau;
            self.cum_res *= tau;
        }
    }
}

impl<F: Float> IrlsReg<F> for Lasso<F> {
    fn likelihood(&self, beta: &Array1<F>) -> F {
        -(&self.l1_vec * beta.mapv(num_traits::Float::abs)).sum()
    }

    // This is used in the fit's score function, for instance. Thus it includes the regularization
    // terms and not the augmented term.
    fn gradient(&self, jac: Array1<F>, regressors: &Array1<F>) -> Array1<F> {
        jac - &self.l1_vec * &regressors.mapv(F::sign)
    }

    /// Update the dual solution and the cumulative residuals.
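    /// Concretely: `dual <- soft_thresh(beta + cum_res, l1_vec / rho)`, then
    /// `cum_res <- cum_res + (beta - dual)`, with the squared norms of the primal
    /// residual `beta - dual` and the dual residual `rho * (dual - old_dual)`
    /// recorded for the adaptive rho update and the termination check.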
    fn prepare(&mut self, beta: &Array1<F>) {
        // Apply adaptive penalty term updating
        self.update_rho();

        let old_dual = self.dual.clone();

        self.dual = soft_thresh(beta + &self.cum_res, &self.l1_vec / self.rho);
        // the primal residuals
        let r: Array1<F> = beta - &self.dual;
        // the dual residuals
        let s: Array1<F> = (&self.dual - old_dual) * self.rho;
        self.cum_res += &r;

        self.r_sq = r.mapv(|r| r * r).sum();
        self.s_sq = s.mapv(|s| s * s).sum();
    }

    fn irls_like(&self, regressors: &Array1<F>) -> F {
        -F::from(0.5).unwrap()
            * self.rho
            * (regressors - &self.dual + &self.cum_res)
                .mapv(|x| x * x)
                .sum()
    }

    /// The beta term from the gradient is cancelled by the corresponding term from the Hessian.
    /// The dual and residual terms remain.
    fn irls_vec(&self, vec: Array1<F>, _regressors: &Array1<F>) -> Array1<F> {
        let d: Array1<F> = &self.dual - &self.cum_res;
        vec + d * self.rho
    }

    /// Add the constant rho to all elements of the diagonal of the Hessian.
    fn irls_mat(&self, mut mat: Array2<F>, _: &Array1<F>) -> Array2<F> {
        let mut mat_diag: ArrayViewMut1<F> = mat.diag_mut();
        mat_diag += self.rho;
        mat
    }

    fn terminate_ok(&self, tol: F) -> bool {
        // Expressed like this, it should perhaps instead be an epsilon^2.
        let n: usize = self.dual.len();
        let sqrt_n = F::from((n as f64).sqrt()).unwrap();
        let r_pass = self.r_sq < sqrt_n * tol;
        let s_pass = self.s_sq < sqrt_n * tol;
        r_pass && s_pass
    }
}

/// Penalizes the likelihood with both an L1-norm and L2-norm.
pub struct ElasticNet<F: Float> {
    /// The L1 parameters for each element
    l1_vec: Array1<F>,
    /// The L2 parameters for each element
    l2_vec: Array1<F>,
    /// The dual solution
    dual: Array1<F>,
    /// The cumulative sum of residuals for each element
    cum_res: Array1<F>,
    /// ADMM penalty parameter
    rho: F,
    /// L2-Norm of primal residuals |r|^2
    r_sq: F,
    /// L2-Norm of dual residuals |s|^2
    s_sq: F,
}

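// The ADMM bookkeeping below mirrors the `Lasso` implementation; the additions
// are the L2 terms in the likelihood, the gradient, the augmented objective, and
// the diagonal of the IRLS matrix.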
impl<F: Float> ElasticNet<F> {
    /// Create the regularization from the diagonal, outsourcing the question of whether to include
    /// the first term (commonly the intercept, which is left out) in the diagonal.
    pub fn from_diag(l1: Array1<F>, l2: Array1<F>) -> Self {
        let n: usize = l1.len();
        let gamma = Array1::zeros(n);
        let u = Array1::zeros(n);
        Self {
            l1_vec: l1,
            l2_vec: l2,
            dual: gamma,
            cum_res: u,
            rho: F::one(),
            r_sq: F::infinity(), // or should it be NaN?
            s_sq: F::infinity(),
        }
    }

    fn update_rho(&mut self) {
        // Can these be declared const?
        let mu: F = F::from(8.).unwrap();
        let tau: F = F::from(2.).unwrap();
        if self.r_sq > mu * mu * self.s_sq {
            self.rho *= tau;
            self.cum_res /= tau;
        }
        if self.r_sq * mu * mu < self.s_sq {
            self.rho /= tau;
            self.cum_res *= tau;
        }
    }
}

impl<F: Float> IrlsReg<F> for ElasticNet<F> {
    fn likelihood(&self, beta: &Array1<F>) -> F {
        -(&self.l1_vec * beta.mapv(num_traits::Float::abs)).sum()
            - F::from(0.5).unwrap() * (&self.l2_vec * &beta.mapv(|b| b * b)).sum()
    }

    // This is used in the fit's score function, for instance. Thus it includes the regularization
    // terms and not the augmented term.
    fn gradient(&self, jac: Array1<F>, regressors: &Array1<F>) -> Array1<F> {
        jac - &self.l1_vec * &regressors.mapv(F::sign) - &self.l2_vec * regressors
    }

    /// Update the dual solution and the cumulative residuals.
    fn prepare(&mut self, beta: &Array1<F>) {
        // Apply adaptive penalty term updating
        self.update_rho();

        let old_dual = self.dual.clone();

        self.dual = soft_thresh(beta + &self.cum_res, &self.l1_vec / self.rho);
        // the primal residuals
        let r: Array1<F> = beta - &self.dual;
        // the dual residuals
        let s: Array1<F> = (&self.dual - old_dual) * self.rho;
        self.cum_res += &r;

        self.r_sq = r.mapv(|r| r * r).sum();
        self.s_sq = s.mapv(|s| s * s).sum();
    }

    fn irls_like(&self, regressors: &Array1<F>) -> F {
        -F::from(0.5).unwrap()
            * self.rho
            * (regressors - &self.dual + &self.cum_res)
                .mapv(|x| x * x)
                .sum()
            - F::from(0.5).unwrap() * (&self.l2_vec * &regressors.mapv(|b| b * b)).sum()
    }

    /// The beta term from the gradient is cancelled by the corresponding term from the Hessian.
    /// The dual and residual terms remain.
    fn irls_vec(&self, vec: Array1<F>, _regressors: &Array1<F>) -> Array1<F> {
        let d: Array1<F> = &self.dual - &self.cum_res;
        vec + d * self.rho
    }

    /// Add the constant rho to all elements of the diagonal of the Hessian.
    fn irls_mat(&self, mut mat: Array2<F>, _: &Array1<F>) -> Array2<F> {
        let mut mat_diag: ArrayViewMut1<F> = mat.diag_mut();
        mat_diag += &self.l2_vec;
        mat_diag += self.rho;
        mat
    }

    fn terminate_ok(&self, tol: F) -> bool {
        // Expressed like this, it should perhaps instead be an epsilon^2.
        let n: usize = self.dual.len();
        let sqrt_n = F::from((n as f64).sqrt()).unwrap();
        let r_pass = self.r_sq < sqrt_n * tol;
        let s_pass = self.s_sq < sqrt_n * tol;
        r_pass && s_pass
    }
}

/// The soft thresholding operator
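/// Elementwise: sign(x) * max(|x| - lambda, 0).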
fn soft_thresh<F: Float>(x: Array1<F>, lambda: Array1<F>) -> Array1<F> {
    let sign_x = x.mapv(F::sign);
    let abs_x = x.mapv(<F as num_traits::Float>::abs);
    let red_x = abs_x - lambda;
    let clipped = red_x.mapv(|x| if x < F::zero() { F::zero() } else { x });
    sign_x * clipped
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use ndarray::array;

    #[test]
    fn ridge_matrix() {
        let l = 1e-4;
        let ridge = Ridge::from_diag(array![0., l]);
        let mat = array![[0.5, 0.1], [0.1, 0.2]];
        let mut target_mat = mat.clone();
        target_mat[[1, 1]] += l;
        let dummy_beta = array![0., 0.];
        assert_eq!(ridge.irls_mat(mat, &dummy_beta), target_mat);
    }
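
    // A sketch of an extra sanity check for the documented ridge formulas:
    // gradient = jac - l2 * beta and likelihood = -0.5 * sum(l2 * beta^2).
    #[test]
    fn ridge_gradient_and_likelihood() {
        let l = 1e-4;
        let ridge = Ridge::from_diag(array![0., l]);
        let beta = array![1.5, -2.0];
        let jac = array![0.3, 0.2];
        // Only the second element is penalized.
        let target = array![0.3, 0.2 - l * (-2.0)];
        assert_abs_diff_eq!(ridge.gradient(jac, &beta), target);
        assert_abs_diff_eq!(ridge.likelihood(&beta), -0.5 * l * 4.0);
    }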

    #[test]
    fn soft_thresh_correct() {
        let x = array![0.25, -0.1, -0.4, 0.3, 0.5, -0.5];
        let lambda = array![-0., 0.0, 0.1, 0.1, 1.0, 1.0];
        let target = array![0.25, -0.1, -0.3, 0.2, 0., 0.];
        let output = soft_thresh(x, lambda);
        assert_abs_diff_eq!(target, output);
    }
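
    // Sketches of additional checks: the Lasso penalty terms follow the
    // documented L1 formulas, and the Null regularizer is a no-op.
    #[test]
    fn lasso_likelihood_and_gradient() {
        let lasso = Lasso::from_diag(array![0., 0.5]);
        let beta = array![2.0, -3.0];
        // likelihood penalty: -(0.0 * |2.0| + 0.5 * |-3.0|)
        assert_abs_diff_eq!(lasso.likelihood(&beta), -1.5);
        // gradient penalty: l1 * sign(beta)
        let jac = array![0.1, -0.2];
        let target = array![0.1, -0.2 + 0.5];
        assert_abs_diff_eq!(lasso.gradient(jac, &beta), target);
    }

    #[test]
    fn null_changes_nothing() {
        let null = Null {};
        let beta = array![0.3, -1.2];
        assert_eq!(null.likelihood(&beta), 0.);
        let vec = array![1.0, -2.0];
        assert_eq!(null.irls_vec(vec.clone(), &beta), vec);
        let mat = array![[0.5, 0.1], [0.1, 0.2]];
        assert_eq!(null.irls_mat(mat.clone(), &beta), mat);
    }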
}