//! Regularization methods and their effect on the likelihood and the matrix and
//! vector components of the IRLS step.
use crate::{num::Float, Array1, Array2};
use ndarray::ArrayViewMut1;

/// Penalize the likelihood with a smooth function of the regression parameters.
///
/// Implementors adjust each piece of the IRLS (iteratively reweighted least
/// squares) step: the scalar likelihood, its gradient, and the vector/matrix
/// sides of the update equation.
pub trait IrlsReg<F>
where
    F: Float,
{
    /// Defines the impact of the regularization approach on the likelihood. It
    /// must be zero when the regressors are zero, otherwise some assumptions in
    /// the fitting statistics section may be invalidated.
    fn likelihood(&self, l: F, regressors: &Array1<F>) -> F;
    /// Defines the regularization effect on the gradient of the likelihood with respect
    /// to beta.
    fn gradient(&self, l: Array1<F>, regressors: &Array1<F>) -> Array1<F>;
    /// Defines the adjustment to the vector side of the IRLS update equation.
    /// It is the negative gradient of the penalty plus the hessian times the
    /// regressors. This term vanishes identically for ridge regression, whose
    /// implementation simply returns `vec` unchanged.
    // NOTE(review): an earlier comment claimed a default implementation is
    // provided, but none exists — every implementor must define this method.
    fn irls_vec(&self, vec: Array1<F>, regressors: &Array1<F>) -> Array1<F>;
    /// Defines the change to the matrix side of the IRLS update equation. It
    /// subtracts the Hessian of the penalty from the matrix. The difference is
    /// typically only on the diagonal.
    fn irls_mat(&self, mat: Array2<F>, regressors: &Array1<F>) -> Array2<F>;

    /// True for the empty regularization
    fn is_null(&self) -> bool;
}

/// Represents a lack of regularization.
pub struct Null {}

impl<F: Float> IrlsReg<F> for Null {
    /// With no penalty the likelihood passes through untouched.
    #[inline]
    fn likelihood(&self, l: F, _beta: &Array1<F>) -> F {
        l
    }
    /// The gradient is likewise unmodified.
    #[inline]
    fn gradient(&self, grad: Array1<F>, _beta: &Array1<F>) -> Array1<F> {
        grad
    }
    /// No adjustment to the vector side of the IRLS step.
    #[inline]
    fn irls_vec(&self, rhs: Array1<F>, _beta: &Array1<F>) -> Array1<F> {
        rhs
    }
    /// No adjustment to the matrix side of the IRLS step.
    #[inline]
    fn irls_mat(&self, info: Array2<F>, _beta: &Array1<F>) -> Array2<F> {
        info
    }

    /// This is the empty regularization.
    fn is_null(&self) -> bool {
        true
    }
}

/// Penalizes the regression by lambda/2 * |beta|^2.
pub struct Ridge<F: Float> {
    // Per-coefficient L2 penalty strengths: the diagonal of the penalty matrix.
    l2_vec: Array1<F>,
}

impl<F: Float> Ridge<F> {
    /// Create the regularization from the diagonal. This outsources the
    /// question of whether to include the first term (usually the intercept) in
    /// the regularization.
    pub fn from_diag(diag: Array1<F>) -> Self {
        Ridge { l2_vec: diag }
    }
}

impl<F: Float> IrlsReg<F> for Ridge<F> {
    /// Subtracts the quadratic penalty 0.5 * sum_i(lambda_i * beta_i^2)
    /// from the likelihood.
    fn likelihood(&self, l: F, beta: &Array1<F>) -> F {
        let half = F::from(0.5).unwrap();
        let squared = beta.mapv(|b| b * b);
        l - half * (&self.l2_vec * &squared).sum()
    }
    /// Subtracts the penalty gradient, lambda_i * beta_i, elementwise.
    fn gradient(&self, jac: Array1<F>, beta: &Array1<F>) -> Array1<F> {
        let penalty_grad = &self.l2_vec * beta;
        jac - penalty_grad
    }
    /// Ridge regression has no effect on the vector side of the IRLS step
    /// equation: the -gradient and hessian-times-beta terms cancel exactly.
    #[inline]
    fn irls_vec(&self, vec: Array1<F>, _beta: &Array1<F>) -> Array1<F> {
        vec
    }
    /// The penalty Hessian is constant and diagonal, so simply add the
    /// lambda values to the diagonal of the information matrix.
    fn irls_mat(&self, mut mat: Array2<F>, _beta: &Array1<F>) -> Array2<F> {
        let mut diag = mat.diag_mut();
        diag += &self.l2_vec;
        mat
    }

    /// Ridge is a non-trivial regularization.
    fn is_null(&self) -> bool {
        false
    }
}

/// Penalizes the regression by lambda * s * log(cosh(beta/s)) which approximates
/// lambda * |beta| for |beta| >> s.
/// This approach is largely untested.
pub struct LassoSmooth<F: Float> {
    // Per-coefficient L1 penalty strengths.
    l1_vec: Array1<F>,
    /// The smoothing parameter. Regression parameters with a magnitude comparable to or
    /// less than this value can be interpreted as being "zeroed".
    // This could probably be a vector as well.
    s: F,
}

impl<F: Float> LassoSmooth<F> {
    /// Create the regularization from the diagonal. This outsources the
    /// question of whether to include the first term (usually the intercept) in
    /// the regularization.
    pub fn from_diag(l1: Array1<F>, s: F) -> Self {
        LassoSmooth { l1_vec: l1, s }
    }

    // sech^2(beta / s): the curvature of log(cosh(x)) evaluated at x = beta / s,
    // shared by the vector- and matrix-side IRLS adjustments.
    fn sech_sq(&self, beta: F) -> F {
        let inv_cosh = (beta / self.s).cosh().recip();
        inv_cosh * inv_cosh
    }
}

impl<F: Float> IrlsReg<F> for LassoSmooth<F> {
    /// Subtracts the smoothed-L1 penalty, lambda_1 * s * log(cosh(beta/s)),
    /// from the likelihood.
    fn likelihood(&self, l: F, beta: &Array1<F>) -> F {
        let penalty = beta.mapv(|b| self.s * (b / self.s).cosh().ln());
        l - (&self.l1_vec * &penalty).sum()
    }
    /// Subtracts the penalty gradient, lambda_1 * tanh(beta/s).
    fn gradient(&self, jac: Array1<F>, beta: &Array1<F>) -> Array1<F> {
        let tanh_term = beta.mapv(|b| (b / self.s).tanh());
        jac - (&self.l1_vec * &tanh_term)
    }
    /// Unlike ridge, the -gradient and hessian-times-beta terms of the penalty
    /// don't cancel, so the vector side picks up their difference.
    fn irls_vec(&self, vec: Array1<F>, beta: &Array1<F>) -> Array1<F> {
        let shift = beta.mapv(|b| {
            let u = b / self.s;
            u.tanh() - u * self.sech_sq(b)
        });
        vec - &self.l1_vec * &shift
    }
    /// The penalty Hessian is diagonal: add lambda_1 * sech^2(beta/s) / s to
    /// the diagonal of the information matrix.
    fn irls_mat(&self, mut mat: Array2<F>, beta: &Array1<F>) -> Array2<F> {
        let curvature = beta.mapv(|b| self.s.recip() * self.sech_sq(b));
        let mut diag = mat.diag_mut();
        diag += &(&self.l1_vec * &curvature);
        mat
    }

    /// Smoothed lasso is a non-trivial regularization.
    fn is_null(&self) -> bool {
        false
    }
}

// TODO: Piecewise Lasso with a penalty shaped like \_/
// TODO: Elastic Net (L1 + L2)

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    /// The ridge penalty should add its diagonal values to the information
    /// matrix and leave every off-diagonal entry alone.
    #[test]
    fn ridge_matrix() {
        let lambda = 1e-4;
        let ridge = Ridge::from_diag(array![0., lambda]);
        let info = array![[0.5, 0.1], [0.1, 0.2]];
        let expected = array![[0.5, 0.1], [0.1, 0.2 + lambda]];
        let beta = array![0., 0.];
        assert_eq!(ridge.irls_mat(info, &beta), expected);
    }
}
}