Skip to main content

lfa/optim/
adam.rs

1use super::*;
2use ndarray::Array1;
3use std::ops::MulAssign;
4
/// Small constant added to the square root of the bias-corrected second
/// moment so the update never divides by zero.
const EPS: f64 = 1.0e-7;
6
7/// Adaptive moment estimation gradient descent.
8///
9/// https://arxiv.org/pdf/1412.6980.pdf
10#[derive(Clone, Debug, PartialEq)]
11#[cfg_attr(
12    feature = "serde",
13    derive(Serialize, Deserialize),
14    serde(crate = "serde_crate")
15)]
16pub struct Adam {
17    beta1: f64,
18    beta1_prod: f64,
19
20    beta2: f64,
21    beta2_prod: f64,
22
23    learning_rate: f64,
24
25    exp_avg: Array1<f64>,
26    exp_avg_sq: Array1<f64>,
27}
28
29impl Adam {
30    pub fn new(n_params: usize, learning_rate: f64, beta1: f64, beta2: f64) -> Self {
31        Adam {
32            beta1,
33            beta1_prod: beta1,
34
35            beta2,
36            beta2_prod: beta2,
37
38            learning_rate,
39
40            exp_avg: Array1::zeros(n_params),
41            exp_avg_sq: Array1::zeros(n_params),
42        }
43    }
44}
45
46impl Optimiser<Features> for Adam {
47    fn step_scaled(
48        &mut self,
49        weights: &mut ArrayViewMut1<f64>,
50        features: &Features,
51        scale_factor: f64,
52    ) -> Result<()>
53    {
54        self.beta1_prod *= self.beta1;
55        self.beta2_prod *= self.beta2;
56
57        match features {
58            Features::Dense(da) => {
59                let m1 = self.exp_avg.as_slice_memory_order_mut().unwrap();
60                let m2 = self.exp_avg_sq.as_slice_memory_order_mut().unwrap();
61
62                for (i, a) in da.indexed_iter() {
63                    let g = a * scale_factor;
64
65                    let m1_new = self.beta1 * m1[i] + (1.0 - self.beta1) * g;
66                    let m2_new = self.beta2 * m2[i] + (1.0 - self.beta2) * g * g;
67
68                    let m1_unbiased = m1_new / (1.0 - self.beta1_prod);
69                    let m2_unbiased = m2_new / (1.0 - self.beta2_prod);
70
71                    m1[i] = m1_new;
72                    m2[i] = m2_new;
73                    weights[i] += self.learning_rate * m1_unbiased / (m2_unbiased.sqrt() + EPS);
74                }
75            },
76            Features::Sparse(sa) => {
77                self.exp_avg.mul_assign(self.beta1);
78                self.exp_avg_sq.mul_assign(self.beta2);
79
80                let m1 = self.exp_avg.as_slice_memory_order_mut().unwrap();
81                let m2 = self.exp_avg_sq.as_slice_memory_order_mut().unwrap();
82
83                for (&i, a) in sa.iter() {
84                    let g = a * scale_factor;
85
86                    let m1_new = m1[i] + (1.0 - self.beta1) * g;
87                    let m2_new = m2[i] + (1.0 - self.beta2) * g * g;
88
89                    let m1_unbiased = m1_new / (1.0 - self.beta1_prod);
90                    let m2_unbiased = m2_new / (1.0 - self.beta2_prod);
91
92                    m1[i] = m1_new;
93                    m2[i] = m2_new;
94                    weights[i] += self.learning_rate * m1_unbiased / (m2_unbiased.sqrt() + EPS);
95                }
96            },
97        }
98
99        Ok(())
100    }
101
102    fn reset(&mut self) {
103        self.exp_avg.fill(0.0);
104        self.exp_avg_sq.fill(0.0);
105    }
106}