1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
use ndarray::Array1;
use super::*;
use std::ops::MulAssign;
const EPS: f64 = 1e-7;
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct Adam {
beta1: f64,
beta1_prod: f64,
beta2: f64,
beta2_prod: f64,
learning_rate: f64,
exp_avg: Array1<f64>,
exp_avg_sq: Array1<f64>,
}
impl Adam {
pub fn new(n_params: usize, learning_rate: f64, beta1: f64, beta2: f64) -> Self {
Adam {
beta1,
beta1_prod: beta1,
beta2,
beta2_prod: beta2,
learning_rate,
exp_avg: Array1::zeros(n_params),
exp_avg_sq: Array1::zeros(n_params),
}
}
}
impl Optimiser<Features> for Adam {
fn step(
&mut self,
weights: &mut ArrayViewMut1<f64>,
features: &Features,
loss: f64
) -> Result<()>
{
self.beta1_prod *= self.beta1;
self.beta2_prod *= self.beta2;
match features {
Features::Dense(activations) => {
let m1 = self.exp_avg.as_slice_memory_order_mut().unwrap();
let m2 = self.exp_avg_sq.as_slice_memory_order_mut().unwrap();
for (i, a) in activations.indexed_iter() {
let g = a * loss;
let m1_new = self.beta1 * m1[i] + (1.0 - self.beta1) * g;
let m2_new = self.beta2 * m2[i] + (1.0 - self.beta2) * g * g;
let m1_unbiased = m1_new / (1.0 - self.beta1_prod);
let m2_unbiased = m2_new / (1.0 - self.beta2_prod);
m1[i] = m1_new;
m2[i] = m2_new;
weights[i] += self.learning_rate * m1_unbiased / (m2_unbiased.sqrt() + EPS);
}
},
Features::Sparse(_, activations) => {
self.exp_avg.mul_assign(self.beta1);
self.exp_avg_sq.mul_assign(self.beta2);
let m1 = self.exp_avg.as_slice_memory_order_mut().unwrap();
let m2 = self.exp_avg_sq.as_slice_memory_order_mut().unwrap();
for (&i, a) in activations.iter() {
let g = a * loss;
let m1_new = m1[i] + (1.0 - self.beta1) * g;
let m2_new = m2[i] + (1.0 - self.beta2) * g * g;
let m1_unbiased = m1_new / (1.0 - self.beta1_prod);
let m2_unbiased = m2_new / (1.0 - self.beta2_prod);
m1[i] = m1_new;
m2[i] = m2_new;
weights[i] += self.learning_rate * m1_unbiased / (m2_unbiased.sqrt() + EPS);
}
},
}
Ok(())
}
fn reset(&mut self) {
self.exp_avg.fill(0.0);
self.exp_avg_sq.fill(0.0);
}
}