//! Stochastic gradient descent with optional momentum and Nesterov
//! acceleration, built on the backend's BLAS-style primitives
//! (`scale`, `axpy`, `add`).

use crate::backend::{Backend, BackendAdd, BackendAxpy, BackendScale};
use crate::optimizer::{Optimizer, OptimizerContext};
use crate::tensor::{Tensor, TensorShape};
use core::marker::PhantomData;
/// Per-parameter optimizer state: one velocity ("moments") buffer shaped
/// like the parameter tensor it accompanies.
pub struct SgdContext<N, B>
where
    B: Backend<N>,
{
    moments: B::Tensor,
    // `fn(N, B)` ties the type parameters to the struct without owning values
    // of those types, so the context stays `Send`/`Sync` regardless of `N`/`B`.
    _m: PhantomData<fn(N, B)>,
}

impl<N, B: Backend<N>> OptimizerContext for SgdContext<N, B> {
    fn new<S: Into<TensorShape>>(shape: S) -> Self {
        Self {
            // NOTE: assumes `B::Tensor::new` zero-initializes, so the
            // velocity starts at zero as the momentum update requires.
            moments: B::Tensor::new(shape),
            _m: Default::default(),
        }
    }
}
/// SGD optimizer configuration.
pub struct Sgd<N, B: Backend<N>> {
    /// Step size applied to the gradient.
    learning_rate: f32,
    /// Velocity decay factor in [0, 1); 0.0 disables momentum entirely.
    momentum: f32,
    /// Use the Nesterov "look-ahead" variant of the momentum update.
    nesterov: bool,
    _m: PhantomData<fn(N, B)>,
}
impl<N, B> Default for Sgd<N, B>
where
    B: Backend<N>,
{
    /// Plain SGD: learning rate 0.01, no momentum, no Nesterov.
    fn default() -> Self {
        Self {
            learning_rate: 0.01,
            momentum: 0.0,
            nesterov: false,
            _m: Default::default(),
        }
    }
}
impl<N, B> Sgd<N, B>
where
    B: Backend<N>,
{
    pub fn new(learning_rate: f32, momentum: f32, nesterov: bool) -> Self {
        Self {
            learning_rate,
            momentum,
            nesterov,
            _m: Default::default(),
        }
    }
}
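
// The update below matches the common Keras-style formulation of momentum
// SGD, with `v` the velocity buffer, `g` the gradient, and `p` the parameters:
//
//     v <- momentum * v - lr * g
//     p <- p + momentum * v - lr * g      (nesterov = true)
//     p <- p + v                          (nesterov = false)
//
// With momentum = 0.0 both branches collapse to vanilla SGD, p <- p - lr * g.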
impl<N, B: Backend<N> + BackendScale<N> + BackendAxpy<N> + BackendAdd<N>> Optimizer<N, B> for Sgd<N, B> {
    type Context = SgdContext<N, B>;

    // `grads` is `&mut` only to satisfy the `Optimizer` trait signature;
    // SGD itself never modifies the gradients.
    fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &mut B::Tensor) {
        // v <- momentum * v
        backend.scale(&mut ctx.moments, backend.scalar_f32(self.momentum));
        // v <- v - lr * g
        backend.axpy(&mut ctx.moments, backend.scalar_f32(-self.learning_rate), grads);
        if self.nesterov {
            // Look-ahead step: p <- p + momentum * v - lr * g
            backend.axpy(params, backend.scalar_f32(self.momentum), &ctx.moments);
            backend.axpy(params, backend.scalar_f32(-self.learning_rate), grads);
        } else {
            // Classical momentum: p <- p + v
            backend.add(params, &ctx.moments);
        }
    }
}
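
// Usage sketch (hypothetical: `NativeBackend` stands in for any concrete type
// implementing `Backend<f32>` plus the `BackendScale`/`BackendAxpy`/`BackendAdd`
// extension traits; tensor construction and shape syntax depend on that backend):
//
//     let backend = NativeBackend::default();
//     let sgd = Sgd::<f32, NativeBackend>::new(0.01, 0.9, true);
//     let mut ctx = SgdContext::<f32, NativeBackend>::new((128, 64));
//     // Each training step, after backprop has filled `grads`:
//     sgd.update_params(&backend, &mut ctx, &mut weights, &mut grads);
//
// One `SgdContext` is needed per parameter tensor, since the velocity buffer
// must match that tensor's shape.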