use std::ops::DerefMut;
use std::sync::Arc;

use ndarray::Axis;

use super::Optimizer;
use numerics::ArraySlice;
use {numerics, HogwildParameter, ParameterNode, Variable};

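/// Standard stochastic gradient descent optimizer with an optional
/// gradient clamp.
///
/// A minimal usage sketch (illustrative only; `model_parameters` stands in
/// for a `Vec<Variable<ParameterNode>>` obtained from the rest of the
/// crate's graph API):
///
/// ```ignore
/// let optimizer = SGD::new().learning_rate(0.01).clamp(-5.0, 5.0);
/// optimizer.step(&model_parameters);
/// ```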
pub struct SGD {
    learning_rate: f32,
    clamp: Option<(f32, f32)>,
}

impl Default for SGD {
    fn default() -> Self {
        Self::new()
    }
}

impl SGD {
    /// Create a new SGD optimizer with the default learning rate (0.05)
    /// and no gradient clamping.
    pub fn new() -> Self {
        SGD {
            learning_rate: 0.05,
            clamp: None,
        }
    }

    /// Set the learning rate.
    pub fn learning_rate(mut self, learning_rate: f32) -> Self {
        self.learning_rate = learning_rate;
        self
    }

    /// Clamp all accumulated gradients to the `[min, max]` range before
    /// they are applied.
    pub fn clamp(mut self, min: f32, max: f32) -> Self {
        self.clamp = Some((min, max));
        self
    }

    /// Apply the gradients accumulated in `sink` to a single parameter.
    fn inner_step<T: DerefMut<Target = ::nodes::GradientAccumulator>>(
        &self,
        param: &Arc<HogwildParameter>,
        mut sink: T,
    ) {
        let param_value = unsafe { param.value_mut() };
        let learning_rate = self.learning_rate;
        let sink = sink.deref_mut();

        if let Some((min, max)) = self.clamp {
            sink.clamp(min, max);
        }

        if sink.has_dense() {
            // Dense gradient: update the whole parameter array in one pass.
            param_value.scaled_add(-self.learning_rate, sink.gradient());
        } else {
            // Sparse gradients: update only the rows that received gradients.
            for (row_idx, grad) in sink.sparse_iter() {
                let mut param_row = param_value.subview_mut(Axis(0), row_idx);
                numerics::map_add_assign_slice(
                    param_row.into_slice().unwrap(),
                    grad.fast_slice(),
                    |x| -learning_rate * x,
                );
            }
        }
    }
}

impl Optimizer for SGD {
    /// Perform a single SGD step on each parameter, then zero its gradients.
    fn step(&self, parameters: &[Variable<ParameterNode>]) {
        for parameter in parameters {
            self.inner_step(&parameter.node.value, parameter.node.gradient.borrow_mut());
            parameter.node.zero_gradient();
        }
    }
}