rusty_machine/learning/optim/
grad_desc.rs1use learning::optim::{Optimizable, OptimAlgorithm};
12use linalg::Vector;
13use linalg::{Matrix, BaseMatrix};
14use rulinalg::utils;
15
16use learning::toolkit::rand_utils;
17
18const LEARNING_EPS: f64 = 1e-20;
19
20#[derive(Clone, Copy, Debug)]
22pub struct GradientDesc {
23 alpha: f64,
25 iters: usize,
27}
28
29impl Default for GradientDesc {
36 fn default() -> GradientDesc {
37 GradientDesc {
38 alpha: 0.3,
39 iters: 100,
40 }
41 }
42}
43
44impl GradientDesc {
45 pub fn new(alpha: f64, iters: usize) -> GradientDesc {
58 assert!(alpha > 0f64,
59 "The step size (alpha) must be greater than 0.");
60
61 GradientDesc {
62 alpha: alpha,
63 iters: iters,
64 }
65 }
66}
67
68impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
69 fn optimize(&self,
70 model: &M,
71 start: &[f64],
72 inputs: &M::Inputs,
73 targets: &M::Targets)
74 -> Vec<f64> {
75
76 let mut optimizing_val = Vector::new(start.to_vec());
78 let mut start_iter_cost = 0f64;
80
81 for _ in 0..self.iters {
82 let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets);
84
85 if (start_iter_cost - cost).abs() < LEARNING_EPS {
87 break;
88 } else {
89 optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha;
91 start_iter_cost = cost;
93 }
94 }
95 optimizing_val.into_vec()
96 }
97}
98
99#[derive(Clone, Copy, Debug)]
103pub struct StochasticGD {
104 alpha: f64,
106 mu: f64,
108 iters: usize,
110}
111
112impl Default for StochasticGD {
120 fn default() -> StochasticGD {
121 StochasticGD {
122 alpha: 0.1,
123 mu: 0.1,
124 iters: 20,
125 }
126 }
127}
128
129impl StochasticGD {
130 pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
143 assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
144 assert!(mu > 0f64, "The step size (mu) must be greater than 0.");
145
146 StochasticGD {
147 alpha: alpha,
148 mu: mu,
149 iters: iters,
150 }
151 }
152}
153
154impl<M> OptimAlgorithm<M> for StochasticGD
155 where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
156{
157 fn optimize(&self,
158 model: &M,
159 start: &[f64],
160 inputs: &M::Inputs,
161 targets: &M::Targets)
162 -> Vec<f64> {
163
164 let mut optimizing_val = Vector::new(start.to_vec());
166 let mut delta_w = Vector::zeros(start.len());
168
169 let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
171 let mut start_iter_cost = 0f64;
173
174 for _ in 0..self.iters {
175 let mut end_cost = 0f64;
177 rand_utils::in_place_fisher_yates(&mut permutation);
179 for i in &permutation {
180 let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
182 &inputs.select_rows(&[*i]),
183 &targets.select_rows(&[*i]));
184
185 delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
187 optimizing_val = &optimizing_val - &delta_w * self.mu;
189 end_cost += cost;
191 }
192
193 end_cost /= inputs.rows() as f64;
194
195 if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
197 break;
198 } else {
199 start_iter_cost = end_cost;
201 }
202 }
203 optimizing_val.into_vec()
204 }
205}
206
207#[derive(Debug)]
211pub struct AdaGrad {
212 alpha: f64,
213 tau: f64,
214 iters: usize,
215}
216
217impl AdaGrad {
218 pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
230 assert!(alpha > 0f64,
231 "The step size (alpha) must be greater than 0.");
232 assert!(tau >= 0f64,
233 "The adaptive constant (tau) cannot be negative.");
234 AdaGrad {
235 alpha: alpha,
236 tau: tau,
237 iters: iters,
238 }
239 }
240}
241
242impl Default for AdaGrad {
243 fn default() -> AdaGrad {
244 AdaGrad {
245 alpha: 1f64,
246 tau: 3f64,
247 iters: 100,
248 }
249 }
250}
251
252impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
253 fn optimize(&self,
254 model: &M,
255 start: &[f64],
256 inputs: &M::Inputs,
257 targets: &M::Targets)
258 -> Vec<f64> {
259
260 let mut ada_s = Vector::zeros(start.len());
262 let mut optimizing_val = Vector::new(start.to_vec());
264
265 let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
267 let mut start_iter_cost = 0f64;
269
270 for _ in 0..self.iters {
271 let mut end_cost = 0f64;
273 rand_utils::in_place_fisher_yates(&mut permutation);
275 for i in &permutation {
276 let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
278 &inputs.select_rows(&[*i]),
279 &targets.select_rows(&[*i]));
280 utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y);
282
283 utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| {
285 *x = self.alpha * (*x / (self.tau + (y).sqrt()))
286 });
287 optimizing_val = &optimizing_val - Vector::new(vec_data);
289 end_cost += cost;
291 }
292 end_cost /= inputs.rows() as f64;
293
294 if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
296 break;
297 } else {
298 start_iter_cost = end_cost;
300 }
301 }
302 optimizing_val.into_vec()
303 }
304}
305
306#[derive(Debug, Clone, Copy)]
310pub struct RMSProp {
311 learning_rate: f64,
313 decay_rate: f64,
315 epsilon: f64,
317 iters: usize,
319}
320
321impl Default for RMSProp {
330 fn default() -> RMSProp {
331 RMSProp {
332 learning_rate: 0.01,
333 decay_rate: 0.9,
334 epsilon: 1.0e-5,
335 iters: 50
336 }
337 }
338}
339
340impl RMSProp {
341 pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
353 assert!(0f64 < learning_rate, "The learning rate must be positive");
354 assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
355 assert!(0f64 < epsilon, "Epsilon must be positive");
356
357 RMSProp {
358 decay_rate: decay_rate,
359 learning_rate: learning_rate,
360 epsilon: epsilon,
361 iters: iters
362 }
363 }
364}
365
366impl<M> OptimAlgorithm<M> for RMSProp
367 where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
368 fn optimize(&self,
369 model: &M,
370 start: &[f64],
371 inputs: &M::Inputs,
372 targets: &M::Targets)
373 -> Vec<f64> {
374 let mut params = Vector::new(start.to_vec());
376 let mut rmsprop_cache = Vector::zeros(start.len());
378
379 let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
381 let mut prev_cost = 0f64;
383
384 for _ in 0..self.iters {
385 let mut end_cost = 0f64;
387 rand_utils::in_place_fisher_yates(&mut permutation);
389 for i in &permutation {
390 let (cost, grad) = model.compute_grad(params.data(),
391 &inputs.select_rows(&[*i]),
392 &targets.select_rows(&[*i]));
393
394 let mut grad = Vector::new(grad);
395 let grad_squared = grad.clone().apply(&|x| x*x);
396 rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
398 utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| {
400 *x = *x * self.learning_rate / (y + self.epsilon).sqrt();
401 });
402 params = ¶ms - &grad;
403
404 end_cost += cost;
405 }
406 end_cost /= inputs.rows() as f64;
407
408 if (prev_cost - end_cost).abs() < LEARNING_EPS {
410 break;
411 } else {
412 prev_cost = end_cost;
413 }
414 }
415 params.into_vec()
416 }
417}
418
419#[cfg(test)]
420mod tests {
421
422 use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};
423
424 #[test]
425 #[should_panic]
426 fn gd_neg_stepsize() {
427 let _ = GradientDesc::new(-0.5, 0);
428 }
429
430 #[test]
431 #[should_panic]
432 fn stochastic_gd_neg_momentum() {
433 let _ = StochasticGD::new(-0.5, 1f64, 0);
434 }
435
436 #[test]
437 #[should_panic]
438 fn stochastic_gd_neg_stepsize() {
439 let _ = StochasticGD::new(0.5, -1f64, 0);
440 }
441
442 #[test]
443 #[should_panic]
444 fn adagrad_neg_stepsize() {
445 let _ = AdaGrad::new(-0.5, 1f64, 0);
446 }
447
448 #[test]
449 #[should_panic]
450 fn adagrad_neg_adaptive_scale() {
451 let _ = AdaGrad::new(0.5, -1f64, 0);
452 }
453
454 #[test]
455 #[should_panic]
456 fn rmsprop_neg_decay_rate() {
457 let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
458 }
459
460 #[test]
461 #[should_panic]
462 fn rmsprop_neg_epsilon() {
463 let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
464 }
465
466 #[test]
467 #[should_panic]
468 fn rmsprop_neg_learning_rate() {
469 let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
470 }
471}