rusty_machine/learning/optim/
grad_desc.rs

1//! Gradient Descent
2//!
3//! Implementation of gradient descent algorithm. Module contains
4//! the struct `GradientDesc` which is instantiated within models
5//! implementing the Optimizable trait.
6//!
7//! Currently standard batch gradient descent is the only implemented
8//! optimization algorithm but there is flexibility to introduce new
9//! algorithms and git them into the same scheme easily.
10
11use learning::optim::{Optimizable, OptimAlgorithm};
12use linalg::Vector;
13use linalg::{Matrix, BaseMatrix};
14use rulinalg::utils;
15
16use learning::toolkit::rand_utils;
17
18const LEARNING_EPS: f64 = 1e-20;
19
20/// Batch Gradient Descent algorithm
21#[derive(Clone, Copy, Debug)]
22pub struct GradientDesc {
23    /// The step-size for the gradient descent steps.
24    alpha: f64,
25    /// The number of iterations to run.
26    iters: usize,
27}
28
29/// The default gradient descent algorithm.
30///
31/// The defaults are:
32///
33/// - alpha = 0.3
34/// - iters = 100
35impl Default for GradientDesc {
36    fn default() -> GradientDesc {
37        GradientDesc {
38            alpha: 0.3,
39            iters: 100,
40        }
41    }
42}
43
44impl GradientDesc {
45    /// Construct a gradient descent algorithm.
46    ///
47    /// Requires the step size and iteration count
48    /// to be specified.
49    ///
50    /// # Examples
51    ///
52    /// ```
53    /// use rusty_machine::learning::optim::grad_desc::GradientDesc;
54    ///
55    /// let gd = GradientDesc::new(0.3, 10000);
56    /// ```
57    pub fn new(alpha: f64, iters: usize) -> GradientDesc {
58        assert!(alpha > 0f64,
59                "The step size (alpha) must be greater than 0.");
60
61        GradientDesc {
62            alpha: alpha,
63            iters: iters,
64        }
65    }
66}
67
68impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
69    fn optimize(&self,
70                model: &M,
71                start: &[f64],
72                inputs: &M::Inputs,
73                targets: &M::Targets)
74                -> Vec<f64> {
75
76        // Create the initial optimal parameters
77        let mut optimizing_val = Vector::new(start.to_vec());
78        // The cost at the start of each iteration
79        let mut start_iter_cost = 0f64;
80
81        for _ in 0..self.iters {
82            // Compute the cost and gradient for the current parameters
83            let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets);
84
85            // Early stopping
86            if (start_iter_cost - cost).abs() < LEARNING_EPS {
87                break;
88            } else {
89                // Update the optimal parameters using gradient descent
90                optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha;
91                // Update the latest cost
92                start_iter_cost = cost;
93            }
94        }
95        optimizing_val.into_vec()
96    }
97}
98
99/// Stochastic Gradient Descent algorithm.
100///
101/// Uses basic momentum to control the learning rate.
102#[derive(Clone, Copy, Debug)]
103pub struct StochasticGD {
104    /// Controls the momentum of the descent
105    alpha: f64,
106    /// The square root of the raw learning rate.
107    mu: f64,
108    /// The number of passes through the data.
109    iters: usize,
110}
111
112/// The default Stochastic GD algorithm.
113///
114/// The defaults are:
115///
116/// - alpha = 0.1
117/// - mu = 0.1
118/// - iters = 20
119impl Default for StochasticGD {
120    fn default() -> StochasticGD {
121        StochasticGD {
122            alpha: 0.1,
123            mu: 0.1,
124            iters: 20,
125        }
126    }
127}
128
129impl StochasticGD {
130    /// Construct a stochastic gradient descent algorithm.
131    ///
132    /// Requires the learning rate, momentum rate and iteration count
133    /// to be specified.
134    ///
135    /// # Examples
136    ///
137    /// ```
138    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
139    ///
140    /// let sgd = StochasticGD::new(0.1, 0.3, 5);
141    /// ```
142    pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
143        assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
144        assert!(mu > 0f64, "The step size (mu) must be greater than 0.");
145
146        StochasticGD {
147            alpha: alpha,
148            mu: mu,
149            iters: iters,
150        }
151    }
152}
153
154impl<M> OptimAlgorithm<M> for StochasticGD
155    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
156{
157    fn optimize(&self,
158                model: &M,
159                start: &[f64],
160                inputs: &M::Inputs,
161                targets: &M::Targets)
162                -> Vec<f64> {
163
164        // Create the initial optimal parameters
165        let mut optimizing_val = Vector::new(start.to_vec());
166        // Create the momentum based gradient distance
167        let mut delta_w = Vector::zeros(start.len());
168
169        // Set up the indices for permutation
170        let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
171        // The cost at the start of each iteration
172        let mut start_iter_cost = 0f64;
173
174        for _ in 0..self.iters {
175            // The cost at the end of each stochastic gd pass
176            let mut end_cost = 0f64;
177            // Permute the indices
178            rand_utils::in_place_fisher_yates(&mut permutation);
179            for i in &permutation {
180                // Compute the cost and gradient for this data pair
181                let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
182                                                          &inputs.select_rows(&[*i]),
183                                                          &targets.select_rows(&[*i]));
184
185                // Compute the difference in gradient using momentum
186                delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
187                // Update the parameters
188                optimizing_val = &optimizing_val - &delta_w * self.mu;
189                // Set the end cost (this is only used after the last iteration)
190                end_cost += cost;
191            }
192
193            end_cost /= inputs.rows() as f64;
194
195            // Early stopping
196            if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
197                break;
198            } else {
199                // Update the cost
200                start_iter_cost = end_cost;
201            }
202        }
203        optimizing_val.into_vec()
204    }
205}
206
207/// Adaptive Gradient Descent
208///
209/// The adaptive gradient descent algorithm (Duchi et al. 2010).
210#[derive(Debug)]
211pub struct AdaGrad {
212    alpha: f64,
213    tau: f64,
214    iters: usize,
215}
216
217impl AdaGrad {
218    /// Constructs a new AdaGrad algorithm.
219    ///
220    /// # Examples
221    ///
222    /// ```
223    /// use rusty_machine::learning::optim::grad_desc::AdaGrad;
224    ///
225    /// // Create a new AdaGrad algorithm with step size 0.5
226    /// // and adaptive scaling constant 1.0
227    /// let gd = AdaGrad::new(0.5, 1.0, 100);
228    /// ```
229    pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
230        assert!(alpha > 0f64,
231                "The step size (alpha) must be greater than 0.");
232        assert!(tau >= 0f64,
233                "The adaptive constant (tau) cannot be negative.");
234        AdaGrad {
235            alpha: alpha,
236            tau: tau,
237            iters: iters,
238        }
239    }
240}
241
242impl Default for AdaGrad {
243    fn default() -> AdaGrad {
244        AdaGrad {
245            alpha: 1f64,
246            tau: 3f64,
247            iters: 100,
248        }
249    }
250}
251
252impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
253    fn optimize(&self,
254                model: &M,
255                start: &[f64],
256                inputs: &M::Inputs,
257                targets: &M::Targets)
258                -> Vec<f64> {
259
260        // Initialize the adaptive scaling
261        let mut ada_s = Vector::zeros(start.len());
262        // Initialize the optimal parameters
263        let mut optimizing_val = Vector::new(start.to_vec());
264
265        // Set up the indices for permutation
266        let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
267        // The cost at the start of each iteration
268        let mut start_iter_cost = 0f64;
269
270        for _ in 0..self.iters {
271            // The cost at the end of each stochastic gd pass
272            let mut end_cost = 0f64;
273            // Permute the indices
274            rand_utils::in_place_fisher_yates(&mut permutation);
275            for i in &permutation {
276                // Compute the cost and gradient for this data pair
277                let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
278                                                              &inputs.select_rows(&[*i]),
279                                                              &targets.select_rows(&[*i]));
280                // Update the adaptive scaling by adding the gradient squared
281                utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y);
282
283                // Compute the change in gradient
284                utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| {
285                    *x = self.alpha * (*x / (self.tau + (y).sqrt()))
286                });
287                // Update the parameters
288                optimizing_val = &optimizing_val - Vector::new(vec_data);
289                // Set the end cost (this is only used after the last iteration)
290                end_cost += cost;
291            }
292            end_cost /= inputs.rows() as f64;
293
294            // Early stopping
295            if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
296                break;
297            } else {
298                // Update the cost
299                start_iter_cost = end_cost;
300            }
301        }
302        optimizing_val.into_vec()
303    }
304}
305
306/// RMSProp 
307///
308/// The RMSProp algorithm (Hinton et al. 2012).
309#[derive(Debug, Clone, Copy)]
310pub struct RMSProp {
311    /// The base step size of gradient descent steps 
312    learning_rate: f64,
313    /// Rate at which running total of average square gradients decays
314    decay_rate: f64,
315    /// Small value used to avoid divide by zero
316    epsilon: f64,
317    /// The number of passes through the data
318    iters: usize,
319}
320
321/// The default RMSProp configuration
322///
323/// The defaults are:
324///
325/// - learning_rate = 0.01
326/// - decay_rate = 0.9
327/// - epsilon = 1.0e-5
328/// - iters = 50
329impl Default for RMSProp {
330    fn default() -> RMSProp {
331        RMSProp {
332            learning_rate: 0.01,
333            decay_rate: 0.9,
334            epsilon: 1.0e-5,
335            iters: 50
336        }
337    }
338}
339
340impl RMSProp {
341    /// Construct an RMSProp algorithm.
342    ///
343    /// Requires learning rate, decay rate, epsilon, and iteration count.
344    ///
345    /// #Examples
346    ///
347    /// ```
348    /// use rusty_machine::learning::optim::grad_desc::RMSProp;
349    ///
350    /// let rms = RMSProp::new(0.99, 0.01, 1e-5, 20);
351    /// ```
352    pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
353        assert!(0f64 < learning_rate, "The learning rate must be positive");
354        assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
355        assert!(0f64 < epsilon, "Epsilon must be positive");
356
357        RMSProp {
358            decay_rate: decay_rate,
359            learning_rate: learning_rate,
360            epsilon: epsilon,
361            iters: iters
362        }
363    }
364}
365
366impl<M> OptimAlgorithm<M> for RMSProp
367    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
368    fn optimize(&self,
369                model: &M,
370                start: &[f64],
371                inputs: &M::Inputs,
372                targets: &M::Targets)
373                -> Vec<f64> {
374        // Initial parameters
375        let mut params = Vector::new(start.to_vec());
376        // Running average of squared gradients
377        let mut rmsprop_cache = Vector::zeros(start.len());
378
379        // Set up indices for permutation
380        let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
381        // The cost from the previous iteration
382        let mut prev_cost = 0f64;
383
384        for _ in 0..self.iters {
385            // The cost at end of each pass
386            let mut end_cost = 0f64;
387            // Permute the vertices
388            rand_utils::in_place_fisher_yates(&mut permutation);
389            for i in &permutation {
390                let (cost, grad) = model.compute_grad(params.data(),
391                                                      &inputs.select_rows(&[*i]),
392                                                      &targets.select_rows(&[*i]));
393
394                let mut grad = Vector::new(grad);
395                let grad_squared = grad.clone().apply(&|x| x*x);
396                // Update cached average of squared gradients
397                rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
398                // RMSProp update rule 
399                utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| {
400                    *x = *x * self.learning_rate / (y + self.epsilon).sqrt();
401                });
402                params = &params - &grad;
403
404                end_cost += cost;
405            }
406            end_cost /= inputs.rows() as f64;
407
408            // Early stopping
409            if (prev_cost - end_cost).abs() < LEARNING_EPS {
410                break;
411            } else {
412                prev_cost = end_cost;
413            }
414        }
415        params.into_vec()
416    }
417}
418
419#[cfg(test)]
420mod tests {
421
422    use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};
423
424    #[test]
425    #[should_panic]
426    fn gd_neg_stepsize() {
427        let _ = GradientDesc::new(-0.5, 0);
428    }
429
430    #[test]
431    #[should_panic]
432    fn stochastic_gd_neg_momentum() {
433        let _ = StochasticGD::new(-0.5, 1f64, 0);
434    }
435
436    #[test]
437    #[should_panic]
438    fn stochastic_gd_neg_stepsize() {
439        let _ = StochasticGD::new(0.5, -1f64, 0);
440    }
441
442    #[test]
443    #[should_panic]
444    fn adagrad_neg_stepsize() {
445        let _ = AdaGrad::new(-0.5, 1f64, 0);
446    }
447
448    #[test]
449    #[should_panic]
450    fn adagrad_neg_adaptive_scale() {
451        let _ = AdaGrad::new(0.5, -1f64, 0);
452    }
453
454    #[test]
455    #[should_panic]
456    fn rmsprop_neg_decay_rate() {
457        let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
458    }
459
460    #[test]
461    #[should_panic]
462    fn rmsprop_neg_epsilon() {
463        let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
464    }
465
466    #[test]
467    #[should_panic]
468    fn rmsprop_neg_learning_rate() {
469        let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
470    }
471}