use learning::optim::{Optimizable, OptimAlgorithm};
use linalg::Vector;
use linalg::{Matrix, BaseMatrix};
use rulinalg::utils;
use learning::toolkit::rand_utils;
/// Convergence threshold: when the absolute change in (mean) cost between
/// two successive iterations drops below this value, the optimizers below
/// stop early.
const LEARNING_EPS: f64 = 1e-20;
/// Batch gradient descent optimization algorithm.
#[derive(Clone, Copy, Debug)]
pub struct GradientDesc {
    /// The step size (learning rate) applied to each gradient update.
    alpha: f64,
    /// The number of iterations to run.
    iters: usize,
}
impl Default for GradientDesc {
    /// The default gradient descent configuration:
    /// a step size of 0.3 over 100 iterations.
    fn default() -> GradientDesc {
        GradientDesc {
            iters: 100,
            alpha: 0.3,
        }
    }
}
impl GradientDesc {
    /// Constructs a gradient descent algorithm with the given step size
    /// and iteration count.
    ///
    /// # Panics
    ///
    /// Panics if `alpha` is not strictly positive.
    pub fn new(alpha: f64, iters: usize) -> GradientDesc {
        assert!(alpha > 0f64,
                "The step size (alpha) must be greater than 0.");

        GradientDesc {
            iters: iters,
            alpha: alpha,
        }
    }
}
impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
    /// Runs full-batch gradient descent starting from `start`.
    ///
    /// Performs up to `self.iters` passes, stepping against the full
    /// gradient scaled by `self.alpha`, and stops early once the cost
    /// change between passes falls below `LEARNING_EPS`.
    fn optimize(&self,
                model: &M,
                start: &[f64],
                inputs: &M::Inputs,
                targets: &M::Targets)
                -> Vec<f64> {
        let mut params = Vector::new(start.to_vec());
        // Cost from the previous pass; starts at 0, so a model whose
        // initial cost is already ~0 terminates immediately.
        let mut prev_cost = 0f64;

        for _ in 0..self.iters {
            let (cost, grad) = model.compute_grad(params.data(), inputs, targets);

            // Converged: cost no longer changing between passes.
            if (prev_cost - cost).abs() < LEARNING_EPS {
                break;
            }

            // Step in the negative gradient direction.
            params = &params - Vector::new(grad) * self.alpha;
            prev_cost = cost;
        }

        params.into_vec()
    }
}
/// Stochastic gradient descent with momentum.
#[derive(Clone, Copy, Debug)]
pub struct StochasticGD {
    /// Momentum coefficient applied to the previous update direction.
    alpha: f64,
    /// Step size (learning rate) applied to the gradient.
    mu: f64,
    /// The number of passes over the training data.
    iters: usize,
}
impl Default for StochasticGD {
    /// The default stochastic gradient descent configuration:
    /// momentum 0.1, step size 0.1, over 20 passes.
    fn default() -> StochasticGD {
        StochasticGD {
            iters: 20,
            mu: 0.1,
            alpha: 0.1,
        }
    }
}
impl StochasticGD {
    /// Constructs a stochastic gradient descent algorithm from the
    /// momentum coefficient (`alpha`), the step size (`mu`) and the
    /// iteration count.
    ///
    /// # Panics
    ///
    /// Panics if `alpha` or `mu` is not strictly positive.
    pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
        assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
        assert!(mu > 0f64, "The step size (mu) must be greater than 0.");

        StochasticGD {
            iters: iters,
            mu: mu,
            alpha: alpha,
        }
    }
}
impl<M> OptimAlgorithm<M> for StochasticGD
    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
{
    /// Stochastic gradient descent with momentum.
    ///
    /// Shuffles the training rows before every pass and updates the
    /// parameters one example at a time, carrying a momentum term
    /// between updates. Stops early once the mean per-example cost of a
    /// pass changes by less than `LEARNING_EPS` from the previous pass.
    fn optimize(&self,
                model: &M,
                start: &[f64],
                inputs: &M::Inputs,
                targets: &M::Targets)
                -> Vec<f64> {
        // Parameters being optimized, seeded from `start`.
        let mut optimizing_val = Vector::new(start.to_vec());
        // Momentum accumulator: the previous weighted update direction.
        let mut delta_w = Vector::zeros(start.len());
        // Row indices, reshuffled in place at the start of every pass.
        let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
        // Mean cost of the previous pass; starts at 0, so a model whose
        // first pass already averages ~0 cost stops after one pass.
        let mut start_iter_cost = 0f64;

        for _ in 0..self.iters {
            let mut end_cost = 0f64;
            rand_utils::in_place_fisher_yates(&mut permutation);
            for i in &permutation {
                // Gradient computed on a single (input, target) row.
                let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
                                                          &inputs.select_rows(&[*i]),
                                                          &targets.select_rows(&[*i]));
                // Momentum update: new direction = mu * grad + alpha * previous.
                delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
                // NOTE(review): `mu` is applied a second time here, so the raw
                // gradient contribution is effectively scaled by mu^2 — confirm
                // this double scaling is intentional.
                optimizing_val = &optimizing_val - &delta_w * self.mu;
                end_cost += cost;
            }
            end_cost /= inputs.rows() as f64;

            // Convergence check on the mean per-example cost.
            if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
                break;
            } else {
                start_iter_cost = end_cost;
            }
        }
        optimizing_val.into_vec()
    }
}
/// Adaptive gradient (AdaGrad) optimization algorithm.
#[derive(Debug)]
pub struct AdaGrad {
    /// Base step size applied to each rescaled gradient.
    alpha: f64,
    /// Additive constant in the denominator of the gradient rescaling,
    /// guarding against division by a tiny accumulated history.
    tau: f64,
    /// The number of passes over the training data.
    iters: usize,
}
impl AdaGrad {
    /// Constructs an AdaGrad algorithm from the step size (`alpha`),
    /// the adaptive constant (`tau`) and the iteration count.
    ///
    /// # Panics
    ///
    /// Panics if `alpha` is not strictly positive or `tau` is negative.
    pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
        assert!(alpha > 0f64,
                "The step size (alpha) must be greater than 0.");
        assert!(tau >= 0f64,
                "The adaptive constant (tau) cannot be negative.");

        AdaGrad {
            iters: iters,
            tau: tau,
            alpha: alpha,
        }
    }
}
impl Default for AdaGrad {
    /// The default AdaGrad configuration:
    /// step size 1.0, adaptive constant 3.0, over 100 passes.
    fn default() -> AdaGrad {
        AdaGrad {
            iters: 100,
            tau: 3f64,
            alpha: 1f64,
        }
    }
}
impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
    /// AdaGrad stochastic optimization.
    ///
    /// Shuffles the training rows before every pass and updates one
    /// example at a time. Each parameter's step is divided by the square
    /// root of its accumulated squared-gradient history, so parameters
    /// with large past gradients take smaller steps. Stops early once
    /// the mean per-example cost changes by less than `LEARNING_EPS`.
    fn optimize(&self,
                model: &M,
                start: &[f64],
                inputs: &M::Inputs,
                targets: &M::Targets)
                -> Vec<f64> {
        // Per-parameter running sum of squared gradients.
        let mut ada_s = Vector::zeros(start.len());
        // Parameters being optimized, seeded from `start`.
        let mut optimizing_val = Vector::new(start.to_vec());
        // Row indices, reshuffled in place at the start of every pass.
        let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
        // Mean cost of the previous pass (starts at 0).
        let mut start_iter_cost = 0f64;

        for _ in 0..self.iters {
            let mut end_cost = 0f64;
            rand_utils::in_place_fisher_yates(&mut permutation);
            for i in &permutation {
                // Gradient computed on a single (input, target) row.
                let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
                                                              &inputs.select_rows(&[*i]),
                                                              &targets.select_rows(&[*i]));
                // Accumulate this gradient's square into the history.
                utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y);
                // Rescale the gradient in place: alpha * g / (tau + sqrt(history)).
                utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| {
                    *x = self.alpha * (*x / (self.tau + (y).sqrt()))
                });
                optimizing_val = &optimizing_val - Vector::new(vec_data);
                end_cost += cost;
            }
            end_cost /= inputs.rows() as f64;

            // Convergence check on the mean per-example cost.
            if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
                break;
            } else {
                start_iter_cost = end_cost;
            }
        }
        optimizing_val.into_vec()
    }
}
/// RMSProp optimization algorithm.
#[derive(Debug, Clone, Copy)]
pub struct RMSProp {
    /// Base step size applied to each rescaled gradient.
    learning_rate: f64,
    /// Exponential decay applied to the running average of squared gradients.
    decay_rate: f64,
    /// Small constant guarding against division by zero.
    epsilon: f64,
    /// The number of passes over the training data.
    iters: usize,
}
impl Default for RMSProp {
    /// The default RMSProp configuration: learning rate 0.01,
    /// decay rate 0.9, epsilon 1e-5, over 50 passes.
    fn default() -> RMSProp {
        RMSProp {
            iters: 50,
            epsilon: 1.0e-5,
            decay_rate: 0.9,
            learning_rate: 0.01,
        }
    }
}
impl RMSProp {
    /// Constructs an RMSProp algorithm from the learning rate, the
    /// decay rate of the squared-gradient average, the epsilon guard
    /// and the iteration count.
    ///
    /// # Panics
    ///
    /// Panics if `learning_rate` or `epsilon` is not strictly positive,
    /// or if `decay_rate` is outside the open interval (0, 1).
    pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
        assert!(0f64 < learning_rate, "The learning rate must be positive");
        assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
        assert!(0f64 < epsilon, "Epsilon must be positive");

        RMSProp {
            learning_rate: learning_rate,
            decay_rate: decay_rate,
            epsilon: epsilon,
            iters: iters,
        }
    }
}
impl<M> OptimAlgorithm<M> for RMSProp
    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
{
    /// RMSProp stochastic optimization.
    ///
    /// Shuffles the training rows before every pass and updates one
    /// example at a time, dividing each parameter's step by the square
    /// root of an exponentially decayed average of its squared
    /// gradients. Stops early once the mean per-example cost changes by
    /// less than `LEARNING_EPS` between passes.
    fn optimize(&self,
                model: &M,
                start: &[f64],
                inputs: &M::Inputs,
                targets: &M::Targets)
                -> Vec<f64> {
        let mut weights = Vector::new(start.to_vec());
        // Exponentially decayed average of squared gradients.
        let mut sq_grad_avg = Vector::zeros(start.len());
        // Row indices, reshuffled in place at the start of every pass.
        let mut order = (0..inputs.rows()).collect::<Vec<_>>();
        // Mean cost of the previous pass (starts at 0).
        let mut last_cost = 0f64;

        for _ in 0..self.iters {
            rand_utils::in_place_fisher_yates(&mut order);

            let mut pass_cost = 0f64;
            for row in &order {
                // Gradient computed on a single (input, target) row.
                let (cost, grad) = model.compute_grad(weights.data(),
                                                      &inputs.select_rows(&[*row]),
                                                      &targets.select_rows(&[*row]));
                pass_cost += cost;

                let mut grad = Vector::new(grad);
                let grad_squared = grad.clone().apply(&|x| x * x);
                // Decay the cache towards the newest squared gradient.
                sq_grad_avg = &sq_grad_avg * self.decay_rate +
                              &grad_squared * (1.0 - self.decay_rate);
                // Per-parameter step: g * learning_rate / sqrt(cache + epsilon).
                utils::in_place_vec_bin_op(grad.mut_data(), sq_grad_avg.data(), |x, &y| {
                    *x = *x * self.learning_rate / (y + self.epsilon).sqrt();
                });
                weights = &weights - &grad;
            }
            pass_cost /= inputs.rows() as f64;

            // Convergence check on the mean per-example cost.
            if (last_cost - pass_cost).abs() < LEARNING_EPS {
                break;
            }
            last_cost = pass_cost;
        }

        weights.into_vec()
    }
}
#[cfg(test)]
mod tests {
    use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};

    // Each constructor must reject invalid hyper-parameters by panicking.
    //
    // Note: the two RMSProp tests below previously had their names swapped —
    // RMSProp::new(learning_rate, decay_rate, epsilon, iters) takes the
    // learning rate FIRST, so a negative first argument exercises the
    // learning-rate assertion, not the decay-rate one.

    #[test]
    #[should_panic]
    fn gd_neg_stepsize() {
        let _ = GradientDesc::new(-0.5, 0);
    }

    #[test]
    #[should_panic]
    fn stochastic_gd_neg_momentum() {
        let _ = StochasticGD::new(-0.5, 1f64, 0);
    }

    #[test]
    #[should_panic]
    fn stochastic_gd_neg_stepsize() {
        let _ = StochasticGD::new(0.5, -1f64, 0);
    }

    #[test]
    #[should_panic]
    fn adagrad_neg_stepsize() {
        let _ = AdaGrad::new(-0.5, 1f64, 0);
    }

    #[test]
    #[should_panic]
    fn adagrad_neg_adaptive_scale() {
        let _ = AdaGrad::new(0.5, -1f64, 0);
    }

    #[test]
    #[should_panic]
    fn rmsprop_neg_learning_rate() {
        // Negative FIRST argument: the learning rate.
        let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
    }

    #[test]
    #[should_panic]
    fn rmsprop_neg_epsilon() {
        let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
    }

    #[test]
    #[should_panic]
    fn rmsprop_neg_decay_rate() {
        // Negative SECOND argument: the decay rate.
        let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
    }
}