use crate::{linear_algebra::Vector, Cost, Model, Teacher};
use serde_derive::{Deserialize, Serialize};
/// Hyperbolically decaying learning-rate schedule.
///
/// Returns `start / (1 + num_events / t)`: the rate equals `start` before any
/// event has been seen and halves once `num_events` reaches `t`.
fn annealed_learning_rate(num_events: usize, start: f64, t: f64) -> f64 {
    let decay = 1.0 + num_events as f64 / t;
    start / decay
}
/// Derivative of the cost with respect to a single model coefficient.
///
/// Computes the dot product of the cost function's outer derivative (taken at
/// `prediction` for the observed `truth`) with the derivative of the model
/// output with respect to that coefficient.
fn gradient<T, P, C>(cost: &C, prediction: &P, truth: T, derivative_of_model: &P) -> f64
where
    C: Cost<T, P>,
    P: Vector,
{
    // `prediction` is already `&P`; the original wrote `&prediction`, which
    // produced a needless `&&P` that only compiled via deref coercion.
    cost.outer_derivative(prediction, truth)
        .dot(derivative_of_model)
}
/// Teacher performing stochastic gradient descent with a constant learning rate.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct GradientDescent {
    /// Fixed step width applied to every coefficient update.
    pub learning_rate: f64,
}
impl<M> Teacher<M> for GradientDescent
where
    M: Model,
    M::Target: Vector,
{
    /// Plain gradient descent keeps no state between events.
    type Training = ();

    fn new_training(&self, _: &M) {}

    fn teach_event<Y, C>(
        &self,
        _training: &mut (),
        model: &mut M,
        cost: &C,
        features: &M::Features,
        truth: Y,
    ) where
        C: Cost<Y, M::Target>,
        Y: Copy,
    {
        let prediction = model.predict(features);
        // Step every coefficient against its cost gradient, scaled by the
        // fixed learning rate.
        for ci in 0..model.num_coefficients() {
            let step = self.learning_rate
                * gradient(cost, &prediction, truth, &model.gradient(ci, features));
            *model.coefficient(ci) -= step;
        }
    }
}
/// Teacher performing stochastic gradient descent with an annealed
/// (hyperbolically decaying) learning rate.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct GradientDescentAl {
    /// Initial learning rate, used for the very first event.
    pub l0: f64,
    /// Annealing time scale: after `t` events the rate has halved.
    pub t: f64,
}
impl<M> Teacher<M> for GradientDescentAl
where
    M: Model,
    M::Target: Vector,
{
    /// Number of events seen so far; drives the annealing schedule.
    type Training = usize;

    fn new_training(&self, _: &M) -> usize {
        0
    }

    fn teach_event<Y, C>(
        &self,
        num_events: &mut usize,
        model: &mut M,
        cost: &C,
        features: &M::Features,
        truth: Y,
    ) where
        C: Cost<Y, M::Target>,
        Y: Copy,
    {
        let prediction = model.predict(features);
        // The effective rate shrinks as more events are observed.
        let lr = annealed_learning_rate(*num_events, self.l0, self.t);
        for ci in 0..model.num_coefficients() {
            let step = lr * gradient(cost, &prediction, truth, &model.gradient(ci, features));
            *model.coefficient(ci) -= step;
        }
        *num_events += 1;
    }
}
/// Teacher performing gradient descent with classical momentum and an
/// annealed learning rate.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Momentum {
    /// Initial learning rate, used for the very first event.
    pub l0: f64,
    /// Annealing time scale: after `t` events the rate has halved.
    pub t: f64,
    /// Fraction of the previous velocity carried over into each update.
    pub inertia: f64,
}
impl<M> Teacher<M> for Momentum
where
    M: Model,
    M::Target: Vector,
{
    /// Event counter (for annealing) plus one velocity term per coefficient.
    type Training = (usize, Vec<f64>);

    fn new_training(&self, model: &M) -> (usize, Vec<f64>) {
        (0, vec![0.0; model.num_coefficients()])
    }

    fn teach_event<Y, C>(
        &self,
        training: &mut (usize, Vec<f64>),
        model: &mut M,
        cost: &C,
        features: &M::Features,
        truth: Y,
    ) where
        C: Cost<Y, M::Target>,
        Y: Copy,
    {
        let (num_events, velocity) = training;
        let prediction = model.predict(features);
        let lr = annealed_learning_rate(*num_events, self.l0, self.t);
        for ci in 0..model.num_coefficients() {
            let g = gradient(cost, &prediction, truth, &model.gradient(ci, features));
            // Blend the new gradient step into the running velocity, then
            // move the coefficient by the updated velocity.
            velocity[ci] = self.inertia * velocity[ci] - lr * g;
            *model.coefficient(ci) += velocity[ci];
        }
        *num_events += 1;
    }
}
/// Teacher performing gradient descent with Nesterov momentum and an
/// annealed learning rate.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Nesterov {
    /// Initial learning rate, used for the very first event.
    pub l0: f64,
    /// Annealing time scale: after `t` events the rate has halved.
    pub t: f64,
    /// Fraction of the previous velocity carried over into each update.
    pub inertia: f64,
}
impl<M> Teacher<M> for Nesterov
where
    M: Model,
    M::Target: Vector,
{
    /// Event counter (for annealing) plus one velocity term per coefficient.
    type Training = (usize, Vec<f64>);

    fn new_training(&self, model: &M) -> (usize, Vec<f64>) {
        (0, vec![0.0; model.num_coefficients()])
    }

    fn teach_event<Y, C>(
        &self,
        training: &mut (usize, Vec<f64>),
        model: &mut M,
        cost: &C,
        features: &M::Features,
        truth: Y,
    ) where
        C: Cost<Y, M::Target>,
        Y: Copy,
    {
        let (num_events, velocity) = training;
        let learning_rate = annealed_learning_rate(*num_events, self.l0, self.t);
        // First apply the accumulated velocity, moving the coefficients to
        // the "lookahead" position.
        for ci in 0..model.num_coefficients() {
            *model.coefficient(ci) += velocity[ci];
        }
        // FIX: Nesterov momentum evaluates the gradient at the lookahead
        // position, so the prediction must be computed AFTER the velocity
        // step. The original computed it before the step, so the cost's
        // outer derivative was taken at the old coefficients, degenerating
        // to (roughly) classical momentum.
        let prediction = model.predict(features);
        for ci in 0..model.num_coefficients() {
            let delta =
                -learning_rate * gradient(cost, &prediction, truth, &model.gradient(ci, features));
            *model.coefficient(ci) += delta;
            velocity[ci] = self.inertia * velocity[ci] + delta;
        }
        *num_events += 1;
    }
}
/// Teacher performing the Adagrad update rule (per-coefficient adaptive
/// learning rates).
///
/// NOTE(review): the public name `Adagard` is a misspelling of "Adagrad";
/// it is kept as-is for backward compatibility with existing callers.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Adagard {
    /// Base learning rate, divided per coefficient by the root of the
    /// accumulated squared gradients.
    pub learning_rate: f64,
    /// Initial value of each squared-gradient accumulator; keeps the very
    /// first division away from zero.
    pub epsilon: f64,
}
impl<M> Teacher<M> for Adagard
where
    M: Model,
    M::Target: Vector,
{
    /// Running sum of squared gradients, one entry per coefficient,
    /// seeded with `epsilon` so the first division is well defined.
    type Training = Vec<f64>;

    fn new_training(&self, model: &M) -> Vec<f64> {
        vec![self.epsilon; model.num_coefficients()]
    }

    fn teach_event<Y, C>(
        &self,
        squared_gradients: &mut Vec<f64>,
        model: &mut M,
        cost: &C,
        features: &M::Features,
        truth: Y,
    ) where
        C: Cost<Y, M::Target>,
        Y: Copy,
    {
        let prediction = model.predict(features);
        for ci in 0..model.num_coefficients() {
            // `g` avoids shadowing the free function `gradient`.
            let g = gradient(cost, &prediction, truth, &model.gradient(ci, features));
            // Scale the step down by the root of the accumulated history,
            // so frequently-updated coefficients take smaller steps.
            let step = self.learning_rate * g / squared_gradients[ci].sqrt();
            *model.coefficient(ci) -= step;
            squared_gradients[ci] += g * g;
        }
    }
}