use super::error::TensorError;
use super::tensor::GpuTensor;
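/// Common interface for the gradient-based optimizers in this module.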
pub trait Optimizer {
    /// Apply one optimization step: consume the gradients currently attached
    /// to `params` and update their data in place.
    fn step(&mut self, params: &mut [GpuTensor]) -> Result<(), TensorError>;
    /// Clear all parameter gradients. The default implementation simply
    /// calls `zero_grad` on each tensor.
    fn zero_grad(&mut self, params: &mut [GpuTensor]) {
for p in params.iter_mut() {
p.zero_grad();
}
}
}
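/// Stochastic gradient descent with optional momentum, dampening, weight
/// decay (coupled L2), and Nesterov look-ahead, configured through a fluent
/// builder API (`Sgd::new(lr).with_momentum(0.9)` and so on).
///
/// With momentum `mu` and dampening `tau`, the velocity update is
/// `v <- mu * v + (1 - tau) * g`, and the parameter step is `-lr * v`
/// (or `-lr * (g + mu * v)` when Nesterov is enabled).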
#[derive(Debug, Clone)]
pub struct Sgd {
pub lr: f64,
pub momentum: f64,
pub dampening: f64,
pub weight_decay: f64,
pub nesterov: bool,
velocities: Vec<Vec<f64>>,
step_count: u64,
}
impl Sgd {
#[must_use]
pub fn new(lr: f64) -> Self {
Self {
lr,
momentum: 0.0,
dampening: 0.0,
weight_decay: 0.0,
nesterov: false,
velocities: Vec::new(),
step_count: 0,
}
}
#[must_use]
pub fn with_momentum(mut self, momentum: f64) -> Self {
self.momentum = momentum;
self
}
#[must_use]
pub fn with_dampening(mut self, dampening: f64) -> Self {
self.dampening = dampening;
self
}
#[must_use]
pub fn with_weight_decay(mut self, wd: f64) -> Self {
self.weight_decay = wd;
self
}
#[must_use]
pub fn with_nesterov(mut self, nesterov: bool) -> Self {
self.nesterov = nesterov;
self
}
}
impl Optimizer for Sgd {
fn step(&mut self, params: &mut [GpuTensor]) -> Result<(), TensorError> {
if self.velocities.len() < params.len() {
self.velocities.resize(params.len(), Vec::new());
}
for (i, param) in params.iter_mut().enumerate() {
let grad = match param.grad() {
Some(g) => g.host_data().to_vec(),
None => continue,
};
            // (Re)initialize the velocity buffer when this parameter is new
            // or has been resized; the freshness flag also drives the
            // first-step momentum seeding below.
            let fresh_buffer = self.velocities[i].len() != grad.len();
            if fresh_buffer {
                self.velocities[i] = vec![0.0; grad.len()];
            }
let mut d_p = grad;
            // Coupled L2 weight decay: add `wd * w` to the gradient before
            // the momentum update.
            if self.weight_decay != 0.0 {
                for (dp, &p) in d_p.iter_mut().zip(param.host_data.iter()) {
                    *dp += self.weight_decay * p;
                }
            }
            if self.momentum != 0.0 {
                if fresh_buffer {
                    // First update for this buffer: seed the velocity with the
                    // raw gradient, the conventional momentum initialization.
                    // (Keying this off the buffer rather than the global step
                    // count keeps parameters whose gradients appear late from
                    // getting a dampened first update.)
                    self.velocities[i].copy_from_slice(&d_p);
                } else {
                    for (v, &dp) in self.velocities[i].iter_mut().zip(d_p.iter()) {
                        *v = self.momentum * *v + (1.0 - self.dampening) * dp;
                    }
                }
                if self.nesterov {
                    // Nesterov momentum: step along the gradient plus the
                    // momentum-scaled velocity (a look-ahead step).
                    for (dp, v) in d_p.iter_mut().zip(self.velocities[i].iter()) {
                        *dp += self.momentum * v;
                    }
                } else {
                    d_p.clone_from(&self.velocities[i]);
                }
            }
for (p, &dp) in param.host_data.iter_mut().zip(d_p.iter()) {
*p -= self.lr * dp;
}
}
self.step_count += 1;
Ok(())
}
}
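/// Adam optimizer: per-coordinate steps scaled by bias-corrected running
/// moments of the gradient,
/// `m <- beta1 * m + (1 - beta1) * g`, `v <- beta2 * v + (1 - beta2) * g^2`,
/// `theta <- theta - lr * m_hat / (sqrt(v_hat) + eps)`.
///
/// Weight decay here is decoupled (AdamW-style): it multiplies the
/// parameters directly instead of being added to the gradient. The optional
/// AMSGrad variant uses a running maximum of `v_hat` in the denominator.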
#[derive(Debug, Clone)]
pub struct Adam {
pub lr: f64,
pub beta1: f64,
pub beta2: f64,
pub eps: f64,
pub weight_decay: f64,
pub amsgrad: bool,
m: Vec<Vec<f64>>,
v: Vec<Vec<f64>>,
v_hat_max: Vec<Vec<f64>>,
t: u64,
}
impl Adam {
#[must_use]
pub fn new(lr: f64) -> Self {
Self {
lr,
beta1: 0.9,
beta2: 0.999,
eps: 1e-8,
weight_decay: 0.0,
amsgrad: false,
m: Vec::new(),
v: Vec::new(),
v_hat_max: Vec::new(),
t: 0,
}
}
#[must_use]
pub fn with_betas(mut self, beta1: f64, beta2: f64) -> Self {
self.beta1 = beta1;
self.beta2 = beta2;
self
}
#[must_use]
pub fn with_eps(mut self, eps: f64) -> Self {
self.eps = eps;
self
}
#[must_use]
pub fn with_weight_decay(mut self, wd: f64) -> Self {
self.weight_decay = wd;
self
}
#[must_use]
pub fn with_amsgrad(mut self, amsgrad: bool) -> Self {
self.amsgrad = amsgrad;
self
}
}
impl Optimizer for Adam {
#[allow(clippy::needless_range_loop)]
fn step(&mut self, params: &mut [GpuTensor]) -> Result<(), TensorError> {
self.t += 1;
if self.m.len() < params.len() {
self.m.resize(params.len(), Vec::new());
self.v.resize(params.len(), Vec::new());
self.v_hat_max.resize(params.len(), Vec::new());
}
let bias_correction1 = 1.0 - self.beta1.powi(self.t as i32);
let bias_correction2 = 1.0 - self.beta2.powi(self.t as i32);
for (i, param) in params.iter_mut().enumerate() {
let grad = match param.grad() {
Some(g) => g.host_data().to_vec(),
None => continue,
};
let n = grad.len();
            if self.m[i].len() != n {
                self.m[i] = vec![0.0; n];
                self.v[i] = vec![0.0; n];
            }
            // Size the AMSGrad running maximum separately, so it stays valid
            // even if `amsgrad` is enabled after the first step.
            if self.amsgrad && self.v_hat_max[i].len() != n {
                self.v_hat_max[i] = vec![0.0; n];
            }
            // Decoupled weight decay (AdamW-style): decay the parameters
            // directly rather than adding `wd * w` to the gradient.
            if self.weight_decay != 0.0 {
                for p in param.host_data.iter_mut() {
                    *p *= 1.0 - self.lr * self.weight_decay;
                }
            }
            // Exponential moving averages of the gradient and its square.
            for j in 0..n {
                self.m[i][j] = self.beta1 * self.m[i][j] + (1.0 - self.beta1) * grad[j];
                self.v[i][j] = self.beta2 * self.v[i][j] + (1.0 - self.beta2) * grad[j] * grad[j];
            }
            for j in 0..n {
                // Bias-corrected first and second moment estimates.
                let m_hat = self.m[i][j] / bias_correction1;
                let v_hat = self.v[i][j] / bias_correction2;
                let denom = if self.amsgrad {
                    // AMSGrad: a running maximum of v_hat keeps the effective
                    // step size from growing when v_hat shrinks.
                    self.v_hat_max[i][j] = self.v_hat_max[i][j].max(v_hat);
                    self.v_hat_max[i][j].sqrt() + self.eps
                } else {
                    v_hat.sqrt() + self.eps
                };
                param.host_data[j] -= self.lr * m_hat / denom;
            }
}
Ok(())
}
}
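/// AdaGrad: accumulates squared gradients per coordinate and divides the
/// step by their root, `theta <- theta - lr * g / (sqrt(sum g^2) + eps)`,
/// so frequently updated coordinates receive progressively smaller steps.
/// An optional schedule decays the learning rate as
/// `lr / (1 + (t - 1) * lr_decay)`.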
#[derive(Debug, Clone)]
pub struct AdaGrad {
pub lr: f64,
pub lr_decay: f64,
pub eps: f64,
pub weight_decay: f64,
sum_sq: Vec<Vec<f64>>,
t: u64,
}
impl AdaGrad {
#[must_use]
pub fn new(lr: f64) -> Self {
Self {
lr,
lr_decay: 0.0,
eps: 1e-10,
weight_decay: 0.0,
sum_sq: Vec::new(),
t: 0,
}
}
#[must_use]
pub fn with_lr_decay(mut self, decay: f64) -> Self {
self.lr_decay = decay;
self
}
#[must_use]
pub fn with_eps(mut self, eps: f64) -> Self {
self.eps = eps;
self
}
#[must_use]
pub fn with_weight_decay(mut self, wd: f64) -> Self {
self.weight_decay = wd;
self
}
}
impl Optimizer for AdaGrad {
#[allow(clippy::needless_range_loop)]
fn step(&mut self, params: &mut [GpuTensor]) -> Result<(), TensorError> {
        self.t += 1;
        // Learning-rate decay schedule: lr / (1 + (t - 1) * lr_decay).
        let clr = self.lr / (1.0 + (self.t - 1) as f64 * self.lr_decay);
if self.sum_sq.len() < params.len() {
self.sum_sq.resize(params.len(), Vec::new());
}
for (i, param) in params.iter_mut().enumerate() {
let mut grad = match param.grad() {
Some(g) => g.host_data().to_vec(),
None => continue,
};
let n = grad.len();
if self.sum_sq[i].len() != n {
self.sum_sq[i] = vec![0.0; n];
}
if self.weight_decay != 0.0 {
for (g, &p) in grad.iter_mut().zip(param.host_data.iter()) {
*g += self.weight_decay * p;
}
}
            for j in 0..n {
                // Accumulate squared gradients; each coordinate's effective
                // step shrinks as its squared-gradient history grows.
                self.sum_sq[i][j] += grad[j] * grad[j];
                param.host_data[j] -= clr * grad[j] / (self.sum_sq[i][j].sqrt() + self.eps);
            }
}
Ok(())
}
}
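/// RMSProp: keeps an exponential moving average of squared gradients,
/// `v <- alpha * v + (1 - alpha) * g^2`, and steps by
/// `-lr * g / (sqrt(v) + eps)`. The centered variant additionally tracks
/// the mean gradient and uses the variance estimate `v - avg^2` in the
/// denominator; optional momentum is applied to the preconditioned step.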
#[derive(Debug, Clone)]
pub struct RmsProp {
pub lr: f64,
pub alpha: f64,
pub eps: f64,
pub momentum: f64,
pub weight_decay: f64,
pub centered: bool,
v: Vec<Vec<f64>>,
g_avg: Vec<Vec<f64>>,
buf: Vec<Vec<f64>>,
}
impl RmsProp {
#[must_use]
pub fn new(lr: f64) -> Self {
Self {
lr,
alpha: 0.99,
eps: 1e-8,
momentum: 0.0,
weight_decay: 0.0,
centered: false,
v: Vec::new(),
g_avg: Vec::new(),
buf: Vec::new(),
}
}
#[must_use]
pub fn with_alpha(mut self, alpha: f64) -> Self {
self.alpha = alpha;
self
}
#[must_use]
pub fn with_momentum(mut self, momentum: f64) -> Self {
self.momentum = momentum;
self
}
#[must_use]
pub fn with_centered(mut self, centered: bool) -> Self {
self.centered = centered;
self
}
#[must_use]
pub fn with_weight_decay(mut self, wd: f64) -> Self {
self.weight_decay = wd;
self
}
}
impl Optimizer for RmsProp {
#[allow(clippy::needless_range_loop)]
fn step(&mut self, params: &mut [GpuTensor]) -> Result<(), TensorError> {
if self.v.len() < params.len() {
self.v.resize(params.len(), Vec::new());
self.g_avg.resize(params.len(), Vec::new());
self.buf.resize(params.len(), Vec::new());
}
for (i, param) in params.iter_mut().enumerate() {
let mut grad = match param.grad() {
Some(g) => g.host_data().to_vec(),
None => continue,
};
let n = grad.len();
if self.v[i].len() != n {
self.v[i] = vec![0.0; n];
self.g_avg[i] = vec![0.0; n];
self.buf[i] = vec![0.0; n];
}
if self.weight_decay != 0.0 {
for (g, &p) in grad.iter_mut().zip(param.host_data.iter()) {
*g += self.weight_decay * p;
}
}
for j in 0..n {
self.v[i][j] = self.alpha * self.v[i][j] + (1.0 - self.alpha) * grad[j] * grad[j];
}
            if self.centered {
                // Track the running mean of the gradient as well, so the
                // denominator estimates the gradient's variance.
                for j in 0..n {
                    self.g_avg[i][j] =
                        self.alpha * self.g_avg[i][j] + (1.0 - self.alpha) * grad[j];
                }
            }
            for j in 0..n {
                let v_val = if self.centered {
                    let avg = self.g_avg[i][j];
                    self.v[i][j] - avg * avg
                } else {
                    self.v[i][j]
                };
                if self.momentum > 0.0 {
                    // Momentum applied to the preconditioned gradient.
                    self.buf[i][j] =
                        self.momentum * self.buf[i][j] + grad[j] / (v_val.sqrt() + self.eps);
                    param.host_data[j] -= self.lr * self.buf[i][j];
                } else {
                    param.host_data[j] -= self.lr * grad[j] / (v_val.sqrt() + self.eps);
                }
            }
}
Ok(())
}
}
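/// LAMB (layer-wise adaptive moments): computes an Adam-style update, then
/// rescales it per parameter tensor by the trust ratio
/// `norm(w) / norm(update)`, keeping the step size proportional to the
/// weight norm, which makes large-batch training less sensitive to `lr`.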
#[derive(Debug, Clone)]
pub struct Lamb {
pub lr: f64,
pub beta1: f64,
pub beta2: f64,
pub eps: f64,
pub weight_decay: f64,
m: Vec<Vec<f64>>,
v: Vec<Vec<f64>>,
t: u64,
}
impl Lamb {
#[must_use]
pub fn new(lr: f64) -> Self {
Self {
lr,
beta1: 0.9,
beta2: 0.999,
eps: 1e-6,
weight_decay: 0.0,
m: Vec::new(),
v: Vec::new(),
t: 0,
}
}
#[must_use]
pub fn with_betas(mut self, beta1: f64, beta2: f64) -> Self {
self.beta1 = beta1;
self.beta2 = beta2;
self
}
#[must_use]
pub fn with_weight_decay(mut self, wd: f64) -> Self {
self.weight_decay = wd;
self
}
}
impl Optimizer for Lamb {
#[allow(clippy::needless_range_loop)]
fn step(&mut self, params: &mut [GpuTensor]) -> Result<(), TensorError> {
self.t += 1;
if self.m.len() < params.len() {
self.m.resize(params.len(), Vec::new());
self.v.resize(params.len(), Vec::new());
}
let bias_correction1 = 1.0 - self.beta1.powi(self.t as i32);
let bias_correction2 = 1.0 - self.beta2.powi(self.t as i32);
for (i, param) in params.iter_mut().enumerate() {
let grad = match param.grad() {
Some(g) => g.host_data().to_vec(),
None => continue,
};
let n = grad.len();
if self.m[i].len() != n {
self.m[i] = vec![0.0; n];
self.v[i] = vec![0.0; n];
}
for j in 0..n {
self.m[i][j] = self.beta1 * self.m[i][j] + (1.0 - self.beta1) * grad[j];
self.v[i][j] = self.beta2 * self.v[i][j] + (1.0 - self.beta2) * grad[j] * grad[j];
}
let mut update = vec![0.0; n];
for j in 0..n {
let m_hat = self.m[i][j] / bias_correction1;
let v_hat = self.v[i][j] / bias_correction2;
update[j] = m_hat / (v_hat.sqrt() + self.eps);
}
            if self.weight_decay != 0.0 {
                // LAMB adds decoupled weight decay to the Adam-style update
                // before the trust-ratio scaling.
                for (u, &p) in update.iter_mut().zip(param.host_data.iter()) {
                    *u += self.weight_decay * p;
                }
            }
            // Layer-wise trust ratio: scale the update by ||w|| / ||update||,
            // falling back to 1.0 when either norm vanishes.
            let w_norm: f64 = param.host_data.iter().map(|&p| p * p).sum::<f64>().sqrt();
            let u_norm: f64 = update.iter().map(|&u| u * u).sum::<f64>().sqrt();
            let trust_ratio = if w_norm > 0.0 && u_norm > 0.0 {
                w_norm / u_norm
            } else {
                1.0
            };
for (p, &u) in param.host_data.iter_mut().zip(update.iter()) {
*p -= self.lr * trust_ratio * u;
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
    // Build a 1-D parameter tensor holding `data` and attach `grad_data` as
    // its gradient, mirroring what a backward pass would produce.
    fn make_param(data: &[f64], grad_data: &[f64]) -> GpuTensor {
let mut p = GpuTensor::from_host_f64(data, &[data.len()], 0).unwrap();
p.set_requires_grad(true);
let g = GpuTensor::from_host_f64(grad_data, &[grad_data.len()], 0).unwrap();
p.accumulate_grad(&g).unwrap();
p
}
#[test]
fn test_sgd_basic() {
let mut opt = Sgd::new(0.1);
let mut params = vec![make_param(&[5.0, 10.0], &[1.0, 2.0])];
opt.step(&mut params).unwrap();
assert!((params[0].host_data[0] - 4.9).abs() < 1e-10);
assert!((params[0].host_data[1] - 9.8).abs() < 1e-10);
}
#[test]
fn test_sgd_with_momentum() {
let mut opt = Sgd::new(0.1).with_momentum(0.9);
let mut params = vec![make_param(&[5.0], &[1.0])];
opt.step(&mut params).unwrap();
assert!((params[0].host_data[0] - 4.9).abs() < 1e-10);
}
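    #[test]
    fn test_sgd_nesterov_first_step() {
        // Worked example of the Nesterov look-ahead: on the first step the
        // velocity is seeded with the gradient, so the effective step is
        // lr * (g + momentum * g) = 0.1 * 1.9 = 0.19.
        let mut opt = Sgd::new(0.1).with_momentum(0.9).with_nesterov(true);
        let mut params = vec![make_param(&[5.0], &[1.0])];
        opt.step(&mut params).unwrap();
        assert!((params[0].host_data[0] - 4.81).abs() < 1e-10);
    }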
#[test]
fn test_adam_basic() {
let mut opt = Adam::new(0.01);
let mut params = vec![make_param(&[5.0], &[1.0])];
opt.step(&mut params).unwrap();
assert!(params[0].host_data[0] < 5.0);
}
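    #[test]
    fn test_adam_first_step_magnitude() {
        // Bias correction makes the first Adam step roughly lr in size:
        // at t = 1, m_hat = g and v_hat = g^2, so the update is
        // lr * g / (|g| + eps), i.e. about 0.01 here.
        let mut opt = Adam::new(0.01);
        let mut params = vec![make_param(&[5.0], &[1.0])];
        opt.step(&mut params).unwrap();
        assert!((params[0].host_data[0] - 4.99).abs() < 1e-6);
    }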
#[test]
fn test_adam_convergence() {
let mut opt = Adam::new(0.1);
let mut params = vec![make_param(&[3.0], &[6.0])];
for _ in 0..100 {
opt.step(&mut params).unwrap();
let new_grad = 2.0 * params[0].host_data[0];
params[0].zero_grad();
let g = GpuTensor::from_host_f64(&[new_grad], &[1], 0).unwrap();
params[0].accumulate_grad(&g).unwrap();
}
assert!(params[0].host_data[0].abs() < 0.5);
}
#[test]
fn test_adagrad_basic() {
let mut opt = AdaGrad::new(0.1);
let mut params = vec![make_param(&[5.0], &[1.0])];
opt.step(&mut params).unwrap();
assert!(params[0].host_data[0] < 5.0);
}
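    #[test]
    fn test_adagrad_first_step_magnitude() {
        // With a unit gradient the accumulated square is 1 after one step,
        // so the update is close to lr: 0.1 * 1 / (1 + eps).
        let mut opt = AdaGrad::new(0.1);
        let mut params = vec![make_param(&[5.0], &[1.0])];
        opt.step(&mut params).unwrap();
        assert!((params[0].host_data[0] - 4.9).abs() < 1e-6);
    }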
#[test]
fn test_rmsprop_basic() {
let mut opt = RmsProp::new(0.01);
let mut params = vec![make_param(&[5.0], &[1.0])];
opt.step(&mut params).unwrap();
assert!(params[0].host_data[0] < 5.0);
}
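    #[test]
    fn test_rmsprop_first_step_magnitude() {
        // After one step v = (1 - alpha) * g^2 = 0.01, so the update is
        // roughly lr * g / sqrt(0.01) = 0.01 / 0.1 = 0.1.
        let mut opt = RmsProp::new(0.01);
        let mut params = vec![make_param(&[5.0], &[1.0])];
        opt.step(&mut params).unwrap();
        assert!((params[0].host_data[0] - 4.9).abs() < 1e-6);
    }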
#[test]
fn test_lamb_basic() {
let mut opt = Lamb::new(0.01);
let mut params = vec![make_param(&[5.0], &[1.0])];
opt.step(&mut params).unwrap();
assert!(params[0].host_data[0] < 5.0);
}
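    #[test]
    fn test_lamb_trust_ratio_scaling() {
        // For a single-element parameter the trust ratio is |w| / |u|, so
        // the trust-scaled step collapses to lr * |w| regardless of the
        // gradient's magnitude: 5.0 - 0.01 * 5.0 = 4.95.
        let mut opt = Lamb::new(0.01);
        let mut params = vec![make_param(&[5.0], &[1.0])];
        opt.step(&mut params).unwrap();
        assert!((params[0].host_data[0] - 4.95).abs() < 1e-10);
    }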
#[test]
fn test_zero_grad() {
let mut opt = Sgd::new(0.1);
let mut params = vec![make_param(&[5.0], &[1.0])];
assert!(params[0].grad().is_some());
opt.zero_grad(&mut params);
assert!(params[0].grad().is_none());
}
#[test]
fn test_adam_with_weight_decay() {
let mut opt = Adam::new(0.01).with_weight_decay(0.01);
let mut params = vec![make_param(&[5.0], &[0.0])];
opt.step(&mut params).unwrap();
assert!(params[0].host_data[0] < 5.0);
}
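    #[test]
    fn test_adam_amsgrad_first_step() {
        // AMSGrad tracks the running maximum of v_hat; on the first step
        // that maximum equals v_hat itself, so the update matches plain Adam.
        let mut opt = Adam::new(0.01).with_amsgrad(true);
        let mut params = vec![make_param(&[5.0], &[1.0])];
        opt.step(&mut params).unwrap();
        assert!((params[0].host_data[0] - 4.99).abs() < 1e-6);
    }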
#[test]
fn test_lamb_convergence() {
let mut opt = Lamb::new(0.1);
let mut params = vec![make_param(&[3.0], &[6.0])];
for _ in 0..100 {
opt.step(&mut params).unwrap();
let new_grad = 2.0 * params[0].host_data[0];
params[0].zero_grad();
let g = GpuTensor::from_host_f64(&[new_grad], &[1], 0).unwrap();
params[0].accumulate_grad(&g).unwrap();
}
assert!(params[0].host_data[0].abs() < 1.0);
}
#[test]
fn test_sgd_weight_decay() {
let mut opt = Sgd::new(0.1).with_weight_decay(0.01);
let mut params = vec![make_param(&[10.0], &[0.0])];
opt.step(&mut params).unwrap();
assert!((params[0].host_data[0] - 9.99).abs() < 1e-10);
}
#[test]
fn test_no_grad_no_update() {
let mut opt = Sgd::new(0.1);
let mut p = GpuTensor::from_host_f64(&[5.0], &[1], 0).unwrap();
p.set_requires_grad(true);
let mut params = vec![p];
opt.step(&mut params).unwrap();
assert!((params[0].host_data[0] - 5.0).abs() < 1e-10);
}
}