pub mod adagrad;
pub mod adam;
pub mod adamw;
pub mod momentum_sgd;
pub mod sgd;
pub mod training;
use crate::evaluation::Feeder;
use crate::tensor::Tensor;
use crate::variable::VariableNamespace;
use crate::{Context, Float, NdArray};
pub use adagrad::AdaGrad;
pub use adam::Adam;
pub use adamw::AdamW;
pub use momentum_sgd::MomentumSGD;
pub use sgd::SGD;
pub use training::{TrainingConfig, TrainingLoop};
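/// Computes gradients of the given losses with respect to every variable
/// registered in `namespace`.
///
/// Each loss is reduced to a scalar with `sum_all` before differentiation.
/// Returns the variables that received a gradient, paired element-wise with
/// those gradients; variables without a gradient are skipped.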
#[allow(dead_code)]
pub fn grad_helper<'g, A, F: Float>(
losses: &[A],
namespace: &'g VariableNamespace<F>,
) -> (Vec<Tensor<'g, F>>, Vec<Tensor<'g, F>>)
where
A: AsRef<Tensor<'g, F>> + Copy,
{
use crate::tensor_ops as T;
assert!(!losses.is_empty(), "grad_helper requires at least one loss tensor");
let g = losses[0].as_ref().graph;
// Reduce each loss to a scalar so its gradient is well-defined
let ys: Vec<_> = losses.iter().map(|y| T::sum_all(y)).collect();
// Collect every variable tensor registered under the namespace
let xs: Vec<_> = g.var_tensors_by_name(namespace).map(|(_name, var)| var).collect();
let mut gradients = crate::gradient::compute_gradients(ys.as_slice(), xs.as_slice(), None, g);
let mut vars = Vec::with_capacity(xs.len());
let mut grads = Vec::with_capacity(xs.len());
for x in xs.iter() {
let gx = gradients.extract_grad(x);
if let Some(a) = gx {
vars.push(*x);
grads.push(a);
}
}
(vars, grads)
}
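/// Graph-based optimizer interface.
///
/// Implementors build one update tensor per variable from the variable's
/// gradient; the provided methods evaluate those tensors and write the
/// results back into the variable arrays.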
pub trait Optimizer<F: Float> {
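/// Builds one update tensor per variable from the corresponding gradient.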
fn compute_updates<'g, A, B>(
&self,
variables: &[A],
grads: &[B],
g: &'g Context<F>,
) -> Vec<Tensor<'g, F>>
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy;
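/// Computes the update tensors, evaluates them with the given feeder, and
/// writes the evaluated results back into the variable arrays.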
fn update<'g, A, B>(&self, variables: &[A], grads: &[B], g: &'g Context<F>, feeder: Feeder<F>)
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy,
{
let update_ops = self.compute_updates(variables, grads, g);
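// Evaluate every update op in one pass, feeding placeholders through the feeder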
let results = g.evaluator().set_feeder(feeder).extend(&update_ops).run();
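// Write each successfully evaluated update back into its variable's array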
for (var_tensor, result) in variables.iter().zip(results.iter()) {
let var_tensor_ref = var_tensor.as_ref();
if let Some(var_id) = var_tensor_ref.get_variable_id() {
if let Ok(new_value) = result {
if let Some(var_cell) = g.env().get_array_by_id(var_id) {
*var_cell.borrow_mut() = new_value.clone();
}
}
}
}
}
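/// Returns the update tensors without applying them; this is equivalent to
/// calling `compute_updates` directly.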
fn get_update_tensors<'g, A, B>(
&self,
variables: &[A],
grads: &[B],
g: &'g Context<F>,
) -> Vec<Tensor<'g, F>>
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy,
{
self.compute_updates(variables, grads, g)
}
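/// Writes already-evaluated update arrays back into the corresponding
/// variables in `env`.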
fn apply_update_tensors<'g, A>(
variables: &[A],
evaluated_updates: &[NdArray<F>],
env: &crate::VariableEnvironment<F>,
) where
A: AsRef<Tensor<'g, F>> + Copy,
{
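// Overwrite each variable's stored array with its evaluated update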
for (var_tensor, new_value) in variables.iter().zip(evaluated_updates.iter()) {
let var_tensor_ref = var_tensor.as_ref();
if let Some(var_id) = var_tensor_ref.get_variable_id() {
if let Some(var_cell) = env.get_array_by_id(var_id) {
*var_cell.borrow_mut() = new_value.clone();
}
}
}
}
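/// Combines all update tensors into a single tensor with `add_n`, so a
/// single evaluation triggers every update.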
fn get_update_op<'g, A, B>(
&self,
variables: &[A],
grads: &[B],
g: &'g Context<F>,
) -> Tensor<'g, F>
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy,
{
crate::tensor_ops::add_n(&self.compute_updates(variables, grads, g))
}
}
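/// Stateful, functional-style optimizer interface: `step` returns new
/// parameter tensors instead of mutating variables in place.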
pub trait FunctionalOptimizer<F: Float> {
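/// Returns one updated parameter tensor per input parameter.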
fn step<'g, A, B>(
&mut self,
params: &[A],
grads: &[B],
ctx: &'g Context<F>,
) -> Vec<Tensor<'g, F>>
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy;
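/// Returns the current learning rate.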
fn learning_rate(&self) -> F;
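/// Sets the learning rate.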
fn set_learning_rate(&mut self, lr: F);
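/// Returns the number of optimization steps taken so far.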
fn step_count(&self) -> usize;
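/// Resets the step count and any accumulated optimizer state.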
fn reset(&mut self);
}
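/// Functional stochastic gradient descent: each step returns
/// `param - lr * grad` for every parameter.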
#[derive(Debug, Clone)]
pub struct FunctionalSGD<F> {
pub lr: F,
step_count: usize,
}
impl<F: Float> FunctionalSGD<F> {
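/// Creates a new SGD optimizer with the given learning rate.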
pub fn new(lr: F) -> Self {
Self { lr, step_count: 0 }
}
}
impl<F: Float> FunctionalOptimizer<F> for FunctionalSGD<F> {
fn step<'g, A, B>(
&mut self,
params: &[A],
grads: &[B],
ctx: &'g Context<F>,
) -> Vec<Tensor<'g, F>>
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy,
{
use crate::tensor_ops as T;
assert_eq!(
params.len(),
grads.len(),
"params and grads must have same length"
);
self.step_count += 1;
let lr_tensor = T::scalar(self.lr, ctx);
params
.iter()
.zip(grads.iter())
.map(|(p, g)| {
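// Vanilla SGD update: param - lr * grad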
let scaled_grad = T::mul(lr_tensor, g.as_ref());
T::sub(p.as_ref(), scaled_grad)
})
.collect()
}
fn learning_rate(&self) -> F {
self.lr
}
fn set_learning_rate(&mut self, lr: F) {
self.lr = lr;
}
fn step_count(&self) -> usize {
self.step_count
}
fn reset(&mut self) {
self.step_count = 0;
}
}
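/// Functional Adam-style optimizer with configurable `beta1`, `beta2`, and `eps`.
///
/// The current `step` builds bias-corrected estimates from the gradient of the
/// present step only; the `m` and `v` buffers are allocated but not yet used
/// for running moment accumulation.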
#[derive(Debug, Clone)]
pub struct FunctionalAdam<F> {
pub lr: F,
pub beta1: F,
pub beta2: F,
pub eps: F,
step_count: usize,
m: Vec<NdArray<F>>,
v: Vec<NdArray<F>>,
initialized: bool,
}
impl<F: Float> FunctionalAdam<F> {
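/// Creates an Adam optimizer with the given learning rate and the usual
/// defaults: beta1 = 0.9, beta2 = 0.999, eps = 1e-8.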
pub fn new(lr: F) -> Self {
Self {
lr,
beta1: F::from(0.9).expect("Failed to convert beta1"),
beta2: F::from(0.999).expect("Failed to convert beta2"),
eps: F::from(1e-8).expect("Failed to convert eps"),
step_count: 0,
m: Vec::new(),
v: Vec::new(),
initialized: false,
}
}
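/// Creates an Adam optimizer with explicitly specified hyperparameters.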
pub fn with_params(lr: F, beta1: F, beta2: F, eps: F) -> Self {
Self {
lr,
beta1,
beta2,
eps,
step_count: 0,
m: Vec::new(),
v: Vec::new(),
initialized: false,
}
}
}
impl<F: Float> FunctionalOptimizer<F> for FunctionalAdam<F> {
fn step<'g, A, B>(
&mut self,
params: &[A],
grads: &[B],
ctx: &'g Context<F>,
) -> Vec<Tensor<'g, F>>
where
A: AsRef<Tensor<'g, F>> + Copy,
B: AsRef<Tensor<'g, F>> + Copy,
{
use crate::tensor_ops as T;
assert_eq!(
params.len(),
grads.len(),
"params and grads must have same length"
);
self.step_count += 1;
let one = F::one();
// Bias-correction factors 1 - beta^t for the current step t
let bias_correction1 = one - self.beta1.powi(self.step_count as i32);
let bias_correction2 = one - self.beta2.powi(self.step_count as i32);
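// Lazily allocate moment buffers (currently placeholders; the graph update below does not read them)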
if !self.initialized {
self.m = vec![NdArray::zeros(scirs2_core::ndarray::IxDyn(&[])); params.len()];
self.v = vec![NdArray::zeros(scirs2_core::ndarray::IxDyn(&[])); params.len()];
self.initialized = true;
}
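// Scalar tensors shared across all parameter updates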
let lr_tensor = T::scalar(self.lr, ctx);
let one_minus_beta1 = T::scalar(one - self.beta1, ctx);
let one_minus_beta2 = T::scalar(one - self.beta2, ctx);
let eps_tensor = T::scalar(self.eps, ctx);
let bias_correction1_tensor = T::scalar(bias_correction1, ctx);
let bias_correction2_tensor = T::scalar(bias_correction2, ctx);
params
.iter()
.zip(grads.iter())
.map(|(p, g)| {
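// Bias-corrected first and second moment estimates built from the current gradient only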
let grad_sq = T::mul(g.as_ref(), g.as_ref());
let grad_scaled = T::mul(one_minus_beta1, g.as_ref());
let m_update = T::div(grad_scaled, bias_correction1_tensor);
let grad_sq_scaled = T::mul(one_minus_beta2, grad_sq);
let v_update = T::div(grad_sq_scaled, bias_correction2_tensor);
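// Update: param - lr * m_hat / (sqrt(v_hat) + eps)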
let sqrt_v = T::sqrt(v_update);
let denom = T::add(sqrt_v, eps_tensor);
let step = T::mul(lr_tensor, m_update);
let scaled_step = T::div(step, denom);
T::sub(p.as_ref(), scaled_step)
})
.collect()
}
fn learning_rate(&self) -> F {
self.lr
}
fn set_learning_rate(&mut self, lr: F) {
self.lr = lr;
}
fn step_count(&self) -> usize {
self.step_count
}
fn reset(&mut self) {
self.step_count = 0;
self.m.clear();
self.v.clear();
self.initialized = false;
}
}