use crate::common::{OptimizerState, StateMemoryStats};
use crate::traits::StatefulOptimizer;
use std::collections::HashMap;
use trustformers_core::errors::{Result, TrustformersError};
use trustformers_core::tensor::Tensor;
use trustformers_core::traits::Optimizer;
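/// Configuration for the [`Adan`] optimizer.
///
/// `beta1`, `beta2` and `beta3` are the EMA coefficients for the gradient,
/// the squared gradient and the gradient difference, respectively.
/// `decoupled_weight_decay` selects AdamW-style decay applied directly to the
/// parameters instead of being added to the gradient.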
#[derive(Debug, Clone)]
pub struct AdanConfig {
pub lr: f32,
pub beta1: f32,
pub beta2: f32,
pub beta3: f32,
pub eps: f32,
pub weight_decay: f32,
pub bias_correction: bool,
pub decoupled_weight_decay: bool,
}
impl Default for AdanConfig {
fn default() -> Self {
Self {
lr: 1e-3,
beta1: 0.98,
beta2: 0.92,
beta3: 0.99,
eps: 1e-8,
weight_decay: 0.02,
bias_correction: true,
decoupled_weight_decay: true,
}
}
}
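/// Adan (Adaptive Nesterov Momentum) optimizer.
///
/// Keeps four per-parameter state buffers: an EMA of the gradient
/// (`exp_avg`), an EMA of the squared gradient (`exp_avg_sq`), an EMA of the
/// gradient difference (`exp_avg_diff`), and the previous gradient
/// (`prev_grad`). State is keyed by the parameter buffer's pointer address,
/// so a parameter must stay at a stable address across `update` calls.
///
/// A minimal usage sketch (not compiled as a doctest; tensor construction is
/// assumed to follow the `Tensor::new` pattern used in the tests below):
///
/// ```ignore
/// let mut opt = Adan::for_large_models(1e-3, 0.02);
/// let mut param = Tensor::new(vec![1.0, 2.0, 3.0])?;
/// let grad = Tensor::new(vec![0.1, 0.2, 0.3])?;
/// opt.update(&mut param, &grad)?;
/// ```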
#[derive(Debug)]
pub struct Adan {
config: AdanConfig,
state: OptimizerState,
exp_avg: HashMap<String, Vec<f32>>,
exp_avg_sq: HashMap<String, Vec<f32>>,
exp_avg_diff: HashMap<String, Vec<f32>>,
prev_grad: HashMap<String, Vec<f32>>,
step_count: usize,
}
impl Adan {
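/// Creates a new Adan optimizer with the given hyperparameters.
/// Bias correction and decoupled weight decay are enabled by default.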
pub fn new(lr: f32, beta1: f32, beta2: f32, beta3: f32, eps: f32, weight_decay: f32) -> Self {
let config = AdanConfig {
lr,
beta1,
beta2,
beta3,
eps,
weight_decay,
bias_correction: true,
decoupled_weight_decay: true,
};
Self::with_config(config)
}
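/// Creates an Adan optimizer from an explicit [`AdanConfig`].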
pub fn with_config(config: AdanConfig) -> Self {
Self {
config,
state: OptimizerState::new(),
exp_avg: HashMap::new(),
exp_avg_sq: HashMap::new(),
exp_avg_diff: HashMap::new(),
prev_grad: HashMap::new(),
step_count: 0,
}
}
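/// Preset using the default betas (0.98, 0.92, 0.99), intended for large-model training.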
pub fn for_large_models(lr: f32, weight_decay: f32) -> Self {
Self::new(lr, 0.98, 0.92, 0.99, 1e-8, weight_decay)
}
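/// Preset using slower-moving betas (0.9, 0.999, 0.9999), intended for vision models.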
pub fn for_vision(lr: f32, weight_decay: f32) -> Self {
Self::new(lr, 0.9, 0.999, 0.9999, 1e-8, weight_decay)
}
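/// Preset with Adam-style betas (0.9, 0.999) and `beta3 = 0.999`.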
pub fn adam_like(lr: f32, weight_decay: f32) -> Self {
Self::new(lr, 0.9, 0.999, 0.999, 1e-8, weight_decay)
}
pub fn get_lr(&self) -> f32 {
self.config.lr
}
pub fn set_lr(&mut self, lr: f32) {
self.config.lr = lr;
}
pub fn config(&self) -> &AdanConfig {
&self.config
}
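/// Returns per-buffer element counts and the total memory used by the optimizer state.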
pub fn memory_stats(&self) -> StateMemoryStats {
// Each state buffer stores one f32 per parameter element.
let momentum_elements: usize = self.exp_avg.values().map(|v| v.len()).sum();
let variance_elements: usize = self.exp_avg_sq.values().map(|v| v.len()).sum();
let third_moment_elements: usize = self.exp_avg_diff.values().map(|v| v.len()).sum();
let prev_grad_elements: usize = self.prev_grad.values().map(|v| v.len()).sum();
let total_elements =
momentum_elements + variance_elements + third_moment_elements + prev_grad_elements;
StateMemoryStats {
momentum_elements,
variance_elements,
third_moment_elements,
total_bytes: total_elements * std::mem::size_of::<f32>(),
num_parameters: momentum_elements,
}
}
}
impl Optimizer for Adan {
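/// Applies one Adan update to `parameter` in place. Only `Tensor::F32` is
/// supported; other dtypes return an error. Each call advances the shared
/// step counter used for bias correction.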
fn update(&mut self, parameter: &mut Tensor, grad: &Tensor) -> Result<()> {
self.step_count += 1;
match (parameter, grad) {
(Tensor::F32(param), Tensor::F32(grad_data)) => {
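// Optimizer state is keyed by the parameter buffer's pointer address and allocated lazily.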
let param_id = format!("{:p}", param.as_ptr());
let size = grad_data.len();
let exp_avg =
self.exp_avg.entry(param_id.clone()).or_insert_with(|| vec![0.0; size]);
let exp_avg_sq =
self.exp_avg_sq.entry(param_id.clone()).or_insert_with(|| vec![0.0; size]);
let exp_avg_diff =
self.exp_avg_diff.entry(param_id.clone()).or_insert_with(|| vec![0.0; size]);
let prev_grad =
self.prev_grad.entry(param_id.clone()).or_insert_with(|| vec![0.0; size]);
if exp_avg.len() != size
|| exp_avg_sq.len() != size
|| exp_avg_diff.len() != size
|| prev_grad.len() != size
{
return Err(TrustformersError::tensor_op_error(
"Adan buffer size mismatch",
"Adan::update",
));
}
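// Bias-correction factors (1 - beta^t), or 1.0 when bias correction is disabled.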
let bias_correction1 = if self.config.bias_correction {
1.0 - self.config.beta1.powi(self.step_count as i32)
} else {
1.0
};
let bias_correction2 = if self.config.bias_correction {
1.0 - self.config.beta2.powi(self.step_count as i32)
} else {
1.0
};
let bias_correction3 = if self.config.bias_correction {
1.0 - self.config.beta3.powi(self.step_count as i32)
} else {
1.0
};
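// Element-wise update: m = gradient EMA, v = squared-gradient EMA,
// n = gradient-difference EMA, pg = previous gradient.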
for ((&g, p), ((m, v), (n, pg))) in grad_data.iter().zip(param.iter_mut()).zip(
exp_avg
.iter_mut()
.zip(exp_avg_sq.iter_mut())
.zip(exp_avg_diff.iter_mut().zip(prev_grad.iter_mut())),
) {
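// Decoupled (AdamW-style) weight decay shrinks the parameter directly before the update.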
if self.config.decoupled_weight_decay && self.config.weight_decay > 0.0 {
*p *= 1.0 - self.config.lr * self.config.weight_decay;
}
let grad_with_decay =
if !self.config.decoupled_weight_decay && self.config.weight_decay > 0.0 {
g + self.config.weight_decay * (*p)
} else {
g
};
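// Gradient difference since the previous step, followed by the three EMA updates.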
let grad_diff = grad_with_decay - *pg;
*m = self.config.beta1 * (*m) + (1.0 - self.config.beta1) * grad_with_decay;
*v = self.config.beta2 * (*v)
+ (1.0 - self.config.beta2) * grad_with_decay * grad_with_decay;
*n = self.config.beta3 * (*n) + (1.0 - self.config.beta3) * grad_diff;
*pg = grad_with_decay;
let m_hat = *m / bias_correction1;
let v_hat = *v / bias_correction2;
let n_hat = *n / bias_correction3;
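// Combine the first moment with a beta2-scaled gradient-difference term,
// then take an adaptive step scaled by the second-moment estimate.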
let update_direction = m_hat + self.config.beta2 * n_hat;
*p -= self.config.lr * update_direction / (v_hat.sqrt() + self.config.eps);
}
Ok(())
},
_ => Err(TrustformersError::tensor_op_error(
"Adan optimizer only supports F32 tensors",
"Adan::update",
)),
}
}
fn zero_grad(&mut self) {
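// No-op: this optimizer does not track gradient buffers to clear.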
}
fn step(&mut self) {
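// No-op: the step counter advances inside `update`.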
}
fn get_lr(&self) -> f32 {
self.config.lr
}
fn set_lr(&mut self, lr: f32) {
self.config.lr = lr;
}
}
impl StatefulOptimizer for Adan {
type Config = AdanConfig;
type State = OptimizerState;
fn state(&self) -> &OptimizerState {
&self.state
}
fn state_mut(&mut self) -> &mut OptimizerState {
&mut self.state
}
fn config(&self) -> &Self::Config {
&self.config
}
fn memory_usage(&self) -> StateMemoryStats {
self.memory_stats()
}
fn reset_state(&mut self) {
self.exp_avg.clear();
self.exp_avg_sq.clear();
self.exp_avg_diff.clear();
self.prev_grad.clear();
self.step_count = 0;
self.state = OptimizerState::new();
}
fn num_parameters(&self) -> usize {
self.exp_avg.values().map(|v| v.len()).sum()
}
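/// Serializes the optimizer state. Buffers are stored under keys of the form
/// `exp_avg_<id>`, `exp_avg_sq_<id>`, `exp_avg_diff_<id>` and `prev_grad_<id>`,
/// plus a single-element `step_count` tensor.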
fn state_dict(&self) -> Result<HashMap<String, Tensor>> {
let mut dict = HashMap::new();
for (key, value) in &self.exp_avg {
dict.insert(format!("exp_avg_{}", key), Tensor::new(value.clone())?);
}
for (key, value) in &self.exp_avg_sq {
dict.insert(format!("exp_avg_sq_{}", key), Tensor::new(value.clone())?);
}
for (key, value) in &self.exp_avg_diff {
dict.insert(format!("exp_avg_diff_{}", key), Tensor::new(value.clone())?);
}
for (key, value) in &self.prev_grad {
dict.insert(format!("prev_grad_{}", key), Tensor::new(value.clone())?);
}
dict.insert(
"step_count".to_string(),
Tensor::new(vec![self.step_count as f32])?,
);
Ok(dict)
}
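/// Restores optimizer state previously produced by `state_dict`.
/// Unknown keys and non-F32 tensors are ignored.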
fn load_state_dict(&mut self, state_dict: HashMap<String, Tensor>) -> Result<()> {
if let Some(Tensor::F32(data)) = state_dict.get("step_count") {
if !data.is_empty() {
self.step_count = data[0] as usize;
}
}
// Check longer prefixes first so that "exp_avg_sq_*" and "exp_avg_diff_*"
// keys are not captured by the shorter "exp_avg_" prefix.
for (key, value) in &state_dict {
if let Tensor::F32(data) = value {
let data = data.as_slice().expect("array must have contiguous layout").to_vec();
if let Some(param_key) = key.strip_prefix("exp_avg_sq_") {
self.exp_avg_sq.insert(param_key.to_string(), data);
} else if let Some(param_key) = key.strip_prefix("exp_avg_diff_") {
self.exp_avg_diff.insert(param_key.to_string(), data);
} else if let Some(param_key) = key.strip_prefix("prev_grad_") {
self.prev_grad.insert(param_key.to_string(), data);
} else if let Some(param_key) = key.strip_prefix("exp_avg_") {
self.exp_avg.insert(param_key.to_string(), data);
}
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use trustformers_core::tensor::Tensor;
#[test]
fn test_adan_creation() {
let optimizer = Adan::new(1e-3, 0.98, 0.92, 0.99, 1e-8, 0.02);
assert_eq!(optimizer.get_lr(), 1e-3);
assert_eq!(optimizer.config().beta1, 0.98);
assert_eq!(optimizer.config().beta2, 0.92);
assert_eq!(optimizer.config().beta3, 0.99);
assert_eq!(optimizer.config().eps, 1e-8);
assert_eq!(optimizer.config().weight_decay, 0.02);
}
#[test]
fn test_adan_for_large_models() {
let optimizer = Adan::for_large_models(1e-3, 0.02);
assert_eq!(optimizer.get_lr(), 1e-3);
assert_eq!(optimizer.config().beta1, 0.98);
assert_eq!(optimizer.config().beta2, 0.92);
assert_eq!(optimizer.config().beta3, 0.99);
assert_eq!(optimizer.config().weight_decay, 0.02);
}
#[test]
fn test_adan_for_vision() {
let optimizer = Adan::for_vision(1e-3, 0.02);
assert_eq!(optimizer.get_lr(), 1e-3);
assert_eq!(optimizer.config().beta1, 0.9);
assert_eq!(optimizer.config().beta2, 0.999);
assert_eq!(optimizer.config().beta3, 0.9999);
}
#[test]
fn test_adan_adam_like() {
let optimizer = Adan::adam_like(1e-3, 0.02);
assert_eq!(optimizer.get_lr(), 1e-3);
assert_eq!(optimizer.config().beta1, 0.9);
assert_eq!(optimizer.config().beta2, 0.999);
assert_eq!(optimizer.config().beta3, 0.999);
}
#[test]
fn test_adan_lr_setter() {
let mut optimizer = Adan::new(1e-3, 0.98, 0.92, 0.99, 1e-8, 0.02);
optimizer.set_lr(2e-3);
assert_eq!(optimizer.get_lr(), 2e-3);
}
#[test]
fn test_adan_memory_stats() {
let optimizer = Adan::new(1e-3, 0.98, 0.92, 0.99, 1e-8, 0.02);
let stats = optimizer.memory_stats();
assert_eq!(stats.num_parameters, 0);
assert_eq!(stats.total_bytes, 0);
}
#[test]
fn test_adan_state_dict() {
let optimizer = Adan::new(1e-3, 0.98, 0.92, 0.99, 1e-8, 0.02);
let state_dict = optimizer.state_dict();
assert!(state_dict.expect("Operation failed in test").contains_key("step_count"));
}
#[test]
fn test_adan_load_state_dict() {
let mut optimizer = Adan::new(1e-3, 0.98, 0.92, 0.99, 1e-8, 0.02);
let mut state_dict = HashMap::new();
state_dict.insert(
"step_count".to_string(),
Tensor::new(vec![10.0]).expect("Failed to create tensor"),
);
optimizer.load_state_dict(state_dict).expect("Failed to load state dict");
assert_eq!(optimizer.step_count, 10);
}
#[test]
fn test_adan_with_config() {
let config = AdanConfig {
lr: 2e-3,
beta1: 0.95,
beta2: 0.90,
beta3: 0.95,
eps: 1e-7,
weight_decay: 0.01,
bias_correction: false,
decoupled_weight_decay: false,
};
let optimizer = Adan::with_config(config);
assert_eq!(optimizer.get_lr(), 2e-3);
assert_eq!(optimizer.config().beta1, 0.95);
assert!(!optimizer.config().bias_correction);
assert!(!optimizer.config().decoupled_weight_decay);
}
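#[test]
fn test_adan_update_allocates_state() {
// Added sketch: exercises the F32 update path end-to-end; asserts only that
// state is allocated and the step counter advances, not exact values.
let mut optimizer = Adan::new(1e-3, 0.98, 0.92, 0.99, 1e-8, 0.0);
let mut param = Tensor::new(vec![1.0, 2.0, 3.0]).expect("Failed to create tensor");
let grad = Tensor::new(vec![0.1, 0.2, 0.3]).expect("Failed to create tensor");
optimizer.update(&mut param, &grad).expect("update should succeed");
assert_eq!(optimizer.step_count, 1);
assert_eq!(optimizer.num_parameters(), 3);
}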
}