//! Sophia optimizer: momentum updates preconditioned by an exponentially
//! averaged diagonal Hessian estimate, with per-element update clipping.
use crate::common::{OptimizerState, StateMemoryStats};
use crate::traits::StatefulOptimizer;
use std::collections::HashMap;
use trustformers_core::errors::{Result, TrustformersError};
use trustformers_core::tensor::Tensor;
use trustformers_core::traits::Optimizer;
/// Hyperparameters for the Sophia optimizer.
#[derive(Debug, Clone)]
pub struct SophiaConfig {
    /// Learning rate.
    pub lr: f32,
    /// Exponential decay rates for the momentum EMA (`betas.0`) and the
    /// Hessian-diagonal EMA (`betas.1`).
    pub betas: (f32, f32),
    /// Small constant added to the denominator for numerical stability.
    pub eps: f32,
    /// Scale applied to the Hessian diagonal in the update denominator.
    pub rho: f32,
    /// Per-element clipping threshold for the preconditioned update.
    pub gamma: f32,
    /// Decoupled weight-decay coefficient.
    pub weight_decay: f32,
    /// Refresh the Hessian-diagonal estimate every this many steps.
    pub hessian_update_freq: usize,
}
impl Default for SophiaConfig {
fn default() -> Self {
Self {
lr: 1e-4,
betas: (0.965, 0.99),
eps: 1e-8,
rho: 0.04,
gamma: 0.01,
weight_decay: 0.01,
            hessian_update_freq: 10,
        }
}
}
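/// Sophia: a second-order optimizer that preconditions clipped, bias-corrected
/// momentum updates with an EMA of a finite-difference Hessian-diagonal
/// estimate, refreshed every `hessian_update_freq` steps.
///
/// A minimal usage sketch (hedged: it assumes `Tensor::new(Vec<f32>)` yields
/// the `Tensor::F32` variant, as the `state_dict` round-trip below implies):
///
/// ```ignore
/// let mut opt = Sophia::default();
/// let mut param = Tensor::new(vec![1.0f32, 2.0])?;
/// let grad = Tensor::new(vec![0.1f32, -0.2])?;
/// opt.update(&mut param, &grad)?;
/// opt.step();
/// ```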
#[derive(Debug)]
pub struct Sophia {
    config: SophiaConfig,
    state: OptimizerState,
    /// First-moment (momentum) EMA per parameter tensor.
    momentum: HashMap<String, Vec<f32>>,
    /// EMA of the estimated Hessian diagonal per parameter tensor.
    hessian_diag: HashMap<String, Vec<f32>>,
    /// Gradient from the previous step, used for the finite-difference
    /// Hessian estimate.
    prev_grad: HashMap<String, Vec<f32>>,
}
impl Sophia {
    /// Creates a Sophia optimizer with default weight decay (0.01) and
    /// Hessian update frequency (10).
    pub fn new(lr: f32, betas: (f32, f32), eps: f32, rho: f32, gamma: f32) -> Self {
Self {
config: SophiaConfig {
lr,
betas,
eps,
rho,
gamma,
weight_decay: 0.01,
hessian_update_freq: 10,
},
state: OptimizerState::new(),
momentum: HashMap::new(),
hessian_diag: HashMap::new(),
prev_grad: HashMap::new(),
}
}
    /// Creates a Sophia optimizer with full control over every hyperparameter.
    pub fn with_config(
lr: f32,
betas: (f32, f32),
eps: f32,
rho: f32,
gamma: f32,
weight_decay: f32,
hessian_update_freq: usize,
) -> Self {
Self {
config: SophiaConfig {
lr,
betas,
eps,
rho,
gamma,
weight_decay,
hessian_update_freq,
},
state: OptimizerState::new(),
momentum: HashMap::new(),
hessian_diag: HashMap::new(),
prev_grad: HashMap::new(),
}
}
    /// Creates a Sophia optimizer from an explicit configuration.
    pub fn from_config(config: SophiaConfig) -> Self {
Self {
config,
state: OptimizerState::new(),
momentum: HashMap::new(),
hessian_diag: HashMap::new(),
prev_grad: HashMap::new(),
}
}
    /// Clamps a raw preconditioned update to `[-gamma, gamma]`.
    #[allow(dead_code)]
    fn clip_update(&self, update: f32) -> f32 {
update.clamp(-self.config.gamma, self.config.gamma)
}
}
impl Optimizer for Sophia {
fn update(&mut self, parameter: &mut Tensor, grad: &Tensor) -> Result<()> {
match (parameter, grad) {
(Tensor::F32(param), Tensor::F32(grad_arr)) => {
                // Optimizer state is keyed by the parameter's data pointer,
                // which assumes the tensor's storage address stays stable
                // across update calls.
                let param_id = format!("{:p}", param.as_ptr());
                let size = grad_arr.len();
                let momentum =
                    self.momentum.entry(param_id.clone()).or_insert_with(|| vec![0.0; size]);
                let hessian_diag =
                    self.hessian_diag.entry(param_id.clone()).or_insert_with(|| vec![1e-4; size]);
                let prev_grad =
                    self.prev_grad.entry(param_id.clone()).or_insert_with(|| vec![0.0; size]);
                if momentum.len() != size || hessian_diag.len() != size || prev_grad.len() != size
                {
                    return Err(TrustformersError::tensor_op_error(
                        "Sophia state buffer size mismatch",
                        "Sophia::update",
                    ));
                }
                // A zero frequency disables Hessian refreshes:
                // is_multiple_of(0) is false for any step > 0.
                let should_update_hessian = self.state.step > 0
                    && self.state.step.is_multiple_of(self.config.hessian_update_freq);
                // Bias correction is constant across elements, so hoist it out
                // of the element-wise loop.
                let step = (self.state.step + 1) as f32;
                let bias_correction = 1.0 - self.config.betas.0.powf(step);
                for (((p, &g), m), (h, prev_g)) in param
                    .iter_mut()
                    .zip(grad_arr.iter())
                    .zip(momentum.iter_mut())
                    .zip(hessian_diag.iter_mut().zip(prev_grad.iter_mut()))
                {
                    if should_update_hessian {
                        // Finite-difference curvature estimate: |Δgradient| / lr
                        // approximates the Hessian diagonal, smoothed by an EMA.
                        let hessian_estimate = (g - *prev_g).abs() / self.config.lr.max(1e-8);
                        *h = self.config.betas.1 * *h
                            + (1.0 - self.config.betas.1) * hessian_estimate;
                    }
                    // Decoupled weight decay, applied directly to the parameter.
                    if self.config.weight_decay != 0.0 {
                        *p -= self.config.lr * self.config.weight_decay * *p;
                    }
                    // Momentum EMA, then bias-correct it.
                    *m = self.config.betas.0 * *m + (1.0 - self.config.betas.0) * g;
                    let corrected_momentum = *m / bias_correction;
                    // Precondition by the Hessian diagonal and clip per element,
                    // bounding every raw update to [-gamma, gamma].
                    let denom = self.config.rho * *h + self.config.eps;
                    let raw_update = corrected_momentum / denom;
                    let clipped_update = raw_update.clamp(-self.config.gamma, self.config.gamma);
                    *p -= self.config.lr * clipped_update;
                    *prev_g = g;
                }
Ok(())
},
_ => Err(TrustformersError::tensor_op_error(
"Unsupported tensor types for Sophia",
"Sophia::update",
)),
}
}
    fn zero_grad(&mut self) {
        // Gradients are owned and cleared by the caller; Sophia keeps no
        // gradient buffers of its own.
    }
fn step(&mut self) {
self.state.step += 1;
}
fn get_lr(&self) -> f32 {
self.config.lr
}
fn set_lr(&mut self, lr: f32) {
self.config.lr = lr;
}
}
impl StatefulOptimizer for Sophia {
type Config = SophiaConfig;
type State = OptimizerState;
fn config(&self) -> &Self::Config {
&self.config
}
fn state(&self) -> &Self::State {
&self.state
}
fn state_mut(&mut self) -> &mut Self::State {
&mut self.state
}
fn state_dict(&self) -> Result<HashMap<String, Tensor>> {
let mut state_dict = HashMap::new();
state_dict.insert("lr".to_string(), Tensor::new(vec![self.config.lr])?);
state_dict.insert("beta1".to_string(), Tensor::new(vec![self.config.betas.0])?);
state_dict.insert("beta2".to_string(), Tensor::new(vec![self.config.betas.1])?);
state_dict.insert("eps".to_string(), Tensor::new(vec![self.config.eps])?);
state_dict.insert("rho".to_string(), Tensor::new(vec![self.config.rho])?);
state_dict.insert("gamma".to_string(), Tensor::new(vec![self.config.gamma])?);
        state_dict.insert(
            "weight_decay".to_string(),
            Tensor::new(vec![self.config.weight_decay])?,
        );
        // hessian_update_freq is stored as f32 alongside the other scalars so
        // that a round-trip through load_state_dict restores it.
        state_dict.insert(
            "hessian_update_freq".to_string(),
            Tensor::new(vec![self.config.hessian_update_freq as f32])?,
        );
        state_dict.insert(
            "step".to_string(),
            Tensor::new(vec![self.state.step as f32])?,
        );
for (param_id, momentum) in &self.momentum {
state_dict.insert(
format!("momentum_{}", param_id),
Tensor::new(momentum.clone())?,
);
}
for (param_id, hessian_diag) in &self.hessian_diag {
state_dict.insert(
format!("hessian_diag_{}", param_id),
Tensor::new(hessian_diag.clone())?,
);
}
for (param_id, prev_grad) in &self.prev_grad {
state_dict.insert(
format!("prev_grad_{}", param_id),
Tensor::new(prev_grad.clone())?,
);
}
Ok(state_dict)
}
fn load_state_dict(&mut self, state: HashMap<String, Tensor>) -> Result<()> {
if let Some(lr_tensor) = state.get("lr") {
if let Ok(lr_vec) = lr_tensor.data() {
if !lr_vec.is_empty() {
self.config.lr = lr_vec[0];
}
}
}
if let Some(beta1_tensor) = state.get("beta1") {
if let Ok(beta1_vec) = beta1_tensor.data() {
if !beta1_vec.is_empty() {
self.config.betas.0 = beta1_vec[0];
}
}
}
if let Some(beta2_tensor) = state.get("beta2") {
if let Ok(beta2_vec) = beta2_tensor.data() {
if !beta2_vec.is_empty() {
self.config.betas.1 = beta2_vec[0];
}
}
}
if let Some(eps_tensor) = state.get("eps") {
if let Ok(eps_vec) = eps_tensor.data() {
if !eps_vec.is_empty() {
self.config.eps = eps_vec[0];
}
}
}
if let Some(rho_tensor) = state.get("rho") {
if let Ok(rho_vec) = rho_tensor.data() {
if !rho_vec.is_empty() {
self.config.rho = rho_vec[0];
}
}
}
if let Some(gamma_tensor) = state.get("gamma") {
if let Ok(gamma_vec) = gamma_tensor.data() {
if !gamma_vec.is_empty() {
self.config.gamma = gamma_vec[0];
}
}
}
if let Some(weight_decay_tensor) = state.get("weight_decay") {
if let Ok(weight_decay_vec) = weight_decay_tensor.data() {
if !weight_decay_vec.is_empty() {
self.config.weight_decay = weight_decay_vec[0];
}
}
}
        if let Some(freq_tensor) = state.get("hessian_update_freq") {
            if let Ok(freq_vec) = freq_tensor.data() {
                if !freq_vec.is_empty() {
                    self.config.hessian_update_freq = freq_vec[0] as usize;
                }
            }
        }
        if let Some(step_tensor) = state.get("step") {
if let Ok(step_vec) = step_tensor.data() {
if !step_vec.is_empty() {
self.state.step = step_vec[0] as usize;
}
}
}
for (key, tensor) in state.iter() {
if key.starts_with("momentum_") {
let param_id = key.trim_start_matches("momentum_");
if let Ok(momentum) = tensor.data() {
self.momentum.insert(param_id.to_string(), momentum.clone());
}
} else if key.starts_with("hessian_diag_") {
let param_id = key.trim_start_matches("hessian_diag_");
if let Ok(hessian_diag) = tensor.data() {
self.hessian_diag.insert(param_id.to_string(), hessian_diag.clone());
}
} else if key.starts_with("prev_grad_") {
let param_id = key.trim_start_matches("prev_grad_");
if let Ok(prev_grad) = tensor.data() {
self.prev_grad.insert(param_id.to_string(), prev_grad.clone());
}
}
}
Ok(())
}
fn memory_usage(&self) -> StateMemoryStats {
let mut momentum_elements = 0;
        let mut variance_elements = 0;
        let mut third_moment_elements = 0;
        for momentum in self.momentum.values() {
            momentum_elements += momentum.len();
        }
        // StateMemoryStats has no Sophia-specific slots, so the Hessian
        // diagonal is reported as variance and the previous-gradient buffer
        // as the third moment.
        for hessian_diag in self.hessian_diag.values() {
            variance_elements += hessian_diag.len();
        }
        for prev_grad in self.prev_grad.values() {
            third_moment_elements += prev_grad.len();
        }
let total_elements = momentum_elements + variance_elements + third_moment_elements;
let total_bytes = total_elements * std::mem::size_of::<f32>();
StateMemoryStats {
momentum_elements,
variance_elements,
third_moment_elements,
total_bytes,
num_parameters: momentum_elements,
}
}
fn reset_state(&mut self) {
self.state.step = 0;
self.momentum.clear();
self.hessian_diag.clear();
self.prev_grad.clear();
}
fn num_parameters(&self) -> usize {
self.momentum.values().map(|v| v.len()).sum()
}
}
impl Default for Sophia {
fn default() -> Self {
Self::new(1e-4, (0.965, 0.99), 1e-8, 0.04, 0.01)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sophia_creation() {
let optimizer = Sophia::new(1e-4, (0.965, 0.99), 1e-8, 0.04, 0.01);
assert_eq!(optimizer.config.lr, 1e-4);
assert_eq!(optimizer.config.betas, (0.965, 0.99));
assert_eq!(optimizer.config.eps, 1e-8);
assert_eq!(optimizer.config.rho, 0.04);
assert_eq!(optimizer.config.gamma, 0.01);
}
#[test]
fn test_sophia_with_config() {
let optimizer = Sophia::with_config(1e-4, (0.965, 0.99), 1e-8, 0.04, 0.01, 0.01, 5);
assert_eq!(optimizer.config.weight_decay, 0.01);
assert_eq!(optimizer.config.hessian_update_freq, 5);
}
#[test]
fn test_sophia_default() {
let optimizer = Sophia::default();
assert_eq!(optimizer.config.lr, 1e-4);
assert_eq!(optimizer.config.betas, (0.965, 0.99));
assert_eq!(optimizer.config.rho, 0.04);
assert_eq!(optimizer.config.gamma, 0.01);
}
#[test]
fn test_hessian_estimation() {
let optimizer = Sophia::default();
let current_grad = 1.0f32;
let previous_grad = 0.8f32;
let hessian_est = (current_grad - previous_grad).abs() / optimizer.config.lr.max(1e-8f32);
        assert!(hessian_est > 0.0);
    }
#[test]
fn test_clipping() {
let optimizer = Sophia::with_config(1e-4, (0.965, 0.99), 1e-8, 0.04, 0.01, 0.01, 10);
        assert_eq!(optimizer.clip_update(0.005), 0.005);
        assert_eq!(optimizer.clip_update(0.02), 0.01);
        assert_eq!(optimizer.clip_update(-0.02), -0.01);
    }
#[test]
fn test_memory_usage() {
let optimizer = Sophia::default();
let stats = optimizer.memory_usage();
assert_eq!(stats.total_bytes, 0);
assert_eq!(stats.momentum_elements, 0);
assert_eq!(stats.variance_elements, 0);
assert_eq!(stats.third_moment_elements, 0);
}
#[test]
fn test_state_persistence() {
let mut optimizer = Sophia::default();
optimizer.state.step = 50;
assert_eq!(optimizer.state.step, 50);
optimizer.reset_state();
assert_eq!(optimizer.state.step, 0);
assert!(optimizer.momentum.is_empty());
assert!(optimizer.hessian_diag.is_empty());
assert!(optimizer.prev_grad.is_empty());
}
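    #[test]
    fn test_state_dict_roundtrip() {
        // A hedged round-trip sketch: it assumes Tensor::data() yields the
        // stored f32 values, as load_state_dict above implies.
        let mut src = Sophia::with_config(3e-4, (0.9, 0.95), 1e-7, 0.05, 0.02, 0.1, 7);
        src.state.step = 42;
        let dict = src.state_dict().unwrap();
        let mut dst = Sophia::default();
        dst.load_state_dict(dict).unwrap();
        assert_eq!(dst.config.lr, 3e-4);
        assert_eq!(dst.config.betas, (0.9, 0.95));
        assert_eq!(dst.config.rho, 0.05);
        assert_eq!(dst.config.hessian_update_freq, 7);
        assert_eq!(dst.state.step, 42);
    }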
#[test]
fn test_hessian_update_frequency() {
let optimizer = Sophia::with_config(1e-4, (0.965, 0.99), 1e-8, 0.04, 0.01, 0.01, 5);
assert_eq!(optimizer.config.hessian_update_freq, 5);
        let steps_to_check: Vec<(usize, bool)> = vec![
            (1, false),
            (5, true),
            (10, true),
            (3, false),
        ];
for (step, should_update) in steps_to_check {
assert_eq!(
step % 5 == 0,
should_update,
"Step {step} update check failed"
);
}
}
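    #[test]
    fn test_sophia_update_decreases_params() {
        // A hedged end-to-end sketch: it assumes Tensor::new(Vec<f32>)
        // produces the Tensor::F32 variant, as the state_dict code implies.
        let mut optimizer = Sophia::default();
        let mut param = Tensor::new(vec![1.0f32, 2.0, 3.0]).unwrap();
        let grad = Tensor::new(vec![0.1f32, 0.2, 0.3]).unwrap();
        let before = param.data().unwrap().clone();
        optimizer.update(&mut param, &grad).unwrap();
        optimizer.step();
        let after = param.data().unwrap().clone();
        // Positive gradients (plus weight decay) must move positive
        // parameters downward.
        for (b, a) in before.iter().zip(after.iter()) {
            assert!(a < b);
        }
        assert_eq!(optimizer.num_parameters(), 3);
    }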
}