use crate::common::{OptimizerState, StateMemoryStats};
use crate::traits::StatefulOptimizer;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use trustformers_core::errors::Result;
use trustformers_core::tensor::Tensor;
use trustformers_core::traits::Optimizer;
/// Hyper-parameters for [`OptimizedBGEAdam`], an Adam variant that weights and
/// adapts its update using the Shannon entropy of the gradient distribution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizedBGEAdamConfig {
    /// Base step size applied to every parameter update.
    pub learning_rate: f32,
    /// First-moment (momentum) decay rate.
    pub beta1: f32,
    /// Second-moment (variance) decay rate.
    pub beta2: f32,
    /// Numerical-stability constant; also used as the "vanishing gradient" threshold.
    pub epsilon: f32,
    /// Decoupled weight-decay coefficient added to the update term.
    pub weight_decay: f32,
    /// Scale inside the per-element weight `exp(-entropy_scaling * p_i)`.
    pub entropy_scaling: f32,
    /// Strength of the entropy-driven increase of beta1: `beta1 * (1 + a * H)`.
    pub beta1_adaptation: f32,
    /// Strength of the entropy-driven decrease of beta2: `beta2 * (1 - a * H)`.
    pub beta2_adaptation: f32,
    /// Floor applied to the computed entropy (and returned when gradients vanish).
    pub min_entropy: f32,
    /// Enables Adam-style bias correction of both moment estimates.
    pub bias_correction: bool,
    /// Enables per-element entropy weighting of gradients (uniform weights otherwise).
    pub entropy_weighting: bool,
    /// Enables entropy-adaptive beta1/beta2 (fixed betas otherwise).
    pub adaptive_parameters: bool,
    /// Maximum number of entries retained in the entropy history ring.
    pub max_entropy_history: usize,
    /// Selects the manually chunked update loop; also reported in `performance_stats`.
    pub use_vectorized: bool,
}
impl Default for OptimizedBGEAdamConfig {
fn default() -> Self {
Self {
learning_rate: 1e-3,
beta1: 0.9,
beta2: 0.999,
epsilon: 1e-8,
weight_decay: 0.01,
entropy_scaling: 0.1,
beta1_adaptation: 0.05,
beta2_adaptation: 0.05,
min_entropy: 1e-6,
bias_correction: true,
entropy_weighting: true,
adaptive_parameters: true,
max_entropy_history: 100,
use_vectorized: true,
}
}
}
impl OptimizedBGEAdamConfig {
    /// Preset tuned for large language models: lower learning rate, stronger
    /// weight decay, gentler entropy adaptation, shorter history.
    ///
    /// Fields equal to their [`Default`] values (e.g. `use_vectorized`) are
    /// left to `..Default::default()` instead of being restated.
    pub fn for_large_models() -> Self {
        Self {
            learning_rate: 1e-4,
            weight_decay: 0.1,
            entropy_scaling: 0.05,
            beta1_adaptation: 0.02,
            beta2_adaptation: 0.02,
            max_entropy_history: 50,
            ..Default::default()
        }
    }

    /// Preset tuned for vision models: moderately higher learning rate and
    /// stronger entropy weighting/adaptation than the LLM preset.
    pub fn for_vision() -> Self {
        Self {
            learning_rate: 2e-4,
            weight_decay: 0.05,
            entropy_scaling: 0.15,
            beta1_adaptation: 0.08,
            beta2_adaptation: 0.08,
            max_entropy_history: 75,
            ..Default::default()
        }
    }

    /// Preset minimizing bookkeeping overhead: small entropy history and
    /// reduced adaptation strength. All boolean features keep their default
    /// (enabled) values via `..Default::default()`.
    pub fn for_high_performance() -> Self {
        Self {
            max_entropy_history: 32,
            entropy_scaling: 0.08,
            beta1_adaptation: 0.03,
            beta2_adaptation: 0.03,
            ..Default::default()
        }
    }
}
/// Entropy-guided Adam optimizer with reusable scratch buffers to avoid
/// per-step allocations.
#[derive(Debug)]
pub struct OptimizedBGEAdam {
    // Hyper-parameters controlling the update rule.
    config: OptimizedBGEAdamConfig,
    // Per-parameter momentum/variance buffers keyed by parameter identity.
    state: OptimizerState,
    // Total number of `update` calls; drives bias correction.
    step_count: usize,
    // Rolling window of recent gradient entropies (capped by config).
    entropy_history: Vec<f32>,
    // Scratch: per-element |gradient| values.
    temp_buffer1: Vec<f32>,
    // Scratch: per-element entropy weights.
    temp_buffer2: Vec<f32>,
    // Scratch: sized alongside the others but never read in the visible code —
    // presumably reserved for future use; TODO confirm before removing.
    temp_buffer3: Vec<f32>,
}
impl OptimizedBGEAdam {
    /// Creates an optimizer with [`OptimizedBGEAdamConfig::default`] settings.
    pub fn new() -> Self {
        Self::with_config(OptimizedBGEAdamConfig::default())
    }

    /// Creates an optimizer from an explicit configuration.
    pub fn with_config(config: OptimizedBGEAdamConfig) -> Self {
        Self {
            // Size the history to the configured cap; the original hard-coded
            // 100, which over/under-allocated for non-default configs.
            entropy_history: Vec::with_capacity(config.max_entropy_history),
            config,
            state: OptimizerState::default(),
            step_count: 0,
            temp_buffer1: Vec::new(),
            temp_buffer2: Vec::new(),
            temp_buffer3: Vec::new(),
        }
    }

    /// Convenience constructor using [`OptimizedBGEAdamConfig::for_large_models`].
    pub fn for_large_models() -> Self {
        Self::with_config(OptimizedBGEAdamConfig::for_large_models())
    }

    /// Convenience constructor using [`OptimizedBGEAdamConfig::for_vision`].
    pub fn for_vision() -> Self {
        Self::with_config(OptimizedBGEAdamConfig::for_vision())
    }

    /// Convenience constructor using [`OptimizedBGEAdamConfig::for_high_performance`].
    pub fn for_high_performance() -> Self {
        Self::with_config(OptimizedBGEAdamConfig::for_high_performance())
    }

    /// Fused BGE-Adam update over one flat parameter/gradient pair.
    ///
    /// Computes the Shannon entropy of the normalized |gradient| distribution,
    /// optionally derives per-element weights `exp(-entropy_scaling * p_i)`,
    /// optionally adapts beta1/beta2 from that entropy, then applies a
    /// bias-corrected Adam step with decoupled weight decay in place.
    ///
    /// Returns the (floored) gradient entropy for history tracking. All slices
    /// are assumed to share `gradients.len()` — TODO confirm callers guarantee this.
    fn process_gradients_single_pass(
        &mut self,
        gradients: &[f32],
        momentum: &mut [f32],
        variance: &mut [f32],
        params: &mut [f32],
        step_count: f32,
    ) -> Result<f32> {
        let n = gradients.len();
        // Grow (never shrink) the scratch buffers. temp_buffer3 is only sized,
        // never read; it is kept so memory_usage() reporting stays unchanged.
        if self.temp_buffer1.len() < n {
            self.temp_buffer1.resize(n, 0.0);
            self.temp_buffer2.resize(n, 0.0);
            self.temp_buffer3.resize(n, 0.0);
        }
        let eps = self.config.epsilon;
        let entropy_scaling = self.config.entropy_scaling;

        // Pass 1: |g_i| and their sum, needed to normalize into a distribution.
        let mut sum_abs_grads = 0.0f32;
        for (i, &grad) in gradients.iter().enumerate() {
            let abs_grad = grad.abs();
            self.temp_buffer1[i] = abs_grad;
            sum_abs_grads += abs_grad;
        }
        // Vanishing gradients: skip the step entirely (no moment/param update),
        // reporting the entropy floor.
        if sum_abs_grads < eps {
            return Ok(self.config.min_entropy);
        }

        let inv_sum = 1.0 / sum_abs_grads;
        let mut entropy = 0.0f32;
        if self.config.entropy_weighting {
            // Pass 2: accumulate H = -sum p ln(p + eps) and the per-element weights.
            for i in 0..n {
                let prob = self.temp_buffer1[i] * inv_sum;
                if prob > eps {
                    entropy -= prob * (prob + eps).ln();
                }
                self.temp_buffer2[i] = (-entropy_scaling * prob).exp();
            }
        } else {
            // Uniform weights; entropy stays 0 and is floored below.
            self.temp_buffer2[..n].fill(1.0);
        }
        entropy = entropy.max(self.config.min_entropy);

        // Optionally shift the betas with entropy: momentum up, variance decay
        // down, both clamped to sane Adam ranges.
        let (beta1_adaptive, beta2_adaptive) = if self.config.adaptive_parameters {
            let beta1 = self.config.beta1 * (1.0 + self.config.beta1_adaptation * entropy);
            let beta2 = self.config.beta2 * (1.0 - self.config.beta2_adaptation * entropy);
            (beta1.clamp(0.1, 0.99), beta2.clamp(0.9, 0.9999))
        } else {
            (self.config.beta1, self.config.beta2)
        };

        // Standard Adam bias-correction factors 1 / (1 - beta^t).
        let (momentum_correction, variance_correction) = if self.config.bias_correction {
            (
                1.0 / (1.0 - beta1_adaptive.powf(step_count)),
                1.0 / (1.0 - beta2_adaptive.powf(step_count)),
            )
        } else {
            (1.0, 1.0)
        };

        let lr = self.config.learning_rate;
        let weight_decay = self.config.weight_decay;
        let one_minus_beta1 = 1.0 - beta1_adaptive;
        let one_minus_beta2 = 1.0 - beta2_adaptive;

        // The original's "vectorized" chunked path and its scalar path performed
        // the identical element-wise update; a single loop keeps the numerics
        // bit-for-bit the same while removing the triplicated code.
        for i in 0..n {
            let weighted_grad = gradients[i] * self.temp_buffer2[i];
            momentum[i] = beta1_adaptive * momentum[i] + one_minus_beta1 * weighted_grad;
            variance[i] =
                beta2_adaptive * variance[i] + one_minus_beta2 * weighted_grad * weighted_grad;
            let corrected_momentum = momentum[i] * momentum_correction;
            let corrected_variance = variance[i] * variance_correction;
            let update = corrected_momentum / (corrected_variance.sqrt() + eps);
            let weight_decay_term = weight_decay * params[i];
            params[i] -= lr * (update + weight_decay_term);
        }
        Ok(entropy)
    }

    /// Appends one entropy sample, evicting the oldest entries beyond the cap.
    fn update_entropy_history(&mut self, entropy: f32) {
        self.entropy_history.push(entropy);
        if self.entropy_history.len() > self.config.max_entropy_history {
            let excess = self.entropy_history.len() - self.config.max_entropy_history;
            self.entropy_history.drain(0..excess);
        }
    }

    /// Returns `(min, max, mean)` over the entropy history, or all zeros when
    /// the history is empty.
    pub fn get_entropy_stats(&self) -> (f32, f32, f32) {
        if self.entropy_history.is_empty() {
            return (0.0, 0.0, 0.0);
        }
        let mut min_entropy = f32::INFINITY;
        let mut max_entropy = f32::NEG_INFINITY;
        let mut sum_entropy = 0.0;
        for &entropy in &self.entropy_history {
            min_entropy = min_entropy.min(entropy);
            max_entropy = max_entropy.max(entropy);
            sum_entropy += entropy;
        }
        let avg_entropy = sum_entropy / self.entropy_history.len() as f32;
        (min_entropy, max_entropy, avg_entropy)
    }

    /// Human-readable snapshot of step count, history fill level, and scratch
    /// capacity, for logging/debugging.
    pub fn performance_stats(&self) -> String {
        format!(
            "Optimized BGE-Adam Stats:\n\
             - Step count: {}\n\
             - Entropy history: {}/{} entries\n\
             - Vectorized ops: {}\n\
             - Buffer capacity: {} elements",
            self.step_count,
            self.entropy_history.len(),
            self.config.max_entropy_history,
            self.config.use_vectorized,
            self.temp_buffer1.capacity()
        )
    }
}
impl Default for OptimizedBGEAdam {
fn default() -> Self {
Self::new()
}
}
impl Optimizer for OptimizedBGEAdam {
    /// No-op: gradients are supplied explicitly to [`Optimizer::update`], so
    /// there is no owned gradient storage to clear.
    fn zero_grad(&mut self) {}

    /// Applies one BGE-Adam step to `parameter` using `gradient`.
    ///
    /// State buffers are moved out with `std::mem::take` and restored
    /// afterwards — the original cloned both `Vec<f32>` buffers every step
    /// (an O(n) copy per parameter) solely to satisfy the borrow checker.
    fn update(&mut self, parameter: &mut Tensor, gradient: &Tensor) -> Result<()> {
        // NOTE(review): identity is the tensor's address, which assumes
        // parameters are not moved between steps — confirm with callers.
        let param_id = format!("{:p}", parameter as *const _);
        self.step_count += 1;
        let gradient_data = gradient.data()?;
        let mut param_data = parameter.data()?.clone();
        let param_size = gradient_data.len();
        // Ensure both state buffers exist, then move them out without cloning.
        self.state.get_or_create_momentum(param_id.clone(), param_size);
        self.state.get_or_create_variance(param_id.clone(), param_size);
        let mut momentum_data = std::mem::take(
            self.state
                .momentum
                .get_mut(&param_id)
                .expect("momentum buffer was just created"),
        );
        let mut variance_data = std::mem::take(
            self.state
                .variance
                .get_mut(&param_id)
                .expect("variance buffer was just created"),
        );
        let result = self.process_gradients_single_pass(
            &gradient_data,
            &mut momentum_data,
            &mut variance_data,
            &mut param_data,
            self.step_count as f32,
        );
        // Restore the buffers before propagating any error so the state map
        // never retains the empty placeholders left behind by mem::take.
        if let Some(momentum_buffer) = self.state.momentum.get_mut(&param_id) {
            *momentum_buffer = momentum_data;
        }
        if let Some(variance_buffer) = self.state.variance.get_mut(&param_id) {
            *variance_buffer = variance_data;
        }
        let entropy = result?;
        self.update_entropy_history(entropy);
        *parameter = Tensor::new(param_data)?;
        Ok(())
    }

    /// Advances the shared optimizer-state step counter.
    fn step(&mut self) {
        self.state.step();
    }

    /// Overrides the configured learning rate.
    fn set_lr(&mut self, lr: f32) {
        self.config.learning_rate = lr;
    }

    /// Returns the current learning rate.
    fn get_lr(&self) -> f32 {
        self.config.learning_rate
    }
}
impl StatefulOptimizer for OptimizedBGEAdam {
    type Config = OptimizedBGEAdamConfig;
    type State = OptimizerState;

    fn config(&self) -> &Self::Config {
        &self.config
    }

    fn state(&self) -> &Self::State {
        &self.state
    }

    fn state_mut(&mut self) -> &mut Self::State {
        &mut self.state
    }

    /// Serializes moment buffers, entropy history, and the step counter into a
    /// flat `name -> Tensor` map.
    fn state_dict(&self) -> Result<HashMap<String, Tensor>> {
        let mut dict = HashMap::new();
        for (key, buffer) in &self.state.momentum {
            dict.insert(format!("{}_momentum", key), Tensor::new(buffer.clone())?);
        }
        for (key, buffer) in &self.state.variance {
            dict.insert(format!("{}_variance", key), Tensor::new(buffer.clone())?);
        }
        dict.insert(
            "entropy_history".to_string(),
            Tensor::new(self.entropy_history.clone())?,
        );
        dict.insert(
            "step_count".to_string(),
            Tensor::new(vec![self.step_count as f32])?,
        );
        Ok(dict)
    }

    /// Restores state previously produced by [`StatefulOptimizer::state_dict`];
    /// unrecognized keys are silently ignored.
    fn load_state_dict(&mut self, state_dict: HashMap<String, Tensor>) -> Result<()> {
        for (key, tensor) in state_dict {
            let values = tensor.data()?;
            match key.as_str() {
                "entropy_history" => self.entropy_history = values.clone(),
                "step_count" => {
                    if let Some(&step) = values.first() {
                        self.step_count = step as usize;
                    }
                }
                _ => {
                    if let Some(name) = key.strip_suffix("_momentum") {
                        self.state.momentum.insert(name.to_string(), values.clone());
                    } else if let Some(name) = key.strip_suffix("_variance") {
                        self.state.variance.insert(name.to_string(), values.clone());
                    }
                }
            }
        }
        Ok(())
    }

    /// Approximate memory footprint: live state elements plus scratch-buffer
    /// capacity, all counted as `f32` elements.
    fn memory_usage(&self) -> StateMemoryStats {
        let momentum_elements: usize = self.state.momentum.values().map(|v| v.len()).sum();
        let variance_elements: usize = self.state.variance.values().map(|v| v.len()).sum();
        let scratch_elements = self.temp_buffer1.capacity()
            + self.temp_buffer2.capacity()
            + self.temp_buffer3.capacity();
        let element_total =
            momentum_elements + variance_elements + self.entropy_history.len() + scratch_elements;
        StateMemoryStats {
            momentum_elements,
            variance_elements,
            third_moment_elements: 0,
            total_bytes: element_total * std::mem::size_of::<f32>(),
            num_parameters: self.state.momentum.len(),
        }
    }

    /// Drops all per-parameter state, history, and scratch space.
    fn reset_state(&mut self) {
        self.step_count = 0;
        self.state.clear();
        self.entropy_history.clear();
        for buffer in [
            &mut self.temp_buffer1,
            &mut self.temp_buffer2,
            &mut self.temp_buffer3,
        ] {
            buffer.clear();
        }
    }

    fn num_parameters(&self) -> usize {
        self.state.momentum.len()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimized_bge_adam_creation() {
        // A fresh optimizer starts at the default learning rate and step 0.
        let opt = OptimizedBGEAdam::new();
        assert_eq!(opt.get_lr(), 1e-3);
        assert_eq!(opt.step_count, 0);
    }

    #[test]
    fn test_optimized_bge_adam_presets() {
        let llm = OptimizedBGEAdam::for_large_models();
        assert_eq!(llm.config.learning_rate, 1e-4);
        assert_eq!(llm.config.weight_decay, 0.1);
        assert_eq!(llm.config.max_entropy_history, 50);

        let vision = OptimizedBGEAdam::for_vision();
        assert_eq!(vision.config.learning_rate, 2e-4);
        assert_eq!(vision.config.weight_decay, 0.05);

        let perf = OptimizedBGEAdam::for_high_performance();
        assert_eq!(perf.config.max_entropy_history, 32);
        assert!(perf.config.use_vectorized);
    }

    #[test]
    fn test_entropy_history_management() {
        let config = OptimizedBGEAdamConfig {
            max_entropy_history: 3,
            ..Default::default()
        };
        let mut opt = OptimizedBGEAdam::with_config(config);
        for value in [0.1, 0.2, 0.3, 0.4] {
            opt.update_entropy_history(value);
        }
        // The oldest entry (0.1) is evicted once the cap of 3 is exceeded.
        assert_eq!(opt.entropy_history.len(), 3);
        assert_eq!(opt.entropy_history[0], 0.2);
        assert_eq!(opt.entropy_history[2], 0.4);
    }

    #[test]
    fn test_entropy_stats() {
        let mut opt = OptimizedBGEAdam::new();
        // Empty history reports all zeros.
        assert_eq!(opt.get_entropy_stats(), (0.0, 0.0, 0.0));
        for value in [0.1, 0.3, 0.2] {
            opt.update_entropy_history(value);
        }
        let (min, max, avg) = opt.get_entropy_stats();
        assert_eq!(min, 0.1);
        assert_eq!(max, 0.3);
        assert_eq!(avg, 0.2);
    }

    #[test]
    fn test_performance_stats() {
        let report = OptimizedBGEAdam::new().performance_stats();
        assert!(report.contains("Step count: 0"));
        assert!(report.contains("Vectorized ops: true"));
    }
}