optirs_core/optimizers/adagrad.rs
// Adagrad optimizer implementation

use scirs2_core::ndarray::{Array, Dimension, ScalarOperand};
use scirs2_core::numeric::Float;
use std::fmt::Debug;

use crate::error::Result;
use crate::optimizers::Optimizer;

/// Adagrad optimizer
///
/// Implements the Adagrad optimization algorithm from the paper:
/// "Adaptive Subgradient Methods for Online Learning and Stochastic Optimization" by Duchi et al. (2011)
///
/// Adagrad adapts the learning rate to the parameters, performing larger updates for
/// infrequently updated parameters and smaller updates for frequently updated parameters.
///
/// Formula:
/// G_t = G_{t-1} + g_t^2
/// param_t = param_{t-1} - learning_rate * g_t / (sqrt(G_t) + epsilon)
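///
/// For example, with learning rate 0.1 and a constant gradient of 0.5 (ignoring
/// epsilon for clarity), the first update is 0.1 * 0.5 / sqrt(0.25) = 0.1, while
/// the second shrinks to 0.1 * 0.5 / sqrt(0.5) ≈ 0.071, illustrating the decaying
/// effective step size.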
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::Array1;
/// use optirs_core::optimizers::{Adagrad, Optimizer};
///
/// // Initialize parameters and gradients
/// let params = Array1::zeros(5);
/// let gradients = Array1::from_vec(vec![0.1, 0.2, -0.3, 0.0, 0.5]);
///
/// // Create an Adagrad optimizer with learning rate 0.01
/// let mut optimizer = Adagrad::new(0.01);
///
/// // Update parameters
/// let new_params = optimizer.step(&params, &gradients).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct Adagrad<A: Float + ScalarOperand + Debug> {
    /// Learning rate
    learning_rate: A,
    /// Small constant for numerical stability
    epsilon: A,
    /// Weight decay factor (L2 regularization)
    weight_decay: A,
    /// Sum of squared gradients
    sum_squared_grads: Option<Vec<Array<A, scirs2_core::ndarray::IxDyn>>>,
}

impl<A: Float + ScalarOperand + Debug + Send + Sync> Adagrad<A> {
    /// Creates a new Adagrad optimizer with the given learning rate and default settings
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The learning rate for parameter updates
    pub fn new(learning_rate: A) -> Self {
        Self {
            learning_rate,
            epsilon: A::from(1e-10).unwrap(),
            weight_decay: A::zero(),
            sum_squared_grads: None,
        }
    }

    /// Creates a new Adagrad optimizer with the full configuration
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The learning rate for parameter updates
    /// * `epsilon` - Small constant for numerical stability (default: 1e-10)
    /// * `weight_decay` - Weight decay factor for L2 regularization (default: 0.0)
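    ///
    /// # Examples
    ///
    /// A minimal usage sketch; the epsilon and weight decay values below are
    /// illustrative, not recommended defaults.
    ///
    /// ```
    /// use optirs_core::optimizers::Adagrad;
    ///
    /// let optimizer: Adagrad<f64> = Adagrad::new_with_config(0.01, 1e-8, 0.0001);
    /// ```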
    pub fn new_with_config(learning_rate: A, epsilon: A, weight_decay: A) -> Self {
        Self {
            learning_rate,
            epsilon,
            weight_decay,
            sum_squared_grads: None,
        }
    }

    /// Sets the epsilon parameter
    pub fn set_epsilon(&mut self, epsilon: A) -> &mut Self {
        self.epsilon = epsilon;
        self
    }

    /// Gets the epsilon parameter
    pub fn get_epsilon(&self) -> A {
        self.epsilon
    }

    /// Sets the weight decay parameter
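    ///
    /// # Examples
    ///
    /// A minimal sketch of builder-style chaining (the values are illustrative);
    /// the setter returns `&mut Self`, so calls can be chained.
    ///
    /// ```
    /// use optirs_core::optimizers::Adagrad;
    ///
    /// let mut optimizer = Adagrad::new(0.01_f64);
    /// optimizer.set_weight_decay(0.0001).set_epsilon(1e-8);
    /// ```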
    pub fn set_weight_decay(&mut self, weight_decay: A) -> &mut Self {
        self.weight_decay = weight_decay;
        self
    }

    /// Gets the weight decay parameter
    pub fn get_weight_decay(&self) -> A {
        self.weight_decay
    }

    /// Resets the internal state of the optimizer
    pub fn reset(&mut self) {
        self.sum_squared_grads = None;
    }
}

impl<A, D> Optimizer<A, D> for Adagrad<A>
where
    A: Float + ScalarOperand + Debug + Send + Sync,
    D: Dimension,
{
    fn step(&mut self, params: &Array<A, D>, gradients: &Array<A, D>) -> Result<Array<A, D>> {
        // Convert to dynamic dimension for storage in state vectors
        let params_dyn = params.to_owned().into_dyn();
        let gradients_dyn = gradients.to_owned().into_dyn();

        // Apply weight decay to gradients if needed
        let adjusted_gradients = if self.weight_decay > A::zero() {
            &gradients_dyn + &(&params_dyn * self.weight_decay)
        } else {
            gradients_dyn.clone()
        };

        // Initialize state if this is the first step
        if self.sum_squared_grads.is_none() {
            self.sum_squared_grads = Some(vec![Array::zeros(params_dyn.raw_dim())]);
        }

        let sum_squared_grads = self.sum_squared_grads.as_mut().unwrap();

        // Ensure we have state for this parameter set
        if sum_squared_grads.is_empty() {
            sum_squared_grads.push(Array::zeros(params_dyn.raw_dim()));
        } else if sum_squared_grads[0].raw_dim() != params_dyn.raw_dim() {
            // If the parameter dimensions have changed, reset state
            sum_squared_grads[0] = Array::zeros(params_dyn.raw_dim());
        }

        // Update sum of squared gradients
        // G_t = G_{t-1} + g_t^2
        sum_squared_grads[0] = &sum_squared_grads[0] + &(&adjusted_gradients * &adjusted_gradients);

        // Compute step size
        // step = learning_rate * g_t / (sqrt(G_t) + epsilon)
        let g_sqrt = sum_squared_grads[0].mapv(|x| x.sqrt());
        let step = &adjusted_gradients * self.learning_rate / &(&g_sqrt + self.epsilon);

        // Update parameters
        let updated_params = &params_dyn - step;

        // Convert back to original dimension
        Ok(updated_params.into_dimensionality::<D>().unwrap())
    }

    fn get_learning_rate(&self) -> A {
        self.learning_rate
    }

    fn set_learning_rate(&mut self, learning_rate: A) {
        self.learning_rate = learning_rate;
    }
}
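
// A minimal sanity-check sketch for the step update: it exercises `step` on a small
// `Array1` (mirroring the doc example above) and only asserts the direction of each
// update, so it stays independent of the exact step size. The test name and values
// are illustrative.
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    #[test]
    fn step_moves_params_against_gradient() {
        let params = Array1::from_vec(vec![1.0_f64, 1.0, 1.0]);
        let gradients = Array1::from_vec(vec![0.5, -0.5, 0.0]);
        let mut optimizer = Adagrad::new(0.1);

        let updated = optimizer.step(&params, &gradients).unwrap();

        // Positive gradient -> parameter decreases; negative gradient -> parameter increases.
        assert!(updated[0] < params[0]);
        assert!(updated[1] > params[1]);
        // Zero gradient -> G stays zero and the step is zero, so the parameter is unchanged.
        assert_eq!(updated[2], params[2]);
    }
}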