optirs_core/optimizers/rmsprop.rs
// RMSprop optimizer implementation

use scirs2_core::ndarray::{Array, Dimension, ScalarOperand};
use scirs2_core::numeric::Float;
use std::fmt::Debug;

use crate::error::Result;
use crate::optimizers::Optimizer;

/// RMSprop optimizer
///
/// Implements the RMSprop optimization algorithm as proposed by Geoffrey Hinton
/// in his Coursera course "Neural Networks for Machine Learning".
///
/// Formula:
///
/// ```text
/// v_t = rho * v_{t-1} + (1 - rho) * g_t^2
/// param_t = param_{t-1} - learning_rate * g_t / (sqrt(v_t) + epsilon)
/// ```
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::Array1;
/// use optirs_core::optimizers::{RMSprop, Optimizer};
///
/// // Initialize parameters and gradients
/// let params = Array1::zeros(5);
/// let gradients = Array1::from_vec(vec![0.1, 0.2, -0.3, 0.0, 0.5]);
///
/// // Create an RMSprop optimizer with learning rate 0.001
/// let mut optimizer = RMSprop::new(0.001);
///
/// // Update parameters
/// let new_params = optimizer.step(&params, &gradients).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct RMSprop<A: Float + ScalarOperand + Debug> {
    /// Learning rate
    learning_rate: A,
    /// Decay rate for the moving average of squared gradients
    rho: A,
    /// Small constant for numerical stability
    epsilon: A,
    /// Weight decay factor (L2 regularization)
    weight_decay: A,
    /// Moving average of squared gradients
    v: Option<Vec<Array<A, scirs2_core::ndarray::IxDyn>>>,
}

impl<A: Float + ScalarOperand + Debug + Send + Sync> RMSprop<A> {
    /// Creates a new RMSprop optimizer with the given learning rate and default settings
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The learning rate for parameter updates
    pub fn new(learning_rate: A) -> Self {
        Self {
            learning_rate,
            rho: A::from(0.9).unwrap(),
            epsilon: A::from(1e-8).unwrap(),
            weight_decay: A::zero(),
            v: None,
        }
    }

    /// Creates a new RMSprop optimizer with the full configuration
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The learning rate for parameter updates
    /// * `rho` - Decay rate for the moving average of squared gradients (default: 0.9)
    /// * `epsilon` - Small constant for numerical stability (default: 1e-8)
    /// * `weight_decay` - Weight decay factor for L2 regularization (default: 0.0)
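    ///
    /// # Examples
    ///
    /// A minimal configuration sketch; the hyperparameter values below are
    /// illustrative, not recommendations.
    ///
    /// ```
    /// use optirs_core::optimizers::RMSprop;
    ///
    /// // rho = 0.95, epsilon = 1e-7, weight_decay = 0.01 (example values)
    /// let optimizer: RMSprop<f64> = RMSprop::new_with_config(0.001, 0.95, 1e-7, 0.01);
    /// ```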
    pub fn new_with_config(learning_rate: A, rho: A, epsilon: A, weight_decay: A) -> Self {
        Self {
            learning_rate,
            rho,
            epsilon,
            weight_decay,
            v: None,
        }
    }

    /// Sets the rho parameter
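    ///
    /// Returns `&mut Self`, so setters can be chained. A small illustrative
    /// sketch (the values are arbitrary examples):
    ///
    /// ```
    /// use optirs_core::optimizers::RMSprop;
    ///
    /// let mut optimizer: RMSprop<f64> = RMSprop::new(0.001);
    /// optimizer.set_rho(0.95).set_epsilon(1e-7);
    /// ```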
    pub fn set_rho(&mut self, rho: A) -> &mut Self {
        self.rho = rho;
        self
    }

    /// Gets the rho parameter
    pub fn get_rho(&self) -> A {
        self.rho
    }

    /// Sets the epsilon parameter
    pub fn set_epsilon(&mut self, epsilon: A) -> &mut Self {
        self.epsilon = epsilon;
        self
    }

    /// Gets the epsilon parameter
    pub fn get_epsilon(&self) -> A {
        self.epsilon
    }

    /// Sets the weight decay parameter
    pub fn set_weight_decay(&mut self, weight_decay: A) -> &mut Self {
        self.weight_decay = weight_decay;
        self
    }

    /// Gets the weight decay parameter
    pub fn get_weight_decay(&self) -> A {
        self.weight_decay
    }

    /// Resets the internal state of the optimizer
    pub fn reset(&mut self) {
        self.v = None;
    }
}

impl<A, D> Optimizer<A, D> for RMSprop<A>
where
    A: Float + ScalarOperand + Debug + Send + Sync,
    D: Dimension,
{
    fn step(&mut self, params: &Array<A, D>, gradients: &Array<A, D>) -> Result<Array<A, D>> {
        // Convert to dynamic dimension for storage in the state vector
        let params_dyn = params.to_owned().into_dyn();
        let gradients_dyn = gradients.to_owned().into_dyn();

        // Apply weight decay (L2 regularization) to the gradients if enabled
        let adjusted_gradients = if self.weight_decay > A::zero() {
            &gradients_dyn + &(&params_dyn * self.weight_decay)
        } else {
            gradients_dyn
        };

        // Initialize state on the first step
        if self.v.is_none() {
            self.v = Some(vec![Array::zeros(params_dyn.raw_dim())]);
        }

        let v = self.v.as_mut().unwrap();

        // Ensure we have state matching this parameter set
        if v.is_empty() {
            v.push(Array::zeros(params_dyn.raw_dim()));
        } else if v[0].raw_dim() != params_dyn.raw_dim() {
            // If the parameter dimensions have changed, reset the state
            v[0] = Array::zeros(params_dyn.raw_dim());
        }

        // Update the moving average of squared gradients
        // v_t = rho * v_{t-1} + (1 - rho) * g_t^2
        v[0] =
            &v[0] * self.rho + &(&adjusted_gradients * &adjusted_gradients * (A::one() - self.rho));

        // Compute the step size
        // step = learning_rate * g_t / (sqrt(v_t) + epsilon)
        let v_sqrt = v[0].mapv(|x| x.sqrt());
        let step = &adjusted_gradients * self.learning_rate / &(&v_sqrt + self.epsilon);

        // Update parameters
        let updated_params = &params_dyn - step;

        // Convert back to the original dimension
        Ok(updated_params.into_dimensionality::<D>().unwrap())
    }

    fn get_learning_rate(&self) -> A {
        self.learning_rate
    }

    fn set_learning_rate(&mut self, learning_rate: A) {
        self.learning_rate = learning_rate;
    }
}
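
#[cfg(test)]
mod tests {
    // A minimal behavioral test sketch, assuming `scirs2_core::ndarray::Array1`
    // is available as in the doc example above. The assertions are qualitative
    // (direction of the update), not exact numerical expectations.
    use super::*;
    use scirs2_core::ndarray::Array1;

    #[test]
    fn step_moves_params_against_gradient() {
        let params: Array1<f64> = Array1::zeros(3);
        let gradients = Array1::from_vec(vec![0.1, 0.2, 0.3]);

        let mut optimizer = RMSprop::new(0.01);
        let new_params = optimizer.step(&params, &gradients).unwrap();

        // Shape is preserved and every parameter moves opposite to its (positive) gradient
        assert_eq!(new_params.raw_dim(), params.raw_dim());
        assert!(new_params.iter().all(|&p| p < 0.0));
    }
}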