optirs_core/optimizers/rmsprop.rs

// RMSprop optimizer implementation

use scirs2_core::ndarray::{Array, Dimension, ScalarOperand};
use scirs2_core::numeric::Float;
use std::fmt::Debug;

use crate::error::Result;
use crate::optimizers::Optimizer;

/// RMSprop optimizer
///
/// Implements the RMSprop optimization algorithm as proposed by Geoffrey Hinton
/// in his Coursera course "Neural Networks for Machine Learning".
///
/// Formula:
/// v_t = rho * v_{t-1} + (1 - rho) * g_t^2
/// param_t = param_{t-1} - learning_rate * g_t / (sqrt(v_t) + epsilon)
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::Array1;
/// use optirs_core::optimizers::{RMSprop, Optimizer};
///
/// // Initialize parameters and gradients
/// let params = Array1::zeros(5);
/// let gradients = Array1::from_vec(vec![0.1, 0.2, -0.3, 0.0, 0.5]);
///
/// // Create an RMSprop optimizer with learning rate 0.001
/// let mut optimizer = RMSprop::new(0.001);
///
/// // Update parameters
/// let new_params = optimizer.step(&params, &gradients).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct RMSprop<A: Float + ScalarOperand + Debug> {
    /// Learning rate
    learning_rate: A,
    /// Decay rate for the moving average of squared gradients
    rho: A,
    /// Small constant for numerical stability
    epsilon: A,
    /// Weight decay factor (L2 regularization)
    weight_decay: A,
    /// Moving average of squared gradients
    v: Option<Vec<Array<A, scirs2_core::ndarray::IxDyn>>>,
}

impl<A: Float + ScalarOperand + Debug + Send + Sync> RMSprop<A> {
    /// Creates a new RMSprop optimizer with the given learning rate and default settings
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The learning rate for parameter updates
    pub fn new(learning_rate: A) -> Self {
        Self {
            learning_rate,
            rho: A::from(0.9).unwrap(),
            epsilon: A::from(1e-8).unwrap(),
            weight_decay: A::zero(),
            v: None,
        }
    }

    /// Creates a new RMSprop optimizer with a fully specified configuration
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The learning rate for parameter updates
    /// * `rho` - Decay rate for the moving average of squared gradients (default: 0.9)
    /// * `epsilon` - Small constant for numerical stability (default: 1e-8)
    /// * `weight_decay` - Weight decay factor for L2 regularization (default: 0.0)
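    ///
    /// # Examples
    ///
    /// An illustrative sketch of a fully configured optimizer (the values below are
    /// placeholders, not recommended settings):
    ///
    /// ```
    /// use optirs_core::optimizers::RMSprop;
    ///
    /// // learning_rate = 0.01, rho = 0.95, epsilon = 1e-7, weight_decay = 1e-4
    /// let optimizer: RMSprop<f64> = RMSprop::new_with_config(0.01, 0.95, 1e-7, 1e-4);
    /// assert_eq!(optimizer.get_rho(), 0.95);
    /// ```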
    pub fn new_with_config(learning_rate: A, rho: A, epsilon: A, weight_decay: A) -> Self {
        Self {
            learning_rate,
            rho,
            epsilon,
            weight_decay,
            v: None,
        }
    }

    /// Sets the rho parameter
    pub fn set_rho(&mut self, rho: A) -> &mut Self {
        self.rho = rho;
        self
    }

    /// Gets the rho parameter
    pub fn get_rho(&self) -> A {
        self.rho
    }

    /// Sets the epsilon parameter
    pub fn set_epsilon(&mut self, epsilon: A) -> &mut Self {
        self.epsilon = epsilon;
        self
    }

    /// Gets the epsilon parameter
    pub fn get_epsilon(&self) -> A {
        self.epsilon
    }

    /// Sets the weight decay parameter
    pub fn set_weight_decay(&mut self, weight_decay: A) -> &mut Self {
        self.weight_decay = weight_decay;
        self
    }

    /// Gets the weight decay parameter
    pub fn get_weight_decay(&self) -> A {
        self.weight_decay
    }

    /// Resets the internal state of the optimizer
    pub fn reset(&mut self) {
        self.v = None;
    }
}

impl<A, D> Optimizer<A, D> for RMSprop<A>
where
    A: Float + ScalarOperand + Debug + Send + Sync,
    D: Dimension,
{
    fn step(&mut self, params: &Array<A, D>, gradients: &Array<A, D>) -> Result<Array<A, D>> {
        // Convert to dynamic dimension for storage in state vectors
        let params_dyn = params.to_owned().into_dyn();
        let gradients_dyn = gradients.to_owned().into_dyn();

        // Apply weight decay to gradients if needed
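        // g_t = g_t + weight_decay * param_{t-1}  (L2 penalty folded into the gradient)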
        let adjusted_gradients = if self.weight_decay > A::zero() {
            &gradients_dyn + &(&params_dyn * self.weight_decay)
        } else {
            gradients_dyn
        };

        // Initialize state if this is the first step
        if self.v.is_none() {
            self.v = Some(vec![Array::zeros(params_dyn.raw_dim())]);
        }

        let v = self.v.as_mut().unwrap();

        // Ensure we have state for this parameter set
        if v.is_empty() {
            v.push(Array::zeros(params_dyn.raw_dim()));
        } else if v[0].raw_dim() != params_dyn.raw_dim() {
            // If the parameter dimensions have changed, reset state
            v[0] = Array::zeros(params_dyn.raw_dim());
        }

        // Update moving average of squared gradients
        // v_t = rho * v_{t-1} + (1 - rho) * g_t^2
        v[0] =
            &v[0] * self.rho + &(&adjusted_gradients * &adjusted_gradients * (A::one() - self.rho));

        // Compute step size
        // step = learning_rate * g_t / (sqrt(v_t) + epsilon)
        let v_sqrt = v[0].mapv(|x| x.sqrt());
        let step = &adjusted_gradients * self.learning_rate / &(&v_sqrt + self.epsilon);

        // Update parameters
        let updated_params = &params_dyn - step;

        // Convert back to original dimension
        Ok(updated_params.into_dimensionality::<D>().unwrap())
    }

    fn get_learning_rate(&self) -> A {
        self.learning_rate
    }

    fn set_learning_rate(&mut self, learning_rate: A) {
        self.learning_rate = learning_rate;
    }
}
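
// A minimal sanity-check sketch (not an exhaustive test suite): it verifies that the
// first optimizer step matches the documented update rule, assuming `f64` parameters
// and using only the public API defined in this file.
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    #[test]
    fn first_step_matches_update_rule() {
        let (lr, rho, eps) = (0.1_f64, 0.9, 1e-8);
        let mut optimizer = RMSprop::new_with_config(lr, rho, eps, 0.0);

        let params = Array1::from_vec(vec![1.0_f64, 2.0]);
        let gradients = Array1::from_vec(vec![0.5_f64, -0.5]);

        let updated = optimizer.step(&params, &gradients).unwrap();

        // With v_0 = 0: v_1 = (1 - rho) * g^2 and
        // param_1 = param_0 - lr * g / (sqrt(v_1) + eps).
        for i in 0..2 {
            let g = gradients[i];
            let v1 = (1.0 - rho) * g * g;
            let expected = params[i] - lr * g / (v1.sqrt() + eps);
            assert!((updated[i] - expected).abs() < 1e-10);
        }
    }
}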