1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
use std::f64;
/// A library-grade implementation of the AdaGrad optimizer.
///
/// AdaGrad keeps a running sum of squared gradients for every parameter and
/// divides the base learning rate by the square root of that sum, so a
/// parameter that has accumulated large gradients takes smaller steps than one
/// that has accumulated small gradients.
#[derive(Debug, Clone)]
pub struct AdaGrad {
    /// The base learning rate.
    pub learning_rate: f64,
    /// A small constant added to the denominator for numerical stability.
    pub epsilon: f64,
    /// Accumulated squared gradients, one entry per parameter.
    pub accumulated_grad_sq: Vec<f64>,
}

impl AdaGrad {
    /// Creates a new AdaGrad optimizer instance with all accumulators at zero.
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - The base learning rate.
    /// * `epsilon` - A small constant for numerical stability.
    /// * `param_size` - The number of parameters to optimize.
    ///
    /// # Example
    ///
    /// ```
    /// use algos::ml::deep::adagrad::AdaGrad;
    /// let optimizer = AdaGrad::new(0.01, 1e-8, 10);
    /// ```
    pub fn new(learning_rate: f64, epsilon: f64, param_size: usize) -> Self {
        Self {
            learning_rate,
            epsilon,
            accumulated_grad_sq: vec![0.0; param_size],
        }
    }

    /// Updates the parameters in place using the AdaGrad optimization rule.
    ///
    /// For every parameter the update is:
    ///
    /// ```text
    /// acc_grad_sq += grad^2
    /// param       -= learning_rate * grad / (sqrt(acc_grad_sq) + epsilon)
    /// ```
    ///
    /// Note that `epsilon` is added *outside* the square root.
    ///
    /// A gradient of exactly zero leaves both the parameter and its
    /// accumulator untouched. Besides being the mathematically expected
    /// result, this avoids the `(learning_rate / 0) * 0 = NaN` that would
    /// otherwise occur when `epsilon` is zero and nothing has been
    /// accumulated for that parameter yet.
    ///
    /// # Arguments
    ///
    /// * `params` - Mutable slice of parameters to be updated.
    /// * `grads` - Slice of gradients corresponding to each parameter.
    ///
    /// # Panics
    ///
    /// Panics if `params` and `grads` differ in length, or if their length
    /// differs from the `param_size` the optimizer was created with.
    pub fn update(&mut self, params: &mut [f64], grads: &[f64]) {
        assert_eq!(
            params.len(),
            grads.len(),
            "Parameters and gradients must be the same length"
        );
        assert_eq!(
            params.len(),
            self.accumulated_grad_sq.len(),
            "Parameter size mismatch with initialization"
        );

        let learning_rate = self.learning_rate;
        let epsilon = self.epsilon;

        // Walk parameters, gradients and accumulators in lockstep; the length
        // checks above guarantee that no element is silently dropped by `zip`.
        let per_param = params
            .iter_mut()
            .zip(grads)
            .zip(self.accumulated_grad_sq.iter_mut());

        for ((param, &grad), acc) in per_param {
            // A zero gradient contributes nothing to the accumulator and must
            // not move the parameter; skipping also sidesteps 0/0 when
            // `epsilon == 0.0` and `*acc == 0.0`.
            if grad == 0.0 {
                continue;
            }

            // Accumulate the squared gradient first so the current gradient
            // already dampens its own step.
            *acc += grad * grad;

            let adaptive_lr = learning_rate / (acc.sqrt() + epsilon);
            *param -= adaptive_lr * grad;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests that a fresh optimizer has one zeroed accumulator per parameter.
    #[test]
    fn test_adagrad_initialization() {
        let optimizer = AdaGrad::new(0.01, 1e-8, 5);
        assert_eq!(optimizer.accumulated_grad_sq.len(), 5);
        assert!(optimizer.accumulated_grad_sq.iter().all(|&x| x == 0.0));
    }

    /// Tests a single update step against the closed-form expected values.
    #[test]
    fn test_adagrad_single_update() {
        let mut optimizer = AdaGrad::new(0.1, 1e-8, 1);
        let mut params = [1.0_f64];
        let grads = [0.5_f64];
        optimizer.update(&mut params, &grads);

        // With positive gradient, parameter should decrease
        assert!(params[0] < 1.0);
        // The accumulator holds grad^2 (0.5 * 0.5 is exact in binary).
        assert_eq!(optimizer.accumulated_grad_sq[0], 0.25);
        // step = lr * g / (sqrt(g^2) + eps) = 0.05 / (0.5 + 1e-8)
        let expected = 1.0 - 0.1 * 0.5 / (0.5 + 1e-8);
        assert!((params[0] - expected).abs() < 1e-12);
    }

    /// Tests that mismatched parameter and gradient lengths cause a panic.
    #[test]
    #[should_panic(expected = "Parameters and gradients must be the same length")]
    fn test_adagrad_mismatched_lengths() {
        let mut optimizer = AdaGrad::new(0.1, 1e-8, 2);
        let mut params = [1.0_f64];
        let grads = [0.5_f64, 0.3];
        optimizer.update(&mut params, &grads);
    }

    /// Tests that parameter size mismatch with initialization causes a panic.
    #[test]
    #[should_panic(expected = "Parameter size mismatch with initialization")]
    fn test_adagrad_size_mismatch() {
        let mut optimizer = AdaGrad::new(0.1, 1e-8, 1);
        let mut params = [1.0_f64, 2.0];
        let grads = [0.5_f64, 0.3];
        optimizer.update(&mut params, &grads);
    }

    /// Tests that the adaptive learning rate normalizes away the gradient scale.
    #[test]
    fn test_adagrad_multiple_updates() {
        let mut optimizer = AdaGrad::new(0.1, 1e-8, 2);
        let mut params = [0.0_f64, 0.0];
        let grads = [1.0_f64, 0.1];

        // Perform multiple updates
        for _ in 0..100 {
            optimizer.update(&mut params, &grads);
        }

        // Both parameters move against their (positive) gradients.
        assert!(params[0] < 0.0);
        assert!(params[1] < 0.0);

        // With a constant gradient g, step t moves a parameter by
        // lr / (sqrt(t) + eps / |g|), which is almost independent of |g|.
        // So although the gradients differ by 10x, the total distance
        // travelled is nearly identical for both parameters.
        let change_ratio = params[0].abs() / params[1].abs();
        assert!(change_ratio < 10.0);
        assert!((change_ratio - 1.0).abs() < 1e-6);
    }

    /// Tests that the step size shrinks as squared gradients accumulate.
    #[test]
    fn test_adagrad_step_size_decays() {
        let mut optimizer = AdaGrad::new(0.1, 1e-8, 1);
        let mut params = [0.0_f64];
        let grads = [1.0_f64];

        optimizer.update(&mut params, &grads);
        let first_step = params[0].abs();

        let before_second = params[0];
        optimizer.update(&mut params, &grads);
        let second_step = (params[0] - before_second).abs();

        assert!(second_step < first_step);
    }

    /// Tests that updates still work with very small gradients.
    #[test]
    fn test_adagrad_small_gradients() {
        let mut optimizer = AdaGrad::new(0.1, 1e-8, 1);
        let mut params = [1.0_f64];
        let grads = [1e-6_f64];
        optimizer.update(&mut params, &grads);

        assert!(params[0] != 1.0); // Should still update despite small gradient
        // step = lr * g / (|g| + eps) = 0.1 / 1.01, barely reduced by epsilon.
        let expected = 1.0 - 0.1 / 1.01;
        assert!((params[0] - expected).abs() < 1e-9);
    }
}