// Path: advanced_algorithms/optimization/gradient_descent.rs

1//! Gradient Descent Optimization
2//!
3//! Gradient descent is a first-order iterative optimization algorithm for finding
4//! local minima of differentiable functions. It's the foundation of training neural
5//! networks and machine learning models.
6//!
7//! # Examples
8//!
9//! ```
10//! use advanced_algorithms::optimization::gradient_descent::{GradientDescent, LearningRate};
11//!
12//! // Minimize f(x) = x^2 + 2x + 1
13//! let f = |x: &[f64]| x[0] * x[0] + 2.0 * x[0] + 1.0;
14//! let grad_f = |x: &[f64]| vec![2.0 * x[0] + 2.0];
15//!
16//! let gd = GradientDescent::new()
17//!     .with_learning_rate(LearningRate::Constant(0.1))
18//!     .with_max_iterations(1000);
19//!
20//! let result = gd.minimize(f, grad_f, &[10.0]).unwrap();
21//! // Should converge to x ≈ -1
22//! ```
23
24use crate::Result;
25
/// Learning rate strategy
#[derive(Debug, Clone)]
pub enum LearningRate {
    /// Constant learning rate
    Constant(f64),
    /// Decreasing: initial_rate / (1 + decay * iteration)
    Decreasing { initial: f64, decay: f64 },
    /// Exponential decay: initial_rate * (decay ^ iteration)
    Exponential { initial: f64, decay: f64 },
    /// Adaptive (AdaGrad-like): adapts per parameter
    Adaptive { initial: f64, epsilon: f64 },
}

impl LearningRate {
    /// Returns the step size for the given 0-based iteration.
    ///
    /// `Adaptive` yields only its base rate here; the per-parameter
    /// scaling happens inside the optimizer's update loop.
    fn get_rate(&self, iteration: usize) -> f64 {
        let t = iteration as f64;
        match self {
            Self::Constant(rate) => *rate,
            Self::Decreasing { initial, decay } => initial / (1.0 + decay * t),
            Self::Exponential { initial, decay } => initial * decay.powi(iteration as i32),
            Self::Adaptive { initial, .. } => *initial,
        }
    }
}
53
54/// Gradient descent optimizer configuration
55pub struct GradientDescent {
56    learning_rate: LearningRate,
57    max_iterations: usize,
58    tolerance: f64,
59    momentum: f64,
60    verbose: bool,
61}
62
63impl Default for GradientDescent {
64    fn default() -> Self {
65        GradientDescent {
66            learning_rate: LearningRate::Constant(0.01),
67            max_iterations: 1000,
68            tolerance: 1e-6,
69            momentum: 0.0,
70            verbose: false,
71        }
72    }
73}
74
75impl GradientDescent {
76    /// Creates a new gradient descent optimizer with default settings
77    pub fn new() -> Self {
78        Self::default()
79    }
80    
81    /// Sets the learning rate strategy
82    pub fn with_learning_rate(mut self, lr: LearningRate) -> Self {
83        self.learning_rate = lr;
84        self
85    }
86    
87    /// Sets the maximum number of iterations
88    pub fn with_max_iterations(mut self, max_iter: usize) -> Self {
89        self.max_iterations = max_iter;
90        self
91    }
92    
93    /// Sets the convergence tolerance
94    pub fn with_tolerance(mut self, tol: f64) -> Self {
95        self.tolerance = tol;
96        self
97    }
98    
99    /// Sets the momentum coefficient (0.0 = no momentum)
100    pub fn with_momentum(mut self, momentum: f64) -> Self {
101        self.momentum = momentum;
102        self
103    }
104    
105    /// Enables verbose output
106    pub fn verbose(mut self) -> Self {
107        self.verbose = true;
108        self
109    }
110    
111    /// Minimizes the given function
112    ///
113    /// # Arguments
114    ///
115    /// * `f` - The objective function to minimize
116    /// * `grad_f` - The gradient of the objective function
117    /// * `initial` - Initial parameter values
118    ///
119    /// # Returns
120    ///
121    /// Optimization result containing the final parameters
122    pub fn minimize<F, G>(
123        &self,
124        f: F,
125        grad_f: G,
126        initial: &[f64],
127    ) -> Result<OptimizationResult>
128    where
129        F: Fn(&[f64]) -> f64,
130        G: Fn(&[f64]) -> Vec<f64>,
131    {
132        let mut x = initial.to_vec();
133        let mut velocity = vec![0.0; x.len()];
134        let mut accumulated_squared_gradients = vec![0.0; x.len()];
135        
136        let mut history = Vec::new();
137        
138        for iteration in 0..self.max_iterations {
139            let current_value = f(&x);
140            let gradient = grad_f(&x);
141            
142            if self.verbose {
143                history.push((x.clone(), current_value));
144            }
145            
146            // Check gradient norm for convergence
147            let grad_norm: f64 = gradient.iter().map(|g| g * g).sum::<f64>().sqrt();
148            
149            if grad_norm < self.tolerance {
150                return Ok(OptimizationResult {
151                    parameters: x,
152                    value: current_value,
153                    iterations: iteration,
154                    gradient_norm: grad_norm,
155                    converged: true,
156                    history,
157                });
158            }
159            
160            // Update parameters based on learning rate strategy
161            let lr = self.learning_rate.get_rate(iteration);
162            
163            match &self.learning_rate {
164                LearningRate::Adaptive { epsilon, .. } => {
165                    // AdaGrad-style update
166                    for i in 0..x.len() {
167                        accumulated_squared_gradients[i] += gradient[i] * gradient[i];
168                        let adjusted_lr = lr / (accumulated_squared_gradients[i].sqrt() + epsilon);
169                        
170                        velocity[i] = self.momentum * velocity[i] - adjusted_lr * gradient[i];
171                        x[i] += velocity[i];
172                    }
173                }
174                _ => {
175                    // Standard update with momentum
176                    for i in 0..x.len() {
177                        velocity[i] = self.momentum * velocity[i] - lr * gradient[i];
178                        x[i] += velocity[i];
179                    }
180                }
181            }
182        }
183        
184        let final_value = f(&x);
185        let final_gradient = grad_f(&x);
186        let grad_norm: f64 = final_gradient.iter().map(|g| g * g).sum::<f64>().sqrt();
187        
188        Ok(OptimizationResult {
189            parameters: x,
190            value: final_value,
191            iterations: self.max_iterations,
192            gradient_norm: grad_norm,
193            converged: grad_norm < self.tolerance,
194            history,
195        })
196    }
197}
198
/// Result of optimization
///
/// Returned by both the batch and the stochastic optimizers.
#[derive(Debug, Clone)]
pub struct OptimizationResult {
    /// Final parameter values
    pub parameters: Vec<f64>,
    /// Objective function value at the final parameters
    pub value: f64,
    /// Number of update iterations performed
    pub iterations: usize,
    /// L2 norm of the gradient at the final parameters
    pub gradient_norm: f64,
    /// True if the gradient norm dropped below the tolerance
    pub converged: bool,
    /// Per-iteration (parameters, value) trace; empty unless verbose mode was on
    pub history: Vec<(Vec<f64>, f64)>,
}
215
216/// Stochastic Gradient Descent for batch optimization
217pub struct StochasticGD {
218    learning_rate: LearningRate,
219    max_epochs: usize,
220    batch_size: usize,
221    tolerance: f64,
222    momentum: f64,
223}
224
225impl StochasticGD {
226    /// Creates a new SGD optimizer
227    pub fn new(batch_size: usize) -> Self {
228        StochasticGD {
229            learning_rate: LearningRate::Decreasing {
230                initial: 0.01,
231                decay: 0.001,
232            },
233            max_epochs: 100,
234            batch_size,
235            tolerance: 1e-6,
236            momentum: 0.9,
237        }
238    }
239    
240    /// Minimizes using mini-batch gradient descent
241    ///
242    /// # Arguments
243    ///
244    /// * `grad_f` - Gradient function that takes parameters and data indices
245    /// * `initial` - Initial parameter values
246    /// * `n_samples` - Total number of training samples
247    pub fn minimize<G>(
248        &self,
249        grad_f: G,
250        initial: &[f64],
251        n_samples: usize,
252    ) -> Result<OptimizationResult>
253    where
254        G: Fn(&[f64], &[usize]) -> Vec<f64> + Sync,
255    {
256        let mut x = initial.to_vec();
257        let mut velocity = vec![0.0; x.len()];
258        
259        let mut total_iterations = 0;
260        
261        for _epoch in 0..self.max_epochs {
262            // Shuffle indices (simplified - using sequential batches here)
263            let n_batches = n_samples.div_ceil(self.batch_size);
264            
265            for batch_idx in 0..n_batches {
266                let start = batch_idx * self.batch_size;
267                let end = (start + self.batch_size).min(n_samples);
268                let batch_indices: Vec<usize> = (start..end).collect();
269                
270                let gradient = grad_f(&x, &batch_indices);
271                
272                let lr = self.learning_rate.get_rate(total_iterations);
273                
274                for i in 0..x.len() {
275                    velocity[i] = self.momentum * velocity[i] - lr * gradient[i];
276                    x[i] += velocity[i];
277                }
278                
279                total_iterations += 1;
280            }
281            
282            // Check convergence (simplified - would need objective function)
283            let grad_norm: f64 = velocity.iter().map(|v| v * v).sum::<f64>().sqrt();
284            
285            if grad_norm < self.tolerance {
286                return Ok(OptimizationResult {
287                    parameters: x,
288                    value: 0.0,
289                    iterations: total_iterations,
290                    gradient_norm: grad_norm,
291                    converged: true,
292                    history: Vec::new(),
293                });
294            }
295        }
296        
297        Ok(OptimizationResult {
298            parameters: x,
299            value: 0.0,
300            iterations: total_iterations,
301            gradient_norm: 0.0,
302            converged: false,
303            history: Vec::new(),
304        })
305    }
306}
307
/// Numerical gradient computation (for testing)
///
/// Approximates the gradient of `f` at `x` component-by-component with
/// the central difference `(f(x + e_i * eps) - f(x - e_i * eps)) / (2 * eps)`.
pub fn numerical_gradient<F>(f: F, x: &[f64], epsilon: f64) -> Vec<f64>
where
    F: Fn(&[f64]) -> f64,
{
    (0..x.len())
        .map(|i| {
            // Perturb only coordinate `i` in each direction.
            let mut forward = x.to_vec();
            let mut backward = x.to_vec();
            forward[i] += epsilon;
            backward[i] -= epsilon;
            (f(&forward) - f(&backward)) / (2.0 * epsilon)
        })
        .collect()
}
328
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_quadratic() {
        // f(x) = (x - 3)^2 has its unique minimum at x = 3.
        let objective = |p: &[f64]| (p[0] - 3.0) * (p[0] - 3.0);
        let derivative = |p: &[f64]| vec![2.0 * (p[0] - 3.0)];

        let optimizer = GradientDescent::new()
            .with_learning_rate(LearningRate::Constant(0.1))
            .with_max_iterations(1000);

        let outcome = optimizer.minimize(objective, derivative, &[0.0]).unwrap();

        assert!(outcome.converged);
        assert!((outcome.parameters[0] - 3.0).abs() < 0.01);
    }

    #[test]
    fn test_rosenbrock() {
        // Rosenbrock: f(x,y) = (1-x)^2 + 100(y-x^2)^2, minimum at (1, 1).
        let objective = |p: &[f64]| {
            let u = 1.0 - p[0];
            let w = p[1] - p[0] * p[0];
            u * u + 100.0 * w * w
        };

        let derivative = |p: &[f64]| {
            vec![
                -2.0 * (1.0 - p[0]) - 400.0 * p[0] * (p[1] - p[0] * p[0]),
                200.0 * (p[1] - p[0] * p[0]),
            ]
        };

        let optimizer = GradientDescent::new()
            .with_learning_rate(LearningRate::Constant(0.001))
            .with_momentum(0.9)
            .with_max_iterations(10000);

        let outcome = optimizer.minimize(objective, derivative, &[0.0, 0.0]).unwrap();

        // Should land close to the global minimum at (1, 1).
        assert!(outcome.value < 0.1);
    }

    #[test]
    fn test_numerical_gradient() {
        // f(x, y) = x^2 + 2y^2 has gradient (2x, 4y).
        let objective = |p: &[f64]| p[0] * p[0] + 2.0 * p[1] * p[1];

        let estimate = numerical_gradient(objective, &[1.0, 2.0], 1e-5);
        let exact = [2.0 * 1.0, 4.0 * 2.0];

        for (approx, truth) in estimate.iter().zip(exact.iter()) {
            assert!((approx - truth).abs() < 1e-4);
        }
    }
}