// optirs_core/optimizers/lbfgs.rs

// L-BFGS optimizer implementation
//
// Based on the Limited-memory Broyden-Fletcher-Goldfarb-Shanno algorithm.

5use scirs2_core::ndarray::{Array, Array1, Dimension, ScalarOperand};
6use scirs2_core::numeric::Float;
7use std::collections::VecDeque;
8use std::fmt::Debug;
9
10use crate::error::Result;
11use crate::optimizers::Optimizer;
12
/// L-BFGS optimizer
///
/// Implements the Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) algorithm.
/// This is a quasi-Newton method that approximates the Hessian inverse using a limited
/// amount of memory by storing only a few vectors from previous iterations.
///
/// # Examples
///
/// ```no_run
/// use scirs2_core::ndarray::Array1;
/// use optirs_core::optimizers::{LBFGS, Optimizer};
///
/// // Initialize parameters and gradients
/// let params = Array1::zeros(5);
/// let gradients = Array1::from_vec(vec![0.1, 0.2, -0.3, 0.0, 0.5]);
///
/// // Create an L-BFGS optimizer
/// let mut optimizer = LBFGS::new(1.0);
///
/// // Update parameters
/// let new_params = optimizer.step(&params, &gradients).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct LBFGS<A: Float + ScalarOperand + Debug> {
    /// Learning rate used as the (fixed) step length along the search direction
    learning_rate: A,
    /// History size: maximum number of (y, s) curvature pairs retained
    history_size: usize,
    /// Tolerance for the gradient norm; `step` is a no-op once the norm falls below this
    tolerance_grad: A,
    /// Wolfe line search parameter c1 (Armijo condition)
    // dead_code: line search is not implemented yet; `step` uses a fixed step size.
    #[allow(dead_code)]
    c1: A,
    /// Wolfe line search parameter c2 (curvature condition)
    // dead_code: line search is not implemented yet; `step` uses a fixed step size.
    #[allow(dead_code)]
    c2: A,
    /// Maximum number of line search iterations
    // dead_code: line search is not implemented yet; `step` uses a fixed step size.
    #[allow(dead_code)]
    max_ls: usize,
    /// History of gradient differences (y = grad_new - grad_old), oldest first
    old_dirs: VecDeque<Array1<A>>,
    /// History of step vectors (s = params_new - params_old), oldest first
    old_stps: VecDeque<Array1<A>>,
    /// History of 1/(y·s) values, parallel to `old_dirs`/`old_stps`
    ro: VecDeque<A>,
    /// Gradient observed on the previous call to `step` (None before the first step)
    prev_grad: Option<Array1<A>>,
    /// Initial Hessian diagonal scaling gamma = (y·s)/(y·y)
    h_diag: A,
    /// Step counter (number of completed `step` calls since construction/reset)
    n_iter: usize,
    /// Scratch alpha values for the two-loop recursion (sized `history_size`)
    alpha: Vec<A>,
}
67
68impl<A: Float + ScalarOperand + Debug + Send + Sync> LBFGS<A> {
69    /// Creates a new L-BFGS optimizer with the given learning rate
70    ///
71    /// # Arguments
72    ///
73    /// * `learning_rate` - The learning rate for parameter updates
74    pub fn new(learning_rate: A) -> Self {
75        Self::new_with_config(
76            learning_rate,
77            100,                    // history_size
78            A::from(1e-7).unwrap(), // tolerance_grad
79            A::from(1e-4).unwrap(), // c1
80            A::from(0.9).unwrap(),  // c2
81            25,                     // max_ls
82        )
83    }
84
85    /// Creates a new L-BFGS optimizer with full configuration
86    ///
87    /// # Arguments
88    ///
89    /// * `learning_rate` - The learning rate for parameter updates
90    /// * `history_size` - Number of past gradients/steps to store (default: 100)
91    /// * `tolerance_grad` - Gradient norm tolerance for convergence (default: 1e-7)
92    /// * `c1` - Wolfe line search parameter for Armijo condition (default: 1e-4)
93    /// * `c2` - Wolfe line search parameter for curvature condition (default: 0.9)
94    /// * `max_ls` - Maximum line search iterations (default: 25)
95    pub fn new_with_config(
96        learning_rate: A,
97        history_size: usize,
98        tolerance_grad: A,
99        c1: A,
100        c2: A,
101        max_ls: usize,
102    ) -> Self {
103        Self {
104            learning_rate,
105            history_size,
106            tolerance_grad,
107            c1,
108            c2,
109            max_ls,
110            old_dirs: VecDeque::with_capacity(history_size),
111            old_stps: VecDeque::with_capacity(history_size),
112            ro: VecDeque::with_capacity(history_size),
113            prev_grad: None,
114            h_diag: A::one(),
115            n_iter: 0,
116            alpha: vec![A::zero(); history_size],
117        }
118    }
119
120    /// Gets the current learning rate
121    pub fn learning_rate(&self) -> A {
122        self.learning_rate
123    }
124
125    /// Sets the learning rate
126    pub fn set_lr(&mut self, lr: A) {
127        self.learning_rate = lr;
128    }
129
130    /// Resets the internal state of the optimizer
131    pub fn reset(&mut self) {
132        self.old_dirs.clear();
133        self.old_stps.clear();
134        self.ro.clear();
135        self.prev_grad = None;
136        self.h_diag = A::one();
137        self.n_iter = 0;
138        self.alpha.fill(A::zero());
139    }
140
141    /// Performs the two-loop recursion to compute the search direction
142    fn compute_direction(&mut self, gradient: &Array1<A>) -> Array1<A> {
143        // If first iteration, use negative gradient
144        if self.n_iter == 0 {
145            return gradient.mapv(|x| -x);
146        }
147
148        let num_old = self.old_dirs.len();
149
150        // First loop: compute alpha values and initial direction
151        let mut q = gradient.mapv(|x| -x);
152
153        for i in (0..num_old).rev() {
154            self.alpha[i] = self.old_stps[i].dot(&q) * self.ro[i];
155            q = &q - &self.old_dirs[i] * self.alpha[i];
156        }
157
158        // Scale by initial Hessian
159        let mut r = q * self.h_diag;
160
161        // Second loop: compute final direction
162        for i in 0..num_old {
163            let beta = self.old_dirs[i].dot(&r) * self.ro[i];
164            r = &r + &self.old_stps[i] * (self.alpha[i] - beta);
165        }
166
167        r
168    }
169
170    /// Updates the history with new gradient and step information
171    fn update_history(&mut self, y: Array1<A>, s: Array1<A>) {
172        let ys = y.dot(&s);
173
174        // Only update if y·s is positive (ensures positive definiteness)
175        if ys > A::from(1e-10).unwrap() {
176            // Remove oldest entries if at capacity
177            if self.old_dirs.len() >= self.history_size {
178                self.old_dirs.pop_front();
179                self.old_stps.pop_front();
180                self.ro.pop_front();
181            }
182
183            // Add new entries
184            self.old_dirs.push_back(y.clone());
185            self.old_stps.push_back(s);
186            self.ro.push_back(A::one() / ys);
187
188            // Update initial Hessian approximation
189            let yy = y.dot(&y);
190            if yy > A::zero() {
191                self.h_diag = ys / yy;
192            }
193        }
194    }
195}
196
197impl<A, D> Optimizer<A, D> for LBFGS<A>
198where
199    A: Float + ScalarOperand + Debug + Send + Sync,
200    D: Dimension,
201{
202    fn step(&mut self, params: &Array<A, D>, gradients: &Array<A, D>) -> Result<Array<A, D>> {
203        // Convert to 1D for computation
204        let params_flat = params
205            .to_owned()
206            .into_shape_with_order(params.len())
207            .unwrap();
208        let gradients_flat = gradients
209            .to_owned()
210            .into_shape_with_order(gradients.len())
211            .unwrap();
212
213        // Check convergence
214        let grad_norm = gradients_flat.dot(&gradients_flat).sqrt();
215        if grad_norm <= self.tolerance_grad {
216            return Ok(params.clone());
217        }
218
219        // Update history if we have previous gradient (before computing new direction)
220        if let Some(prev_grad) = self.prev_grad.clone() {
221            let y = &gradients_flat - &prev_grad;
222
223            // Only update if we've moved in parameter space
224            if self.n_iter > 0 {
225                // Compute actual step taken in the previous iteration
226                let direction = self.compute_direction(&prev_grad);
227                let step_size = if self.n_iter == 1 {
228                    self.learning_rate / (A::one() + grad_norm)
229                } else {
230                    self.learning_rate
231                };
232                let s = direction * step_size;
233                self.update_history(y, s);
234            }
235        }
236
237        // Compute search direction using current gradient
238        let direction = self.compute_direction(&gradients_flat);
239
240        // Simple line search with fixed step size (no backtracking for now)
241        let step_size = if self.n_iter == 0 {
242            // First iteration: use smaller step
243            self.learning_rate / (A::one() + grad_norm)
244        } else {
245            self.learning_rate
246        };
247
248        // Update parameters
249        let new_params_flat = &params_flat + &(&direction * step_size);
250
251        // Store current gradient for next iteration
252        self.prev_grad = Some(gradients_flat.clone());
253        self.n_iter += 1;
254
255        // Reshape back to original dimensions
256        let new_params = new_params_flat
257            .into_shape_with_order(params.raw_dim())
258            .unwrap();
259
260        Ok(new_params)
261    }
262
263    fn get_learning_rate(&self) -> A {
264        self.learning_rate
265    }
266
267    fn set_learning_rate(&mut self, learning_rate: A) {
268        self.learning_rate = learning_rate;
269    }
270}
271
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::Array1;

    /// Default construction exposes the documented hyperparameters.
    #[test]
    fn test_lbfgs_basic_creation() {
        let opt: LBFGS<f64> = LBFGS::new(1.0);
        assert_abs_diff_eq!(opt.learning_rate(), 1.0);
        assert_eq!(opt.history_size, 100);
        assert_abs_diff_eq!(opt.tolerance_grad, 1e-7);
    }

    /// Minimizing f(x) = x^2 from x = 10 should approach the origin.
    #[test]
    fn test_lbfgs_convergence() {
        let mut opt: LBFGS<f64> = LBFGS::new(0.1);
        let mut x = Array1::from_vec(vec![10.0]);

        for _ in 0..50 {
            let grad = Array1::from_vec(vec![2.0 * x[0]]);
            x = opt.step(&x, &grad).unwrap();
        }

        assert!(x[0].abs() < 0.1);
    }

    /// Minimizing f(x, y) = x^2 + y^2 should approach (0, 0).
    #[test]
    fn test_lbfgs_2d() {
        let mut opt: LBFGS<f64> = LBFGS::new(0.1);
        let mut x = Array1::from_vec(vec![5.0, 3.0]);

        for _ in 0..50 {
            let grad = Array1::from_vec(vec![2.0 * x[0], 2.0 * x[1]]);
            x = opt.step(&x, &grad).unwrap();
        }

        assert!(x[0].abs() < 0.1);
        assert!(x[1].abs() < 0.1);
    }

    /// `reset` must clear all accumulated state back to a fresh optimizer.
    #[test]
    fn test_lbfgs_reset() {
        let mut opt: LBFGS<f64> = LBFGS::new(0.1);

        // Three steps so the curvature history is actually populated
        // (the first step only seeds prev_grad).
        let mut x = Array1::from_vec(vec![1.0]);
        x = opt.step(&x, &Array1::from_vec(vec![2.0])).unwrap();
        x = opt.step(&x, &Array1::from_vec(vec![1.5])).unwrap();
        let _ = opt.step(&x, &Array1::from_vec(vec![1.0])).unwrap();

        // State should exist before the reset...
        assert!(!opt.old_dirs.is_empty());
        assert!(opt.n_iter > 0);

        opt.reset();

        // ...and be fully cleared afterwards.
        assert!(opt.old_dirs.is_empty());
        assert!(opt.old_stps.is_empty());
        assert!(opt.ro.is_empty());
        assert!(opt.prev_grad.is_none());
        assert_eq!(opt.n_iter, 0);
    }
}