cmaes_lbfgsb/
lbfgsb_optimize.rs

//! Enhanced L-BFGS-B optimizer with stable dot product, adaptive finite differences,
//! and Strong Wolfe line search for robust convergence.
3
/// Tuning parameters for L-BFGS-B optimization.
///
/// All fields have defaults (via [`Default`]) suitable for smooth,
/// moderately scaled problems; override individual fields with struct-update
/// syntax.
///
/// # Core parameters
///
/// - `memory_size`: memory vs. convergence-speed trade-off
/// - `obj_tol`, `step_size_tol`: convergence criteria
/// - `c1`, `c2`: Strong Wolfe line-search behavior
///
/// # Numerical parameters
///
/// - `fd_epsilon`, `fd_min_step`: finite-difference gradient accuracy
/// - `boundary_tol`: handling of active bound constraints
///
/// # Example
///
/// ```rust
/// use cmaes_lbfgsb::lbfgsb_optimize::LbfgsbConfig;
///
/// // Defaults are often sufficient.
/// let config = LbfgsbConfig::default();
///
/// // High-precision configuration.
/// let precise_config = LbfgsbConfig {
///     memory_size: 20,
///     obj_tol: 1e-12,
///     step_size_tol: 1e-12,
///     ..Default::default()
/// };
///
/// // More forgiving settings for noisy objectives.
/// let robust_config = LbfgsbConfig {
///     c1: 1e-3,
///     c2: 0.8,
///     fd_epsilon: 1e-6,
///     max_line_search_iters: 30,
///     ..Default::default()
/// };
/// ```
pub struct LbfgsbConfig {
    /// Number of past correction pairs kept for the limited-memory Hessian
    /// approximation.
    ///
    /// Default: 5. Typical range: 3-20. Larger values approximate the
    /// Hessian better (faster convergence, O(n) memory per stored vector);
    /// smaller values are cheaper and more robust on noisy or strongly
    /// non-quadratic functions. Use 5-10 for small problems, 10-20 for
    /// large smooth ones, 3-7 when the objective is noisy.
    pub memory_size: usize,

    /// Convergence tolerance on the relative objective improvement.
    ///
    /// Default: 1e-8. Typical range: 1e-12 to 1e-4. The algorithm stops
    /// when `|f_old - f_new| / max(|f_old|, |f_new|, 1.0) < obj_tol`.
    /// Use smaller values for high precision, larger values for fast
    /// approximate answers or to avoid premature termination on noisy
    /// functions.
    pub obj_tol: f64,

    /// Convergence tolerance on the step-size norm.
    ///
    /// Default: 1e-9. Typical range: 1e-12 to 1e-6. The algorithm stops
    /// when `||step|| < step_size_tol`, i.e. parameter changes have become
    /// negligible. Should usually be smaller than `obj_tol`; scale it with
    /// the magnitude of your parameters.
    pub step_size_tol: f64,

    /// First Wolfe parameter (sufficient decrease / Armijo condition).
    ///
    /// Default: 1e-4. Typical range: 1e-5 to 1e-2. Accepts a step only if
    /// `f(x + a*d) <= f(x) + c1*a*grad(x)'d`. Smaller values demand more
    /// decrease (shorter, more stable steps); larger values allow longer
    /// steps. Must satisfy 0 < c1 < c2 < 1.
    pub c1: f64,

    /// Second Wolfe parameter (curvature condition).
    ///
    /// Default: 0.9. Typical range: 0.1 to 0.9. Accepts a step only if
    /// `|grad(x + a*d)'d| <= c2*|grad(x)'d|`. Larger values permit longer
    /// steps with fewer line-search iterations; 0.9 suits quasi-Newton
    /// methods such as L-BFGS. Must satisfy 0 < c1 < c2 < 1.
    pub c2: f64,

    /// Base relative step for finite-difference gradient estimation.
    ///
    /// Default: 1e-8. Typical range: 1e-12 to 1e-4. The per-coordinate
    /// step scales with `|x_i|` and is floored at `fd_min_step`. Smaller
    /// values give more accurate gradients but risk cancellation; larger
    /// values are more robust to noise.
    pub fd_epsilon: f64,

    /// Minimum absolute finite-difference step.
    ///
    /// Default: 1e-12. Typical range: 1e-15 to 1e-8. Prevents the relative
    /// step from collapsing for parameters near zero, which would yield
    /// poor gradient estimates. Keep it well below your smallest meaningful
    /// parameter change.
    pub fd_min_step: f64,

    /// Initial step size for the line search.
    ///
    /// Default: 1.0. Typical range: 0.1 to 10.0. Since L-BFGS approximates
    /// Newton steps, 1.0 is usually optimal; ill-conditioned problems or
    /// large gradients may benefit from smaller starting steps.
    pub initial_step: f64,

    /// Maximum line-search iterations per optimization step.
    ///
    /// Default: 20. Typical range: 10-50. If the limit is reached, the
    /// best step found so far is taken. Smooth functions rarely need more
    /// than 20; difficult ones may need 30-50; use 5-10 when iteration
    /// time matters more than line-search accuracy.
    pub max_line_search_iters: usize,

    /// Tolerance for treating a parameter as sitting on a bound.
    ///
    /// Default: 1e-14. Typical range: 1e-16 to 1e-10. Gradient components
    /// that would push a parameter beyond an active bound are projected to
    /// zero when the parameter is within this distance of the bound. Keep
    /// it well below the expected precision of the solution.
    pub boundary_tol: f64,
}

impl Default for LbfgsbConfig {
    /// Defaults suitable for smooth problems with parameters of scale ~1.
    fn default() -> Self {
        LbfgsbConfig {
            memory_size: 5,
            obj_tol: 1e-8,
            step_size_tol: 1e-9,
            c1: 1e-4,
            c2: 0.9,
            fd_epsilon: 1e-8,
            fd_min_step: 1e-12,
            initial_step: 1.0,
            max_line_search_iters: 20,
            boundary_tol: 1e-14,
        }
    }
}
245
246/// L-BFGS-B optimizer implementation with optional configuration.
247///
248/// # Arguments
249/// * `x` - Initial point (will be modified in-place)
250/// * `bounds` - Box constraints for each parameter
251/// * `objective` - Objective function to minimize
252/// * `max_iterations` - Maximum number of iterations
253/// * `tol` - Convergence tolerance for gradient norm
254/// * `callback` - Optional callback function invoked after each iteration
255/// * `config` - Optional configuration parameters (uses defaults if None)
256///
257/// # Returns
258/// * `Result<(f64, Vec<f64>), Box<dyn std::error::Error>>` - Best objective value and parameters
259pub fn lbfgsb_optimize<F, C>(
260    x: &mut [f64],
261    bounds: &[(f64, f64)],
262    objective: &F,
263    max_iterations: usize,
264    tol: f64,
265    callback: Option<C>,
266    config: Option<LbfgsbConfig>,
267) -> Result<(f64, Vec<f64>), Box<dyn std::error::Error>>
268where
269    F: Fn(&[f64]) -> f64 + Sync,
270    C: Fn(&[f64], f64) + Sync,
271{
272    // Use provided config or default
273    let config = config.unwrap_or_default();
274    
275    let n = x.len();
276    if bounds.len() != n {
277        return Err("Bounds dimension does not match x dimension.".into());
278    }
279
280    let m = config.memory_size;
281    let mut s_store = vec![vec![0.0; n]; m];
282    let mut y_store = vec![vec![0.0; n]; m];
283    let mut rho_store = vec![0.0; m];
284    let mut alpha = vec![0.0; m];
285
286    let mut f_val = objective(x);
287    let mut grad = finite_difference_gradient(x, objective, config.fd_epsilon, config.fd_min_step);
288    project_gradient_bounds(x, &mut grad, bounds, config.boundary_tol);
289
290    let mut best_f = f_val;
291    let mut best_x = x.to_vec();
292
293    let mut old_f_val = f_val;
294    let mut k = 0;
295
296    for _iteration in 0..max_iterations {
297        let gnorm = grad.iter().map(|g| g.abs()).fold(0.0, f64::max);
298        if gnorm < tol {
299            if let Some(ref cb) = callback {
300                cb(x, f_val);
301            }
302            return Ok((f_val, x.to_vec()));
303        }
304
305        // L-BFGS two-loop recursion
306        let mut q = grad.clone();
307        let nm = if k < m { k } else { m };
308        for i_rev in 0..nm {
309            let i_hist = (m + k - 1 - i_rev) % m;
310            alpha[i_hist] = dot_stable(&s_store[i_hist], &q) * rho_store[i_hist];
311            axpy(-alpha[i_hist], &y_store[i_hist], &mut q);
312        }
313        if nm > 0 {
314            let i_last = (m + k - 1) % m;
315            let sy = dot_stable(&s_store[i_last], &y_store[i_last]);
316            let yy = dot_stable(&y_store[i_last], &y_store[i_last]);
317            if yy.abs() > config.boundary_tol {
318                let scale = sy / yy;
319                for qi in q.iter_mut() {
320                    *qi *= scale;
321                }
322            }
323        }
324        for i_fwd in 0..nm {
325            let i_hist = (k + i_fwd) % m;
326            let beta = dot_stable(&y_store[i_hist], &q) * rho_store[i_hist];
327            axpy(alpha[i_hist] - beta, &s_store[i_hist], &mut q);
328        }
329
330        // Descent direction
331        for d in q.iter_mut() {
332            *d = -*d;
333        }
334
335        // Clamp direction
336        let direction_clamped = clamp_direction(x, &q, bounds);
337
338        // Strong Wolfe line search
339        let step_size = strong_wolfe_line_search(
340            x,
341            f_val,
342            &grad,
343            &direction_clamped,
344            objective,
345            config.c1,
346            config.c2,
347            bounds,
348            config.initial_step,
349            config.max_line_search_iters,
350            config.boundary_tol,
351            config.fd_epsilon,
352            config.fd_min_step,
353        );
354
355        let old_x = x.to_vec();
356        for i in 0..n {
357            x[i] += step_size * direction_clamped[i];
358            // Enforce bounds
359            if x[i] < bounds[i].0 {
360                x[i] = bounds[i].0;
361            } else if x[i] > bounds[i].1 {
362                x[i] = bounds[i].1;
363            }
364        }
365
366        // Recompute objective + gradient
367        f_val = objective(x);
368        grad = finite_difference_gradient(x, objective, config.fd_epsilon, config.fd_min_step);
369        project_gradient_bounds(x, &mut grad, bounds, config.boundary_tol);
370
371        // Update best solution
372        if f_val < best_f {
373            best_f = f_val;
374            best_x = x.to_vec();
375        }
376
377        // BFGS update
378        let s_vec: Vec<f64> = x.iter().zip(old_x.iter()).map(|(xi, oi)| xi - oi).collect();
379        let mut y_vec = finite_difference_gradient(x, objective, config.fd_epsilon, config.fd_min_step);
380        let mut old_grad = finite_difference_gradient(&old_x, objective, config.fd_epsilon, config.fd_min_step);
381        project_gradient_bounds(&old_x, &mut old_grad, bounds, config.boundary_tol);
382        for i in 0..n {
383            y_vec[i] -= old_grad[i];
384        }
385        let sy = dot_stable(&s_vec, &y_vec);
386        if sy.abs() > config.boundary_tol {
387            let i_hist = k % m;
388            s_store[i_hist] = s_vec.clone();
389            y_store[i_hist] = y_vec;
390            rho_store[i_hist] = 1.0 / sy;
391            k += 1;
392        }
393
394        if let Some(ref cb) = callback {
395            cb(x, f_val);
396        }
397
398        // Early stopping checks
399        let obj_diff = (old_f_val - f_val).abs();
400        if obj_diff < config.obj_tol {
401            break;
402        }
403        old_f_val = f_val;
404
405        let step_norm = s_vec.iter().map(|si| si * si).sum::<f64>().sqrt();
406        if step_norm < config.step_size_tol {
407            break;
408        }
409    }
410
411    Ok((best_f, best_x))
412}
413
/// Projects the gradient to zero at the bounds.
///
/// A coordinate sitting (within `tol`) on its lower bound with a positive
/// gradient component, or on its upper bound with a negative one, would be
/// pushed out of the box by a descent step, so that component is zeroed.
fn project_gradient_bounds(x: &[f64], grad: &mut [f64], bounds: &[(f64, f64)], tol: f64) {
    for ((gi, &xi), &(lb, ub)) in grad.iter_mut().zip(x.iter()).zip(bounds.iter()) {
        let pushes_below = (xi - lb).abs() < tol && *gi > 0.0;
        let pushes_above = (xi - ub).abs() < tol && *gi < 0.0;
        if pushes_below || pushes_above {
            *gi = 0.0;
        }
    }
}
423
/// Computes the finite-difference gradient using central differences with an
/// adaptive, relative step size.
///
/// The step for coordinate `i` is `sqrt(epsilon) * max(|x[i]|, epsilon)`,
/// floored at `min_step`, balancing truncation error against cancellation.
///
/// If the objective is non-finite on one side of `x` (e.g. just outside a
/// feasible region), the estimate falls back to a one-sided difference
/// against `f(x)`; if both sides are non-finite the component is set to 0.0.
/// (The previous revision only handled a non-finite backward point, so a
/// non-finite forward point propagated NaN into the gradient.)
fn finite_difference_gradient<F: Fn(&[f64]) -> f64>(
    x: &[f64],
    f: &F,
    epsilon: f64,
    min_step: f64,
) -> Vec<f64> {
    let n = x.len();
    let mut grad = vec![0.0; n];
    let sqrt_epsilon = epsilon.sqrt();

    // Baseline value, needed for the one-sided fallbacks.
    let f0 = f(x);
    for i in 0..n {
        let step = (sqrt_epsilon * x[i].abs().max(epsilon)).max(min_step);

        let mut x_forward = x.to_vec();
        x_forward[i] += step;
        let f_forward = f(&x_forward);

        let mut x_backward = x.to_vec();
        x_backward[i] -= step;
        let f_backward = f(&x_backward);

        grad[i] = match (f_forward.is_finite(), f_backward.is_finite()) {
            // Second-order-accurate central difference.
            (true, true) => (f_forward - f_backward) / (2.0 * step),
            // One-sided fallbacks when the objective blew up on one side.
            (true, false) => (f_forward - f0) / step,
            (false, true) => (f0 - f_backward) / step,
            // No usable information; avoid injecting NaN into the optimizer.
            (false, false) => 0.0,
        };
    }
    grad
}
455
/// Clamps the search direction so the step does not leave the box bounds.
///
/// Each positive component is capped by the remaining gap to the upper
/// bound, each negative component by the gap to the lower bound, so that
/// `x + d` stays inside the box for a unit step.
fn clamp_direction(x: &[f64], d: &[f64], bounds: &[(f64, f64)]) -> Vec<f64> {
    let mut clamped = Vec::with_capacity(d.len());
    for ((&xi, &di), &(lb, ub)) in x.iter().zip(d.iter()).zip(bounds.iter()) {
        let component = if di > 0.0 {
            di.min(ub - xi)
        } else if di < 0.0 {
            di.max(lb - xi)
        } else {
            0.0
        };
        clamped.push(component);
    }
    clamped
}
472
/// Performs y = y + alpha * x (the BLAS "axpy" update), element-wise over
/// the common length of the two slices.
fn axpy(alpha: f64, x: &[f64], y: &mut [f64]) {
    y.iter_mut()
        .zip(x.iter())
        .for_each(|(yi, &xi)| *yi += alpha * xi);
}
479
/// Computes the dot product with Neumaier-compensated (Kahan–Babuška)
/// summation for numerical stability.
///
/// The previous compensation, `product - (t - sum)`, is only accurate when
/// the running sum dominates the incoming term; when a term is larger than
/// the sum, the lost low-order bits of the SUM are what must be recovered.
/// Neumaier's variant branches on which operand is larger, so cancellation
/// like `1 + 1e100 + 1 - 1e100` is recovered correctly (result 2, where the
/// old code returned 1).
fn dot_stable(a: &[f64], b: &[f64]) -> f64 {
    let mut sum = 0.0;
    let mut c = 0.0; // running compensation (accumulated rounding error)
    for (x, y) in a.iter().zip(b.iter()) {
        let product = x * y;
        let t = sum + product;
        if sum.abs() >= product.abs() {
            // Low-order bits of `product` were lost in the addition.
            c += (sum - t) + product;
        } else {
            // Low-order bits of `sum` were lost in the addition.
            c += (product - t) + sum;
        }
        sum = t;
    }
    sum + c
}
493
494/// Performs a Strong Wolfe line search and returns a suitable step size.
495#[allow(clippy::too_many_arguments)]
496fn strong_wolfe_line_search<F>(
497    x: &[f64],
498    f_val: f64,
499    grad: &[f64],
500    direction: &[f64],
501    objective: &F,
502    c1: f64,
503    c2: f64,
504    bounds: &[(f64, f64)],
505    initial_step: f64,
506    max_iter: usize,
507    tol: f64,
508    fd_epsilon: f64,
509    fd_min_step: f64,
510) -> f64
511where
512    F: Fn(&[f64]) -> f64 + Sync,
513{
514    let mut alpha_lo = 0.0;
515    let mut alpha_hi = initial_step;
516    let mut alpha = initial_step;
517
518    let grad_dot_dir = dot_stable(grad, direction);
519
520    let mut _f_lo = f_val;
521    let mut _x_lo = x.to_vec();
522
523    for _ in 0..max_iter {
524        // Evaluate objective at alpha
525        let trial: Vec<f64> = x.iter().zip(direction.iter())
526            .enumerate()
527            .map(|(idx, (&xi, &di))| {
528                let candidate = xi + alpha * di;
529                candidate.clamp(bounds[idx].0, bounds[idx].1)
530            })
531            .collect();
532
533        let f_new = objective(&trial);
534
535        // Armijo condition
536        if f_new > f_val + c1 * alpha * grad_dot_dir {
537            alpha_hi = alpha;
538        } else {
539            // Compute gradient at new point for curvature check
540            let grad_new = finite_difference_gradient(&trial, objective, fd_epsilon, fd_min_step);
541            let grad_new_dot_dir = dot_stable(&grad_new, direction);
542            // Strong Wolfe second condition
543            if grad_new_dot_dir.abs() <= -c2 * grad_dot_dir {
544                return alpha;
545            }
546            if grad_new_dot_dir >= 0.0 {
547                alpha_hi = alpha;
548            } else {
549                alpha_lo = alpha;
550                _f_lo = f_new;
551                _x_lo = trial;
552            }
553        }
554        if (alpha_hi - alpha_lo).abs() < tol {
555            break;
556        }
557        alpha = 0.5 * (alpha_lo + alpha_hi);
558    }
559
560    0.5 * (alpha_lo + alpha_hi).max(tol)
561}