// cmaes_lbfgsb/lbfgsb_optimize.rs
1//! Enhanced L-BFGS-B optimizer with stable dot product, adaptive finite differences,
2//! and Strong Wolfe line search for robust convergence.
3
/// Configuration parameters for L-BFGS-B optimization.
///
/// All fields have sensible defaults (see [`Default`]); override individual
/// fields with struct-update syntax.
///
/// The most commonly tuned parameters are `memory_size` (memory vs.
/// convergence-speed trade-off), `obj_tol`/`step_size_tol` (termination
/// criteria), and `c1`/`c2` (line-search behavior). `fd_epsilon`/`fd_min_step`
/// control finite-difference gradient accuracy, and `boundary_tol` controls
/// the handling of box constraints.
///
/// # Example
///
/// ```rust
/// use cmaes_lbfgsb::lbfgsb_optimize::LbfgsbConfig;
///
/// // Basic configuration (often sufficient)
/// let config = LbfgsbConfig::default();
///
/// // High-precision configuration
/// let precise_config = LbfgsbConfig {
///     memory_size: 20,
///     obj_tol: 1e-12,
///     step_size_tol: 1e-12,
///     ..Default::default()
/// };
///
/// // Configuration for noisy functions
/// let robust_config = LbfgsbConfig {
///     c1: 1e-3,
///     c2: 0.8,
///     fd_epsilon: 1e-6,
///     max_line_search_iters: 30,
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct LbfgsbConfig {
    /// Number of past `(s, y)` correction pairs stored for the limited-memory
    /// inverse-Hessian approximation.
    ///
    /// **Default**: 5. **Typical range**: 3-20.
    ///
    /// Larger values give a better Hessian approximation (often faster
    /// convergence) at O(n) extra memory per pair; smaller values are cheaper
    /// and more robust on noisy or strongly non-quadratic objectives.
    pub memory_size: usize,

    /// Termination tolerance on the absolute change in objective value
    /// between consecutive iterations: the search stops once
    /// `|f_old - f_new| < obj_tol`.
    ///
    /// **Default**: 1e-8. **Typical range**: 1e-12 to 1e-4.
    ///
    /// Use tighter values (1e-12 to 1e-10) for high precision, looser values
    /// (1e-4 to 1e-2) for quick approximate answers; prefer looser values for
    /// noisy objectives to avoid premature termination.
    pub obj_tol: f64,

    /// Termination tolerance on the Euclidean norm of the step taken in one
    /// iteration: the search stops once `||step|| < step_size_tol`.
    ///
    /// **Default**: 1e-9. **Typical range**: 1e-12 to 1e-6.
    ///
    /// Should usually be smaller than `obj_tol`; scale it with the typical
    /// magnitude of your parameters.
    pub step_size_tol: f64,

    /// Armijo (sufficient-decrease) parameter of the Wolfe conditions: a step
    /// `α` is acceptable only if `f(x + α·d) ≤ f(x) + c1·α·∇f(x)ᵀd`.
    ///
    /// **Default**: 1e-4. **Typical range**: 1e-5 to 1e-2.
    ///
    /// Smaller values demand more decrease per step (shorter, more stable
    /// steps); larger values allow longer steps and faster progress.
    ///
    /// **Must satisfy**: 0 < c1 < c2 < 1.
    pub c1: f64,

    /// Curvature parameter of the strong Wolfe conditions:
    /// `|∇f(x + α·d)ᵀd| ≤ c2·|∇f(x)ᵀd|`.
    ///
    /// **Default**: 0.9. **Typical range**: 0.1 to 0.9.
    ///
    /// The default 0.9 is the usual choice for (L-)BFGS, permitting long
    /// Newton-like steps; smaller values force a more careful line search.
    ///
    /// **Must satisfy**: 0 < c1 < c2 < 1.
    pub c2: f64,

    /// Base relative step for finite-difference gradient estimation; the
    /// per-coordinate step is derived from this value and floored by
    /// `fd_min_step`.
    ///
    /// **Default**: 1e-8. **Typical range**: 1e-12 to 1e-4.
    ///
    /// Smaller values give more accurate gradients on smooth functions but
    /// risk numerical cancellation; use larger values (1e-6 to 1e-4) for
    /// noisy objectives.
    pub fd_epsilon: f64,

    /// Absolute floor on the finite-difference step, protecting parameters
    /// near zero from degenerate (too small) perturbations.
    ///
    /// **Default**: 1e-12. **Typical range**: 1e-15 to 1e-8.
    ///
    /// Should stay well below your smallest meaningful parameter change; too
    /// small risks precision issues, too large degrades gradient estimates
    /// for small parameters.
    pub fd_min_step: f64,

    /// Step size the line search starts from.
    ///
    /// **Default**: 1.0. **Typical range**: 0.1 to 10.0.
    ///
    /// 1.0 is usually right for L-BFGS (quasi-Newton steps are near unit
    /// length); consider smaller values (0.1-0.5) for ill-conditioned
    /// problems or very large gradients.
    pub initial_step: f64,

    /// Maximum number of line-search iterations per outer iteration; if
    /// exhausted, the best step found so far is used.
    ///
    /// **Default**: 20. **Typical range**: 10-50.
    ///
    /// More iterations give a more accurate line search at a higher per-step
    /// cost; difficult functions may need 30-50, time-critical uses 5-10.
    pub max_line_search_iters: usize,

    /// Tolerance used to decide whether a parameter sits exactly on a bound;
    /// gradient components that would push such a parameter out of the box
    /// are projected to zero.
    ///
    /// **Default**: 1e-14. **Typical range**: 1e-16 to 1e-10.
    ///
    /// Keep it well below the expected solution precision: too small and
    /// parameters are never treated as "at" a bound, too large and gradients
    /// near (but not on) a bound are wrongly zeroed.
    pub boundary_tol: f64,
}

impl Default for LbfgsbConfig {
    /// Defaults suitable for most smooth, moderately scaled problems.
    fn default() -> Self {
        Self {
            memory_size: 5,
            obj_tol: 1e-8,
            step_size_tol: 1e-9,
            c1: 1e-4,
            c2: 0.9,
            fd_epsilon: 1e-8,
            fd_min_step: 1e-12,
            initial_step: 1.0,
            max_line_search_iters: 20,
            boundary_tol: 1e-14,
        }
    }
}
245
246/// L-BFGS-B optimizer implementation with optional configuration.
247///
248/// # Arguments
249/// * `x` - Initial point (will be modified in-place)
250/// * `bounds` - Box constraints for each parameter
251/// * `objective` - Objective function to minimize
252/// * `max_iterations` - Maximum number of iterations
253/// * `tol` - Convergence tolerance for gradient norm
254/// * `callback` - Optional callback function invoked after each iteration
255/// * `config` - Optional configuration parameters (uses defaults if None)
256///
257/// # Returns
258/// * `Result<(f64, Vec<f64>), Box<dyn std::error::Error>>` - Best objective value and parameters
259pub fn lbfgsb_optimize<F, C>(
260 x: &mut [f64],
261 bounds: &[(f64, f64)],
262 objective: &F,
263 max_iterations: usize,
264 tol: f64,
265 callback: Option<C>,
266 config: Option<LbfgsbConfig>,
267) -> Result<(f64, Vec<f64>), Box<dyn std::error::Error>>
268where
269 F: Fn(&[f64]) -> f64 + Sync,
270 C: Fn(&[f64], f64) + Sync,
271{
272 // Use provided config or default
273 let config = config.unwrap_or_default();
274
275 let n = x.len();
276 if bounds.len() != n {
277 return Err("Bounds dimension does not match x dimension.".into());
278 }
279
280 let m = config.memory_size;
281 let mut s_store = vec![vec![0.0; n]; m];
282 let mut y_store = vec![vec![0.0; n]; m];
283 let mut rho_store = vec![0.0; m];
284 let mut alpha = vec![0.0; m];
285
286 let mut f_val = objective(x);
287 let mut grad = finite_difference_gradient(x, objective, config.fd_epsilon, config.fd_min_step);
288 project_gradient_bounds(x, &mut grad, bounds, config.boundary_tol);
289
290 let mut best_f = f_val;
291 let mut best_x = x.to_vec();
292
293 let mut old_f_val = f_val;
294 let mut k = 0;
295
296 for _iteration in 0..max_iterations {
297 let gnorm = grad.iter().map(|g| g.abs()).fold(0.0, f64::max);
298 if gnorm < tol {
299 if let Some(ref cb) = callback {
300 cb(x, f_val);
301 }
302 return Ok((f_val, x.to_vec()));
303 }
304
305 // L-BFGS two-loop recursion
306 let mut q = grad.clone();
307 let nm = if k < m { k } else { m };
308 for i_rev in 0..nm {
309 let i_hist = (m + k - 1 - i_rev) % m;
310 alpha[i_hist] = dot_stable(&s_store[i_hist], &q) * rho_store[i_hist];
311 axpy(-alpha[i_hist], &y_store[i_hist], &mut q);
312 }
313 if nm > 0 {
314 let i_last = (m + k - 1) % m;
315 let sy = dot_stable(&s_store[i_last], &y_store[i_last]);
316 let yy = dot_stable(&y_store[i_last], &y_store[i_last]);
317 if yy.abs() > config.boundary_tol {
318 let scale = sy / yy;
319 for qi in q.iter_mut() {
320 *qi *= scale;
321 }
322 }
323 }
324 for i_fwd in 0..nm {
325 let i_hist = (k + i_fwd) % m;
326 let beta = dot_stable(&y_store[i_hist], &q) * rho_store[i_hist];
327 axpy(alpha[i_hist] - beta, &s_store[i_hist], &mut q);
328 }
329
330 // Descent direction
331 for d in q.iter_mut() {
332 *d = -*d;
333 }
334
335 // Clamp direction
336 let direction_clamped = clamp_direction(x, &q, bounds);
337
338 // Strong Wolfe line search
339 let step_size = strong_wolfe_line_search(
340 x,
341 f_val,
342 &grad,
343 &direction_clamped,
344 objective,
345 config.c1,
346 config.c2,
347 bounds,
348 config.initial_step,
349 config.max_line_search_iters,
350 config.boundary_tol,
351 config.fd_epsilon,
352 config.fd_min_step,
353 );
354
355 let old_x = x.to_vec();
356 for i in 0..n {
357 x[i] += step_size * direction_clamped[i];
358 // Enforce bounds
359 if x[i] < bounds[i].0 {
360 x[i] = bounds[i].0;
361 } else if x[i] > bounds[i].1 {
362 x[i] = bounds[i].1;
363 }
364 }
365
366 // Recompute objective + gradient
367 f_val = objective(x);
368 grad = finite_difference_gradient(x, objective, config.fd_epsilon, config.fd_min_step);
369 project_gradient_bounds(x, &mut grad, bounds, config.boundary_tol);
370
371 // Update best solution
372 if f_val < best_f {
373 best_f = f_val;
374 best_x = x.to_vec();
375 }
376
377 // BFGS update
378 let s_vec: Vec<f64> = x.iter().zip(old_x.iter()).map(|(xi, oi)| xi - oi).collect();
379 let mut y_vec = finite_difference_gradient(x, objective, config.fd_epsilon, config.fd_min_step);
380 let mut old_grad = finite_difference_gradient(&old_x, objective, config.fd_epsilon, config.fd_min_step);
381 project_gradient_bounds(&old_x, &mut old_grad, bounds, config.boundary_tol);
382 for i in 0..n {
383 y_vec[i] -= old_grad[i];
384 }
385 let sy = dot_stable(&s_vec, &y_vec);
386 if sy.abs() > config.boundary_tol {
387 let i_hist = k % m;
388 s_store[i_hist] = s_vec.clone();
389 y_store[i_hist] = y_vec;
390 rho_store[i_hist] = 1.0 / sy;
391 k += 1;
392 }
393
394 if let Some(ref cb) = callback {
395 cb(x, f_val);
396 }
397
398 // Early stopping checks
399 let obj_diff = (old_f_val - f_val).abs();
400 if obj_diff < config.obj_tol {
401 break;
402 }
403 old_f_val = f_val;
404
405 let step_norm = s_vec.iter().map(|si| si * si).sum::<f64>().sqrt();
406 if step_norm < config.step_size_tol {
407 break;
408 }
409 }
410
411 Ok((best_f, best_x))
412}
413
/// Zeroes gradient components that point out of the feasible box.
///
/// A coordinate sitting on its lower (resp. upper) bound within `tol` whose
/// gradient is positive (resp. negative) would be pushed outside the box by a
/// descent step `-grad`, so that component is projected to zero.
fn project_gradient_bounds(x: &[f64], grad: &mut [f64], bounds: &[(f64, f64)], tol: f64) {
    for ((&xi, gi), &(lb, ub)) in x.iter().zip(grad.iter_mut()).zip(bounds.iter()) {
        let pushes_below_lower = (xi - lb).abs() < tol && *gi > 0.0;
        let pushes_above_upper = (xi - ub).abs() < tol && *gi < 0.0;
        if pushes_below_lower || pushes_above_upper {
            *gi = 0.0;
        }
    }
}
423
/// Central-difference gradient with a per-coordinate adaptive step.
///
/// The step for coordinate `i` is `max(sqrt(epsilon) * max(|x_i|, epsilon), min_step)`,
/// giving relative scaling for large parameters and an absolute floor for
/// parameters near zero.
///
/// If a perturbed evaluation is non-finite (e.g. outside the objective's
/// domain), falls back to a one-sided difference using the finite side and
/// the base value `f(x)`; if both sides are non-finite, the component is
/// reported as zero slope.
fn finite_difference_gradient<F: Fn(&[f64]) -> f64>(
    x: &[f64],
    f: &F,
    epsilon: f64,
    min_step: f64,
) -> Vec<f64> {
    let n = x.len();
    let mut grad = vec![0.0; n];
    let sqrt_epsilon = epsilon.sqrt();

    // Base value, needed only for the one-sided fallbacks.
    let f0 = f(x);
    for i in 0..n {
        let step = sqrt_epsilon * x[i].abs().max(epsilon);
        let step = if step < min_step { min_step } else { step };

        let mut x_forward = x.to_vec();
        x_forward[i] += step;
        let f_forward = f(&x_forward);

        let mut x_backward = x.to_vec();
        x_backward[i] -= step;
        let f_backward = f(&x_backward);

        grad[i] = match (f_forward.is_finite(), f_backward.is_finite()) {
            (true, true) => (f_forward - f_backward) / (2.0 * step),
            (true, false) => (f_forward - f0) / step,
            // FIX: the original only fell back when the *backward* point was
            // non-finite; a non-finite forward point produced a NaN/inf
            // gradient component. Handle it symmetrically.
            (false, true) => (f0 - f_backward) / step,
            // No usable information on either side.
            (false, false) => 0.0,
        };
    }
    grad
}
455
/// Shrinks each component of a search direction so that the full step
/// `x + d` cannot leave the box `bounds`.
///
/// Positive components are capped at the distance to the upper bound,
/// negative components at the (negative) distance to the lower bound.
fn clamp_direction(x: &[f64], d: &[f64], bounds: &[(f64, f64)]) -> Vec<f64> {
    let mut clamped = Vec::with_capacity(d.len());
    for i in 0..x.len().min(d.len()).min(bounds.len()) {
        let (lb, ub) = bounds[i];
        let di = d[i];
        let component = if di > 0.0 {
            di.min(ub - x[i])
        } else if di < 0.0 {
            di.max(lb - x[i])
        } else {
            0.0
        };
        clamped.push(component);
    }
    clamped
}
472
/// In-place scaled accumulation: `y[i] += alpha * x[i]` for each component
/// (iteration stops at the shorter of the two slices).
fn axpy(alpha: f64, x: &[f64], y: &mut [f64]) {
    let len = x.len().min(y.len());
    for i in 0..len {
        y[i] += alpha * x[i];
    }
}
479
/// Compensated (Kahan-style) dot product.
///
/// The rounding error lost when each product is added to the running sum is
/// accumulated separately and folded back in at the end, keeping the result
/// accurate even when large terms cancel.
fn dot_stable(a: &[f64], b: &[f64]) -> f64 {
    let mut sum = 0.0;
    let mut compensation = 0.0;
    for (ai, bi) in a.iter().zip(b.iter()) {
        let product = ai * bi;
        let new_sum = sum + product;
        // Low-order bits of `product` that were lost in the addition above.
        compensation += product - (new_sum - sum);
        sum = new_sum;
    }
    sum + compensation
}
493
494/// Performs a Strong Wolfe line search and returns a suitable step size.
495#[allow(clippy::too_many_arguments)]
496fn strong_wolfe_line_search<F>(
497 x: &[f64],
498 f_val: f64,
499 grad: &[f64],
500 direction: &[f64],
501 objective: &F,
502 c1: f64,
503 c2: f64,
504 bounds: &[(f64, f64)],
505 initial_step: f64,
506 max_iter: usize,
507 tol: f64,
508 fd_epsilon: f64,
509 fd_min_step: f64,
510) -> f64
511where
512 F: Fn(&[f64]) -> f64 + Sync,
513{
514 let mut alpha_lo = 0.0;
515 let mut alpha_hi = initial_step;
516 let mut alpha = initial_step;
517
518 let grad_dot_dir = dot_stable(grad, direction);
519
520 let mut _f_lo = f_val;
521 let mut _x_lo = x.to_vec();
522
523 for _ in 0..max_iter {
524 // Evaluate objective at alpha
525 let trial: Vec<f64> = x.iter().zip(direction.iter())
526 .enumerate()
527 .map(|(idx, (&xi, &di))| {
528 let candidate = xi + alpha * di;
529 candidate.clamp(bounds[idx].0, bounds[idx].1)
530 })
531 .collect();
532
533 let f_new = objective(&trial);
534
535 // Armijo condition
536 if f_new > f_val + c1 * alpha * grad_dot_dir {
537 alpha_hi = alpha;
538 } else {
539 // Compute gradient at new point for curvature check
540 let grad_new = finite_difference_gradient(&trial, objective, fd_epsilon, fd_min_step);
541 let grad_new_dot_dir = dot_stable(&grad_new, direction);
542 // Strong Wolfe second condition
543 if grad_new_dot_dir.abs() <= -c2 * grad_dot_dir {
544 return alpha;
545 }
546 if grad_new_dot_dir >= 0.0 {
547 alpha_hi = alpha;
548 } else {
549 alpha_lo = alpha;
550 _f_lo = f_new;
551 _x_lo = trial;
552 }
553 }
554 if (alpha_hi - alpha_lo).abs() < tol {
555 break;
556 }
557 alpha = 0.5 * (alpha_lo + alpha_hi);
558 }
559
560 0.5 * (alpha_lo + alpha_hi).max(tol)
561}