liblbfgs/lbfgs.rs
1// [[file:../lbfgs.note::*header][header:1]]
2//! Limited memory BFGS (L-BFGS).
3//
4// Copyright (c) 1990, Jorge Nocedal
5// Copyright (c) 2007-2010 Naoaki Okazaki
6// Copyright (c) 2018-2019 Wenping Guo
7// All rights reserved.
8//
9// Permission is hereby granted, free of charge, to any person obtaining a copy
10// of this software and associated documentation files (the "Software"), to deal
11// in the Software without restriction, including without limitation the rights
12// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13// copies of the Software, and to permit persons to whom the Software is
14// furnished to do so, subject to the following conditions:
15//
16// The above copyright notice and this permission notice shall be included in
17// all copies or substantial portions of the Software.
18//
19// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25// THE SOFTWARE.
26//
27// This library is a C port of the FORTRAN implementation of Limited-memory
28// Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
29// The original FORTRAN source code is available at:
30// http://www.ece.northwestern.edu/~nocedal/lbfgs.html
31//
32// The L-BFGS algorithm is described in:
33// - Jorge Nocedal.
34// Updating Quasi-Newton Matrices with Limited Storage.
35// <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
36// - Dong C. Liu and Jorge Nocedal.
37// On the limited memory BFGS method for large scale optimization.
38// <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
39//
40// The line search algorithms used in this implementation are described in:
41// - John E. Dennis and Robert B. Schnabel.
42// <i>Numerical Methods for Unconstrained Optimization and Nonlinear
43// Equations</i>, Englewood Cliffs, 1983.
44// - Jorge J. More and David J. Thuente.
45// Line search algorithm with guaranteed sufficient decrease.
46// <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
47// pp. 286-307, 1994.
48//
49// This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
50// method presented in:
51// - Galen Andrew and Jianfeng Gao.
52// Scalable training of L1-regularized log-linear models.
53// In <i>Proceedings of the 24th International Conference on Machine
54// Learning (ICML 2007)</i>, pp. 33-40, 2007.
55
56// I would like to thank the original author, Jorge Nocedal, who has been
57// distributing the effieicnt and explanatory implementation in an open source
58// licence.
59// header:1 ends here
60
61// [[file:../lbfgs.note::*imports][imports:1]]
62use crate::core::*;
63
64use crate::math::LbfgsMath;
65use crate::line::*;
66// imports:1 ends here
67
68// [[file:../lbfgs.note::*parameters][parameters:1]]
69/// L-BFGS optimization parameters.
70///
71/// Call lbfgs_parameter_t::default() function to initialize parameters to the
72/// default values.
73#[derive(Copy, Clone, Debug)]
74#[repr(C)]
75pub struct LbfgsParam {
76 /// The number of corrections to approximate the inverse hessian matrix.
77 ///
78 /// The L-BFGS routine stores the computation results of previous \ref m
79 /// iterations to approximate the inverse hessian matrix of the current
80 /// iteration. This parameter controls the size of the limited memories
81 /// (corrections). The default value is 6. Values less than 3 are not
82 /// recommended. Large values will result in excessive computing time.
83 pub m: usize,
84
85 /// Epsilon for convergence test.
86 ///
87 /// This parameter determines the accuracy with which the solution is to be
88 /// found. A minimization terminates when
89 ///
90 /// ||g|| < epsilon * max(1, ||x||),
91 ///
92 /// where ||.|| denotes the Euclidean (L2) norm. The default value is \c
93 /// 1e-5.
94 pub epsilon: f64,
95
96 /// Distance for delta-based convergence test.
97 ///
98 /// This parameter determines the distance, in iterations, to compute the
99 /// rate of decrease of the objective function. If the value of this
100 /// parameter is zero, the library does not perform the delta-based
101 /// convergence test.
102 ///
103 /// The default value is 0.
104 pub past: usize,
105
106 /// Delta for convergence test.
107 ///
108 /// This parameter determines the minimum rate of decrease of the objective
109 /// function. The library stops iterations when the following condition is
110 /// met: |f' - f| / f < delta, where f' is the objective value of past
111 /// iterations ago, and f is the objective value of the current iteration.
112 /// The default value is 1e-5.
113 ///
114 pub delta: f64,
115
116 /// The maximum number of LBFGS iterations.
117 ///
118 /// The lbfgs optimization terminates when the iteration count exceedes this
119 /// parameter.
120 ///
121 /// Setting this parameter to zero continues an optimization process until a
122 /// convergence or error. The default value is 0.
123 pub max_iterations: usize,
124
125 /// The maximum allowed number of evaluations of function value and
126 /// gradients. This number could be larger than max_iterations since line
127 /// search procedure may involve one or more evaluations.
128 ///
129 /// Setting this parameter to zero continues an optimization process until a
130 /// convergence or error. The default value is 0.
131 pub max_evaluations: usize,
132
133 /// The line search options.
134 ///
135 /// This parameter specifies a line search algorithm to be used by the
136 /// L-BFGS routine.
137 ///
138 pub linesearch: LineSearch,
139
140 /// Enable OWL-QN regulation or not
141 pub orthantwise: bool,
142
143 // FIXME: better name
144 pub owlqn: Orthantwise,
145
146 /// A factor for scaling initial step size.
147 pub initial_inverse_hessian: f64,
148
149 /// The maximum allowed step size for each optimization step, useful for
150 /// preventing wild step.
151 pub max_step_size: f64,
152
153 /// Powell damping
154 pub damping: bool,
155}
156
157impl Default for LbfgsParam {
158 /// Initialize L-BFGS parameters to the default values.
159 ///
160 /// Call this function to fill a parameter structure with the default values
161 /// and overwrite parameter values if necessary.
162 fn default() -> Self {
163 LbfgsParam {
164 m: 6,
165 epsilon: 1e-5,
166 past: 0,
167 delta: 1e-5,
168 max_iterations: 0,
169 max_evaluations: 0,
170 orthantwise: false,
171 owlqn: Orthantwise::default(),
172 linesearch: LineSearch::default(),
173 initial_inverse_hessian: 1.0,
174 max_step_size: 1.0,
175 damping: false,
176 }
177 }
178}
179// parameters:1 ends here
180
181// [[file:../lbfgs.note::*problem][problem:1]]
182/// Represents an optimization problem.
183///
184/// `Problem` holds input variables `x`, gradient `gx` arrays, and function value `fx`.
185pub struct Problem<'a, E>
186where
187 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
188{
189 /// x is an array of length n. on input it must contain the base point for
190 /// the line search.
191 pub x: &'a mut [f64],
192
193 /// `fx` is a variable. It must contain the value of problem `f` at
194 /// x.
195 pub fx: f64,
196
197 /// `gx` is an array of length n. It must contain the gradient of `f` at
198 /// x.
199 pub gx: Vec<f64>,
200
201 /// Cached position vector of previous step.
202 xp: Vec<f64>,
203
204 /// Cached gradient vector of previous step.
205 gp: Vec<f64>,
206
207 /// Pseudo gradient for OrthantWise Limited-memory Quasi-Newton (owlqn) algorithm.
208 pg: Vec<f64>,
209
210 /// Search direction
211 d: Vec<f64>,
212
213 /// Store callback function for evaluating objective function.
214 eval_fn: E,
215
216 /// Orthantwise operations
217 owlqn: Option<Orthantwise>,
218
219 /// Evaluated or not
220 evaluated: bool,
221
222 /// The number of evaluation.
223 neval: usize,
224}
225
226impl<'a, E> Problem<'a, E>
227where
228 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
229{
230 /// Initialize problem with array length n
231 pub fn new(x: &'a mut [f64], eval: E, owlqn: Option<Orthantwise>) -> Self {
232 let n = x.len();
233 Problem {
234 fx: 0.0,
235 gx: vec![0.0; n],
236 xp: vec![0.0; n],
237 gp: vec![0.0; n],
238 pg: vec![0.0; n],
239 d: vec![0.0; n],
240 evaluated: false,
241 neval: 0,
242 x,
243 eval_fn: eval,
244 owlqn,
245 }
246 }
247
248 /// Compute the initial gradient in the search direction.
249 pub fn dginit(&self) -> Result<f64> {
250 if self.owlqn.is_none() {
251 let dginit = self.gx.vecdot(&self.d);
252 if dginit > 0.0 {
253 warn!(
254 "The current search direction increases the objective function value. dginit = {:-0.4}",
255 dginit
256 );
257 }
258
259 Ok(dginit)
260 } else {
261 Ok(self.pg.vecdot(&self.d))
262 }
263 }
264
265 /// Update search direction using evaluated gradient.
266 pub fn update_search_direction(&mut self) {
267 if self.owlqn.is_some() {
268 self.d.vecncpy(&self.pg);
269 } else {
270 self.d.vecncpy(&self.gx);
271 }
272 }
273
274 /// Return a reference to current search direction vector
275 pub fn search_direction(&self) -> &[f64] {
276 &self.d
277 }
278
279 /// Return a mutable reference to current search direction vector
280 pub fn search_direction_mut(&mut self) -> &mut [f64] {
281 &mut self.d
282 }
283
284 /// Compute the gradient in the search direction without sign checking.
285 pub fn dg_unchecked(&self) -> f64 {
286 self.gx.vecdot(&self.d)
287 }
288
289 // FIXME: improve
290 pub fn evaluate(&mut self) -> Result<()> {
291 self.fx = (self.eval_fn)(&self.x, &mut self.gx)?;
292 // self.fx = self.eval_fn.evaluate(&self.x, &mut self.gx)?;
293
294 // Compute the L1 norm of the variables and add it to the object value.
295 if let Some(owlqn) = self.owlqn {
296 self.fx += owlqn.x1norm(&self.x)
297 }
298
299 // FIXME: to be better
300 // if self.orthantwise {
301 // Compute the L1 norm of the variable and add it to the object value.
302 // fx += self.owlqn.x1norm(x);
303 // self.owlqn.pseudo_gradient(&mut pg, &x, &g);
304
305 self.evaluated = true;
306 self.neval += 1;
307
308 Ok(())
309 }
310
311 /// Return total number of evaluations.
312 pub fn number_of_evaluation(&self) -> usize {
313 self.neval
314 }
315
316 /// Test if `Problem` has been evaluated or not
317 pub fn evaluated(&self) -> bool {
318 self.evaluated
319 }
320
321 /// Copies all elements from src into self.
322 pub fn clone_from(&mut self, src: &Problem<E>) {
323 self.x.clone_from_slice(&src.x);
324 self.gx.clone_from_slice(&src.gx);
325 self.fx = src.fx;
326 }
327
328 /// Take a line step along search direction.
329 ///
330 /// Compute the current value of x: x <- x + (*step) * d.
331 ///
332 pub fn take_line_step(&mut self, step: f64) {
333 self.x.veccpy(&self.xp);
334 self.x.vecadd(&self.d, step);
335
336 // Choose the orthant for the new point.
337 // The current point is projected onto the orthant.
338 if let Some(owlqn) = self.owlqn {
339 owlqn.project(&mut self.x, &self.xp, &self.gp);
340 }
341 }
342
343 /// Return gradient vector norm: ||gx||
344 pub fn gnorm(&self) -> f64 {
345 if self.owlqn.is_some() {
346 self.pg.vec2norm()
347 } else {
348 self.gx.vec2norm()
349 }
350 }
351
352 /// Return position vector norm: ||x||
353 pub fn xnorm(&self) -> f64 {
354 self.x.vec2norm()
355 }
356
357 pub fn orthantwise(&self) -> bool {
358 self.owlqn.is_some()
359 }
360
361 /// Revert to previous step
362 pub fn revert(&mut self) {
363 self.x.veccpy(&self.xp);
364 self.gx.veccpy(&self.gp);
365 }
366
367 /// Store the current position and gradient vectors.
368 pub fn save_state(&mut self) {
369 self.xp.veccpy(&self.x);
370 self.gp.veccpy(&self.gx);
371 }
372
373 /// Constrain the search direction for orthant-wise updates.
374 pub fn constrain_search_direction(&mut self) {
375 if let Some(owlqn) = self.owlqn {
376 owlqn.constrain(&mut self.d, &self.pg);
377 }
378 }
379
380 // FIXME
381 pub fn update_owlqn_gradient(&mut self) {
382 if let Some(owlqn) = self.owlqn {
383 owlqn.pseudo_gradient(&mut self.pg, &self.x, &self.gx);
384 }
385 }
386}
387// problem:1 ends here
388
389// [[file:../lbfgs.note::*progress][progress:1]]
390/// Store optimization progress data, for progress monitor
391#[repr(C)]
392#[derive(Debug, Clone)]
393pub struct Progress<'a> {
394 /// The current values of variables
395 pub x: &'a [f64],
396
397 /// The current gradient values of variables.
398 pub gx: &'a [f64],
399
400 /// The current value of the objective function.
401 pub fx: f64,
402
403 /// The Euclidean norm of the variables
404 pub xnorm: f64,
405
406 /// The Euclidean norm of the gradients.
407 pub gnorm: f64,
408
409 /// The line-search step used for this iteration.
410 pub step: f64,
411
412 /// The iteration count.
413 pub niter: usize,
414
415 /// The total number of evaluations.
416 pub neval: usize,
417
418 /// The number of function evaluation calls in line search procedure
419 pub ncall: usize,
420}
421
422impl<'a> Progress<'a> {
423 fn new<E>(prb: &'a Problem<E>, niter: usize, ncall: usize, step: f64) -> Self
424 where
425 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
426 {
427 Progress {
428 x: &prb.x,
429 gx: &prb.gx,
430 fx: prb.fx,
431 xnorm: prb.xnorm(),
432 gnorm: prb.gnorm(),
433 neval: prb.number_of_evaluation(),
434 ncall,
435 step,
436 niter,
437 }
438 }
439}
440
441pub struct Report {
442 /// The current value of the objective function.
443 pub fx: f64,
444
445 /// The Euclidean norm of the variables
446 pub xnorm: f64,
447
448 /// The Euclidean norm of the gradients.
449 pub gnorm: f64,
450
451 /// The total number of evaluations.
452 pub neval: usize,
453}
454
455impl Report {
456 fn new<E>(prb: &Problem<E>) -> Self
457 where
458 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
459 {
460 Self {
461 fx: prb.fx,
462 xnorm: prb.xnorm(),
463 gnorm: prb.gnorm(),
464 neval: prb.number_of_evaluation(),
465 }
466 }
467}
468// progress:1 ends here
469
470// [[file:../lbfgs.note::*orthantwise][orthantwise:1]]
471/// Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) algorithm
472#[derive(Copy, Clone, Debug)]
473pub struct Orthantwise {
474 /// Coeefficient for the L1 norm of variables.
475 ///
476 /// Setting this parameter to a positive value activates Orthant-Wise
477 /// Limited-memory Quasi-Newton (OWL-QN) method, which minimizes the
478 /// objective function F(x) combined with the L1 norm |x| of the variables,
479 /// {F(x) + C |x|}. This parameter is the coeefficient for the |x|, i.e.,
480 /// C. As the L1 norm |x| is not differentiable at zero, the library
481 /// modifies function and gradient evaluations from a client program
482 /// suitably; a client program thus have only to return the function value
483 /// F(x) and gradients G(x) as usual. The default value is 1.
484 pub c: f64,
485
486 /// Start index for computing L1 norm of the variables.
487 ///
488 /// This parameter is valid only for OWL-QN method (i.e., orthantwise_c !=
489 /// 0). This parameter b (0 <= b < N) specifies the index number from which
490 /// the library computes the L1 norm of the variables x,
491 ///
492 /// |x| := |x_{b}| + |x_{b+1}| + ... + |x_{N}| .
493 ///
494 /// In other words, variables x_1, ..., x_{b-1} are not used for computing
495 /// the L1 norm. Setting b (0 < b < N), one can protect variables, x_1, ...,
496 /// x_{b-1} (e.g., a bias term of logistic regression) from being
497 /// regularized. The default value is zero.
498 pub start: i32,
499
500 /// End index for computing L1 norm of the variables.
501 ///
502 /// This parameter is valid only for OWL-QN method (i.e., \ref orthantwise_c
503 /// != 0). This parameter e (0 < e <= N) specifies the index number at which
504 /// the library stops computing the L1 norm of the variables x,
505 pub end: i32,
506}
507
508impl Default for Orthantwise {
509 fn default() -> Self {
510 Orthantwise {
511 c: 1.0,
512 start: 0,
513 end: -1,
514 }
515 }
516}
517
518impl Orthantwise {
519 // FIXME: remove
520 // a dirty wrapper for start and end
521 fn start_end(&self, x: &[f64]) -> (usize, usize) {
522 let start = self.start as usize;
523 let end = if self.end < 0 {
524 x.len()
525 } else {
526 self.end as usize
527 };
528
529 (start, end)
530 }
531
532 /// Compute the L1 norm of the variables.
533 fn x1norm(&self, x: &[f64]) -> f64 {
534 let (start, end) = self.start_end(x);
535
536 let mut s = 0.0;
537 for i in start..end {
538 s += self.c * x[i].abs();
539 }
540
541 s
542 }
543
544 /// Compute the psuedo-gradients.
545 fn pseudo_gradient(&self, pg: &mut [f64], x: &[f64], g: &[f64]) {
546 let (start, end) = self.start_end(x);
547 let c = self.c;
548
549 // Compute the negative of gradients.
550 for i in 0..start {
551 pg[i] = g[i];
552 }
553
554 // Compute the psuedo-gradients.
555 for i in start..end {
556 if x[i] < 0.0 {
557 // Differentiable.
558 pg[i] = g[i] - c;
559 } else if 0.0 < x[i] {
560 pg[i] = g[i] + c;
561 } else {
562 if g[i] < -c {
563 // Take the right partial derivative.
564 pg[i] = g[i] + c;
565 } else if c < g[i] {
566 // Take the left partial derivative.
567 pg[i] = g[i] - c;
568 } else {
569 pg[i] = 0.;
570 }
571 }
572 }
573
574 for i in end..g.len() {
575 pg[i] = g[i];
576 }
577 }
578
579 /// Choose the orthant for the new point.
580 ///
581 /// During the line search, each search point is projected onto the orthant
582 /// of the previous point.
583 fn project(&self, x: &mut [f64], xp: &[f64], gp: &[f64]) {
584 let (start, end) = self.start_end(xp);
585
586 for i in start..end {
587 let sign = if xp[i] == 0.0 { -gp[i] } else { xp[i] };
588 if x[i] * sign <= 0.0 {
589 x[i] = 0.0
590 }
591 }
592 }
593
594 fn constrain(&self, d: &mut [f64], pg: &[f64]) {
595 let (start, end) = self.start_end(pg);
596
597 for i in start..end {
598 if d[i] * pg[i] >= 0.0 {
599 d[i] = 0.0;
600 }
601 }
602 }
603}
604// orthantwise:1 ends here
605
606// [[file:../lbfgs.note::*builder][builder:1]]
607#[derive(Default, Debug, Clone)]
608pub struct Lbfgs {
609 param: LbfgsParam,
610}
611
612/// Create lbfgs optimizer with epsilon convergence
613impl Lbfgs {
614 /// Set scaled gradient norm for converence test
615 ///
616 /// This parameter determines the accuracy with which the solution is to be
617 /// found. A minimization terminates when
618 ///
619 /// ||g|| < epsilon * max(1, ||x||),
620 ///
621 /// where ||.|| denotes the Euclidean (L2) norm. The default value is 1e-5.
622 pub fn with_epsilon(mut self, epsilon: f64) -> Self {
623 assert!(epsilon.is_sign_positive(), "Invalid parameter epsilon specified.");
624
625 self.param.epsilon = epsilon;
626
627 self
628 }
629
630 /// Set initial step size for optimization. The default value is 1.0.
631 pub fn with_initial_step_size(mut self, b: f64) -> Self {
632 assert!(
633 b.is_sign_positive(),
634 "Invalid beta parameter for scaling the initial step size."
635 );
636
637 self.param.initial_inverse_hessian = b;
638
639 self
640 }
641
642 /// Set the maximum allowed step size for optimization. The default value is 1.0.
643 pub fn with_max_step_size(mut self, s: f64) -> Self {
644 assert!(s.is_sign_positive(), "Invalid max_step_size parameter.");
645
646 self.param.max_step_size = s;
647
648 self
649 }
650
651 /// Enable Powell damping.
652 pub fn with_damping(mut self, damped: bool) -> Self {
653 self.param.damping = damped;
654
655 self
656 }
657
658 /// Set orthantwise parameters
659 pub fn with_orthantwise(mut self, c: f64, start: usize, end: usize) -> Self {
660 assert!(
661 c.is_sign_positive(),
662 "Invalid parameter orthantwise c parameter specified."
663 );
664 warn!("Only the backtracking line search is available for OWL-QN algorithm.");
665
666 self.param.orthantwise = true;
667 self.param.owlqn.c = c;
668 self.param.owlqn.start = start as i32;
669 self.param.owlqn.end = end as i32;
670
671 self
672 }
673
674 /// A parameter to control the accuracy of the line search routine.
675 ///
676 /// The default value is 1e-4. This parameter should be greater
677 /// than zero and smaller than 0.5.
678 pub fn with_linesearch_ftol(mut self, ftol: f64) -> Self {
679 assert!(ftol >= 0.0, "Invalid parameter ftol specified.");
680 self.param.linesearch.ftol = ftol;
681
682 self
683 }
684
685 /// A parameter to control the accuracy of the line search routine.
686 ///
687 /// The default value is 0.9. If the function and gradient evaluations are
688 /// inexpensive with respect to the cost of the iteration (which is
689 /// sometimes the case when solving very large problems) it may be
690 /// advantageous to set this parameter to a small value. A typical small
691 /// value is 0.1. This parameter shuold be greater than the ftol parameter
692 /// (1e-4) and smaller than 1.0.
693 pub fn with_linesearch_gtol(mut self, gtol: f64) -> Self {
694 assert!(
695 gtol >= 0.0 && gtol < 1.0 && gtol > self.param.linesearch.ftol,
696 "Invalid parameter gtol specified."
697 );
698
699 self.param.linesearch.gtol = gtol;
700
701 self
702 }
703
704 /// Try to follow gradient only during optimization, by allowing object
705 /// value rises, which removes the sufficient decrease condition constrain
706 /// in line search. This option also implies Powell damping and
707 /// BacktrackingStrongWolfe line search for improving robustness.
708 pub fn with_gradient_only(mut self) -> Self {
709 self.param.linesearch.gradient_only = true;
710 self.param.damping = true;
711 self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe;
712
713 self
714 }
715
716 /// Set the max number of iterations for line search.
717 pub fn with_max_linesearch(mut self, n: usize) -> Self {
718 self.param.linesearch.max_linesearch = n;
719
720 self
721 }
722
723 /// xtol is a nonnegative input variable. termination occurs when the
724 /// relative width of the interval of uncertainty is at most xtol.
725 ///
726 /// The machine precision for floating-point values.
727 ///
728 /// This parameter must be a positive value set by a client program to
729 /// estimate the machine precision. The line search routine will terminate
730 /// with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width
731 /// of the interval of uncertainty is less than this parameter.
732 pub fn with_linesearch_xtol(mut self, xtol: f64) -> Self {
733 assert!(xtol >= 0.0, "Invalid parameter xtol specified.");
734
735 self.param.linesearch.xtol = xtol;
736 self
737 }
738
739 /// The minimum step of the line search routine.
740 ///
741 /// The default value is 1e-20. This value need not be modified unless the
742 /// exponents are too large for the machine being used, or unless the
743 /// problem is extremely badly scaled (in which case the exponents should be
744 /// increased).
745 pub fn with_linesearch_min_step(mut self, min_step: f64) -> Self {
746 assert!(min_step >= 0.0, "Invalid parameter min_step specified.");
747
748 self.param.linesearch.min_step = min_step;
749 self
750 }
751
752 /// Set the maximum number of iterations.
753 ///
754 /// The lbfgs optimization terminates when the iteration count exceedes this
755 /// parameter. Setting this parameter to zero continues an optimization
756 /// process until a convergence or error.
757 ///
758 /// The default value is 0.
759 pub fn with_max_iterations(mut self, niter: usize) -> Self {
760 self.param.max_iterations = niter;
761 self
762 }
763
764 /// The maximum allowed number of evaluations of function value and
765 /// gradients. This number could be larger than max_iterations since line
766 /// search procedure may involve one or more evaluations.
767 ///
768 /// Setting this parameter to zero continues an optimization process until a
769 /// convergence or error. The default value is 0.
770 pub fn with_max_evaluations(mut self, neval: usize) -> Self {
771 self.param.max_evaluations = neval;
772 self
773 }
774
775 /// This parameter determines the minimum rate of decrease of the objective
776 /// function. The library stops iterations when the following condition is
777 /// met: |f' - f| / f < delta, where f' is the objective value of past
778 /// iterations ago, and f is the objective value of the current iteration.
779 ///
780 /// If `past` is zero, the library does not perform the delta-based
781 /// convergence test.
782 ///
783 /// The default value of delta is 1e-5.
784 ///
785 pub fn with_fx_delta(mut self, delta: f64, past: usize) -> Self {
786 assert!(delta >= 0.0, "Invalid parameter delta specified.");
787
788 self.param.past = past;
789 self.param.delta = delta;
790 self
791 }
792
793 /// Select line search algorithm
794 ///
795 /// The default is "MoreThuente" line search algorithm.
796 pub fn with_linesearch_algorithm(mut self, algo: &str) -> Self {
797 match algo {
798 "MoreThuente" => self.param.linesearch.algorithm = LineSearchAlgorithm::MoreThuente,
799 "BacktrackingArmijo" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingArmijo,
800 "BacktrackingStrongWolfe" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe,
801 "BacktrackingWolfe" | "Backtracking" => {
802 self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingWolfe
803 }
804 _ => unimplemented!(),
805 }
806
807 self
808 }
809}
810// builder:1 ends here
811
812// [[file:../lbfgs.note::*hack][hack:1]]
813impl Lbfgs {
814 /// Start the L-BFGS optimization; this will invoke the callback functions evaluate
815 /// and progress.
816 ///
817 /// # Parameters
818 ///
819 /// * x : The array of input variables.
820 /// * evaluate: A closure for evaluating function value and gradient
821 /// * progress: A closure for monitor progress or defining stopping condition
822 ///
823 /// # Return
824 ///
825 /// * on success, return final evaluated `Problem`.
826 pub fn minimize<E, G>(self, x: &mut [f64], eval_fn: E, mut prgr_fn: G) -> Result<Report>
827 where
828 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
829 G: FnMut(&Progress) -> bool,
830 {
831 let mut state = self.build(x, eval_fn)?;
832 info!("start lbfgs loop...");
833 for _ in 0.. {
834 if state.is_converged() {
835 break;
836 }
837 let prgr = state.get_progress();
838 let cancel = prgr_fn(&prgr);
839 if cancel {
840 info!("The minimization process has been canceled.");
841 break;
842 }
843 state.propagate()?;
844 }
845
846 // Return the final value of the objective function.
847 Ok(state.report())
848 }
849}
850// hack:1 ends here
851
852// [[file:../lbfgs.note::*state][state:1]]
853/// LBFGS optimization state allowing iterative propagation
854pub struct LbfgsState<'a, E>
855where
856 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
857{
858 /// LBFGS parameters
859 vars: LbfgsParam,
860
861 /// Define how to evaluate gradient and value
862 prbl: Option<Problem<'a, E>>,
863 end: usize,
864 step: f64,
865 k: usize,
866 lm_arr: Vec<IterationData>,
867 pf: Vec<f64>,
868 ncall: usize,
869}
870// state:1 ends here
871
872// [[file:../lbfgs.note::*build][build:1]]
873impl Lbfgs {
874 /// Build LBFGS state struct for iteration.
875 pub fn build<'a, E>(self, x: &'a mut [f64], eval_fn: E) -> Result<LbfgsState<'a, E>>
876 where
877 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
878 {
879 // Initialize the limited memory.
880 let param = &self.param;
881 let lm_arr = (0..param.m).map(|_| IterationData::new(x.len())).collect();
882
883 // Allocate working space for LBFGS optimization
884 let owlqn = if param.orthantwise {
885 Some(param.owlqn.clone())
886 } else {
887 None
888 };
889 let mut problem = Problem::new(x, eval_fn, owlqn);
890
891 // Evaluate the function value and its gradient.
892 problem.evaluate()?;
893
894 // Compute the L1 norm of the variable and add it to the object value.
895 problem.update_owlqn_gradient();
896
897 // Compute the search direction with current gradient.
898 problem.update_search_direction();
899
900 // Compute the initial step:
901 let h0 = param.initial_inverse_hessian;
902 let step = problem.search_direction().vec2norminv() * h0;
903
904 // Apply Powell damping or not
905 let damping = param.damping;
906 if damping {
907 info!("Powell damping Enabled.");
908 }
909
910 let state = LbfgsState {
911 vars: self.param.clone(),
912 prbl: Some(problem),
913 end: 0,
914 step,
915 k: 0,
916 lm_arr,
917 pf: vec![],
918 ncall: 0,
919 };
920
921 Ok(state)
922 }
923}
924// build:1 ends here
925
926// [[file:../lbfgs.note::*propagate][propagate:1]]
927impl<'a, E> LbfgsState<'a, E>
928where
929 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
930{
931 /// Check if stopping critera met. Panics if not initialized.
932 pub fn is_converged(&mut self) -> bool {
933 // Monitor the progress.
934 let prgr = self.get_progress();
935 let converged = satisfying_stop_conditions(&self.vars, prgr);
936 converged
937 }
938
939 /// Report minimization progress. Panics if not initialized yet.
940 pub fn report(&self) -> Report {
941 Report::new(self.prbl.as_ref().expect("problem for report"))
942 }
943
944 /// Propagate in next LBFGS step. Return optimization progress on success.
945 /// Panics if not initialized.
946 pub fn propagate(&mut self) -> Result<Progress> {
947 self.k += 1;
948
949 // special case: already converged at the first point
950 if self.k == 1 {
951 let progress = self.get_progress();
952 return Ok(progress);
953 }
954
955 // Store the current position and gradient vectors.
956 let problem = self.prbl.as_mut().expect("problem for propagate");
957 problem.save_state();
958
959 // Search for an optimal step.
960 self.ncall = self
961 .vars
962 .linesearch
963 .find(problem, &mut self.step)
964 .context("Failure during line search")?;
965 problem.update_owlqn_gradient();
966
967 // Update LBFGS iteration data.
968 let it = &mut self.lm_arr[self.end];
969 let gamma = it.update(
970 &problem.x,
971 &problem.xp,
972 &problem.gx,
973 &problem.gp,
974 self.step,
975 self.vars.damping,
976 );
977
978 // Compute the steepest direction
979 problem.update_search_direction();
980 let d = problem.search_direction_mut();
981
982 // Apply LBFGS recursion procedure.
983 self.end = lbfgs_two_loop_recursion(&mut self.lm_arr, d, gamma, self.vars.m, self.k - 1, self.end);
984
985 // Now the search direction d is ready. Constrains the step size to
986 // prevent wild steps.
987 let dnorm = d.vec2norm();
988 self.step = self.vars.max_step_size.min(dnorm) / dnorm;
989
990 // Constrain the search direction for orthant-wise updates.
991 problem.constrain_search_direction();
992
993 let progress = self.get_progress();
994 Ok(progress)
995 }
996
997 fn get_progress(&self) -> Progress {
998 let problem = self.prbl.as_ref().expect("problem for progress");
999 Progress::new(&problem, self.k, self.ncall, self.step)
1000 }
1001}
1002// propagate:1 ends here
1003
1004// [[file:../lbfgs.note::*recursion][recursion:1]]
1005/// Algorithm 7.4, in Nocedal, J.; Wright, S. Numerical Optimization; Springer Science & Business Media, 2006.
1006fn lbfgs_two_loop_recursion(
1007 lm_arr: &mut [IterationData],
1008 d: &mut [f64], // search direction
1009 gamma: f64, // H_k^{0} = \gamma I
1010 m: usize,
1011 k: usize,
1012 end: usize,
1013) -> usize {
1014 let end = (end + 1) % m;
1015 let mut j = end;
1016 let bound = m.min(k);
1017
1018 // L-BFGS two-loop recursion, part1
1019 for _ in 0..bound {
1020 j = (j + m - 1) % m;
1021 let it = &mut lm_arr[j as usize];
1022
1023 // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}.
1024 it.alpha = it.s.vecdot(&d) / it.ys;
1025 // q_{i} = q_{i+1} - \alpha_{i} y_{i}.
1026 d.vecadd(&it.y, -it.alpha);
1027 }
1028 d.vecscale(gamma);
1029
1030 // L-BFGS two-loop recursion, part2
1031 for _ in 0..bound {
1032 let it = &mut lm_arr[j as usize];
1033 // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}.
1034 let beta = it.y.vecdot(d) / it.ys;
1035 // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}.
1036 d.vecadd(&it.s, it.alpha - beta);
1037 j = (j + 1) % m;
1038 }
1039
1040 end
1041}
1042// recursion:1 ends here
1043
1044// [[file:../lbfgs.note::*iteration data][iteration data:1]]
1045/// Internal iternation data for L-BFGS
1046#[derive(Clone)]
1047struct IterationData {
1048 alpha: f64,
1049
1050 s: Vec<f64>,
1051
1052 y: Vec<f64>,
1053
1054 /// vecdot(y, s)
1055 ys: f64,
1056}
1057
1058impl IterationData {
1059 fn new(n: usize) -> Self {
1060 IterationData {
1061 alpha: 0.0,
1062 ys: 0.0,
1063 s: vec![0.0; n],
1064 y: vec![0.0; n],
1065 }
1066 }
1067
1068 /// Updates L-BFGS correction pairs, returns Cholesky factor \gamma for
1069 /// scaling the initial inverse Hessian matrix $H_k^0$
1070 ///
1071 /// # Arguments
1072 ///
1073 /// * x, xp: current position, and previous position
1074 /// * gx, gp: current gradient and previous gradient
1075 /// * step: step size along search direction
1076 /// * damping: applying Powell damping to the gradient difference `y` helps
1077 /// stabilize L-BFGS from numerical noise in function value and gradient
1078 ///
1079 fn update(&mut self, x: &[f64], xp: &[f64], gx: &[f64], gp: &[f64], step: f64, damping: bool) -> f64 {
1080 // Update vectors s and y:
1081 // s_{k} = x_{k+1} - x_{k} = \alpha * d_{k}.
1082 // y_{k} = g_{k+1} - g_{k}.
1083 self.s.vecdiff(x, xp);
1084 self.y.vecdiff(gx, gp);
1085
1086 // Compute scalars ys and yy:
1087 // ys = y^t \cdot s = 1 / \rho.
1088 // yy = y^t \cdot y.
1089 // Notice that yy is used for scaling the intial inverse hessian matrix H_0 (Cholesky factor).
1090 let ys = self.y.vecdot(&self.s);
1091 let yy = self.y.vecdot(&self.y);
1092 self.ys = ys;
1093
1094 // Al-Baali2014JOTA: Damped Techniques for the Limited Memory BFGS
1095 // Method for Large-Scale Optimization. J. Optim. Theory Appl. 2014,
1096 // 161 (2), 688–699.
1097 //
1098 // Nocedal suggests an equivalent value of 0.8 for sigma2 (Damped BFGS
1099 // updating)
1100 let sigma2 = 0.6;
1101 let sigma3 = 3.0;
1102 if damping {
1103 debug!("Applying Powell damping, sigma2 = {}, sigma3 = {}", sigma2, sigma3);
1104
1105 // B_k * Sk = B_k * (x_k + step*d_k - x_k) = B_k * step * d_k = -g_k * step
1106 let mut bs = gp.to_vec();
1107 bs.vecscale(-step);
1108 // s_k^T * B_k * s_k
1109 let sbs = self.s.vecdot(&bs);
1110
1111 if ys < (1.0 - sigma2) * sbs {
1112 trace!("damping case1");
1113 let theta = sigma2 * sbs / (sbs - ys);
1114 bs.vecscale(1.0 - theta);
1115 bs.vecadd(&self.y, theta);
1116 self.y.veccpy(&bs);
1117 } else if ys > (1.0 + sigma3) * sbs {
1118 trace!("damping case2");
1119 let theta = sigma3 * sbs / (ys - sbs);
1120 bs.vecscale(1.0 - theta);
1121 bs.vecadd(&self.y, theta);
1122 } else {
1123 trace!("damping case3");
1124 // for theta = 1.0, yk = yk, so do nothing here.
1125 }
1126 }
1127
1128 ys / yy
1129 }
1130}
1131// iteration data:1 ends here
1132
1133// [[file:../lbfgs.note::*stopping conditions][stopping conditions:1]]
1134/// test if progress satisfying stop condition
1135#[inline]
1136fn satisfying_stop_conditions(param: &LbfgsParam, prgr: Progress) -> bool {
1137 // Buildin tests for stopping conditions
1138 if satisfying_max_iterations(&prgr, param.max_iterations)
1139 || satisfying_max_evaluations(&prgr, param.max_evaluations)
1140 || satisfying_scaled_gnorm(&prgr, param.epsilon)
1141 // || satisfying_delta(&prgr, pf, param.delta)
1142 // || satisfying_max_gnorm(&prgr, self.param.max_gnorm)
1143 {
1144 return true;
1145 }
1146
1147 false
1148}
1149
1150/// The criterion is given by the following formula:
1151/// |g(x)| / \max(1, |x|) < \epsilon
1152#[inline]
1153fn satisfying_scaled_gnorm(prgr: &Progress, epsilon: f64) -> bool {
1154 if prgr.gnorm / prgr.xnorm.max(1.0) <= epsilon {
1155 // Convergence.
1156 info!("L-BFGS reaches convergence.");
1157 true
1158 } else {
1159 false
1160 }
1161}
1162
1163/// Maximum number of lbfgs iterations.
1164#[inline]
1165fn satisfying_max_iterations(prgr: &Progress, max_iterations: usize) -> bool {
1166 if max_iterations == 0 {
1167 false
1168 } else if prgr.niter >= max_iterations {
1169 warn!("max iterations reached!");
1170 true
1171 } else {
1172 false
1173 }
1174}
1175
1176/// Maximum number of function evaluations
1177#[inline]
1178fn satisfying_max_evaluations(prgr: &Progress, max_evaluations: usize) -> bool {
1179 if max_evaluations == 0 {
1180 false
1181 } else if prgr.neval >= max_evaluations {
1182 warn!("Max allowed evaluations reached!");
1183 true
1184 } else {
1185 false
1186 }
1187}
1188
1189#[inline]
1190fn satisfying_max_gnorm(prgr: &Progress, max_gnorm: f64) -> bool {
1191 prgr.gx.vec2norm() <= max_gnorm
1192}
1193
1194/// Functiona value (fx) delta based stopping criterion
1195///
1196/// Test for stopping criterion.
1197/// The criterion is given by the following formula:
1198/// |f(past_x) - f(x)| / f(x) < delta
1199///
1200/// # Parameters
1201///
1202/// * pf: an array for storing previous values of the objective function.
1203/// * delta: max fx delta allowed
1204///
1205#[inline]
1206fn satisfying_delta<'a>(prgr: &Progress, pf: &'a mut [f64], delta: f64) -> bool {
1207 let k = prgr.niter;
1208 let fx = prgr.fx;
1209 let past = pf.len();
1210 if past < 1 {
1211 return false;
1212 }
1213
1214 // We don't test the stopping criterion while k < past.
1215 if past <= k {
1216 // Compute the relative improvement from the past.
1217 let rate = (pf[(k % past) as usize] - fx).abs() / fx;
1218 // The stopping criterion.
1219 if rate < delta {
1220 info!("The stopping criterion.");
1221 return true;
1222 }
1223 }
1224 // Store the current value of the objective function.
1225 pf[(k % past) as usize] = fx;
1226
1227 false
1228}
1229// stopping conditions:1 ends here