gosh_lbfgs/lbfgs.rs
1// [[file:../lbfgs.note::*header][header:1]]
2//! Limited memory BFGS (L-BFGS).
3//
4// Copyright (c) 1990, Jorge Nocedal
5// Copyright (c) 2007-2010 Naoaki Okazaki
6// Copyright (c) 2018-2019 Wenping Guo
7// All rights reserved.
8//
9// Permission is hereby granted, free of charge, to any person obtaining a copy
10// of this software and associated documentation files (the "Software"), to deal
11// in the Software without restriction, including without limitation the rights
12// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13// copies of the Software, and to permit persons to whom the Software is
14// furnished to do so, subject to the following conditions:
15//
16// The above copyright notice and this permission notice shall be included in
17// all copies or substantial portions of the Software.
18//
19// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25// THE SOFTWARE.
26//
27// This library is a C port of the FORTRAN implementation of Limited-memory
28// Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
29// The original FORTRAN source code is available at:
30// http://www.ece.northwestern.edu/~nocedal/lbfgs.html
31//
32// The L-BFGS algorithm is described in:
33// - Jorge Nocedal.
34// Updating Quasi-Newton Matrices with Limited Storage.
35// <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
36// - Dong C. Liu and Jorge Nocedal.
37// On the limited memory BFGS method for large scale optimization.
38// <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
39//
40// The line search algorithms used in this implementation are described in:
41// - John E. Dennis and Robert B. Schnabel.
42// <i>Numerical Methods for Unconstrained Optimization and Nonlinear
43// Equations</i>, Englewood Cliffs, 1983.
44// - Jorge J. More and David J. Thuente.
45// Line search algorithm with guaranteed sufficient decrease.
46// <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
47// pp. 286-307, 1994.
48//
49// This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
50// method presented in:
51// - Galen Andrew and Jianfeng Gao.
52// Scalable training of L1-regularized log-linear models.
53// In <i>Proceedings of the 24th International Conference on Machine
54// Learning (ICML 2007)</i>, pp. 33-40, 2007.
55
// I would like to thank the original author, Jorge Nocedal, who has been
// distributing the efficient and explanatory implementation under an open
// source license.
59// header:1 ends here
60
61// [[file:../lbfgs.note::*imports][imports:1]]
62use crate::core::*;
63
64use crate::math::LbfgsMath;
65use crate::line::*;
66// imports:1 ends here
67
68// [[file:../lbfgs.note::*parameters][parameters:1]]
/// L-BFGS optimization parameters.
///
/// Call `LbfgsParam::default()` to initialize parameters to the default
/// values, then overwrite individual fields as necessary.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct LbfgsParam {
    /// The number of corrections to approximate the inverse hessian matrix.
    ///
    /// The L-BFGS routine stores the computation results of previous `m`
    /// iterations to approximate the inverse hessian matrix of the current
    /// iteration. This parameter controls the size of the limited memories
    /// (corrections). The default value is 6. Values less than 3 are not
    /// recommended. Large values will result in excessive computing time.
    pub m: usize,

    /// Epsilon for convergence test.
    ///
    /// This parameter determines the accuracy with which the solution is to be
    /// found. A minimization terminates when
    ///
    /// ||g|| < epsilon * max(1, ||x||),
    ///
    /// where ||.|| denotes the Euclidean (L2) norm. The default value is
    /// 1e-5.
    pub epsilon: f64,

    /// Distance for delta-based convergence test.
    ///
    /// This parameter determines the distance, in iterations, to compute the
    /// rate of decrease of the objective function. If the value of this
    /// parameter is zero, the library does not perform the delta-based
    /// convergence test.
    ///
    /// The default value is 0.
    pub past: usize,

    /// Delta for convergence test.
    ///
    /// This parameter determines the minimum rate of decrease of the objective
    /// function. The library stops iterations when the following condition is
    /// met: |f' - f| / f < delta, where f' is the objective value of `past`
    /// iterations ago, and f is the objective value of the current iteration.
    /// The default value is 1e-5.
    pub delta: f64,

    /// The maximum number of LBFGS iterations.
    ///
    /// The lbfgs optimization terminates when the iteration count exceeds
    /// this parameter.
    ///
    /// Setting this parameter to zero continues an optimization process until
    /// a convergence or error. The default value is 0.
    pub max_iterations: usize,

    /// The maximum allowed number of evaluations of function value and
    /// gradients. This number could be larger than max_iterations since line
    /// search procedure may involve one or more evaluations.
    ///
    /// Setting this parameter to zero continues an optimization process until
    /// a convergence or error. The default value is 0.
    pub max_evaluations: usize,

    /// The line search options.
    ///
    /// This parameter specifies a line search algorithm to be used by the
    /// L-BFGS routine.
    pub linesearch: LineSearch,

    /// Enable OWL-QN (Orthant-Wise Limited-memory Quasi-Newton) regulation
    /// or not.
    pub orthantwise: bool,

    /// OWL-QN parameters; only meaningful when `orthantwise` is true.
    // FIXME: better name
    pub owlqn: Orthantwise,

    /// A factor for scaling initial step size.
    pub initial_inverse_hessian: f64,

    /// The maximum allowed step size for each optimization step, useful for
    /// preventing wild steps.
    pub max_step_size: f64,

    /// Apply Powell damping to the BFGS update; helps stabilize L-BFGS
    /// against numerical noise in function value and gradient.
    pub damping: bool,
}
156
157impl Default for LbfgsParam {
158 /// Initialize L-BFGS parameters to the default values.
159 ///
160 /// Call this function to fill a parameter structure with the default values
161 /// and overwrite parameter values if necessary.
162 fn default() -> Self {
163 LbfgsParam {
164 m: 6,
165 epsilon: 1e-5,
166 past: 0,
167 delta: 1e-5,
168 max_iterations: 0,
169 max_evaluations: 0,
170 orthantwise: false,
171 owlqn: Orthantwise::default(),
172 linesearch: LineSearch::default(),
173 initial_inverse_hessian: 1.0,
174 max_step_size: 1.0,
175 damping: false,
176 }
177 }
178}
179// parameters:1 ends here
180
181// [[file:../lbfgs.note::*problem][problem:1]]
/// Represents an optimization problem.
///
/// `Problem` holds input variables `x`, gradient `gx` arrays, and function value `fx`.
pub struct Problem<'a, E>
where
    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
{
    /// `x` is an array of length n. On input it must contain the base point
    /// for the line search.
    pub x: &'a mut [f64],

    /// `fx` holds the value of the objective function `f` at `x`.
    pub fx: f64,

    /// `gx` is an array of length n holding the gradient of `f` at `x`.
    pub gx: Vec<f64>,

    /// Cached position vector of previous step.
    xp: Vec<f64>,

    /// Cached gradient vector of previous step.
    gp: Vec<f64>,

    /// Pseudo gradient for OrthantWise Limited-memory Quasi-Newton (owlqn) algorithm.
    pg: Vec<f64>,

    /// Search direction.
    d: Vec<f64>,

    /// Callback function for evaluating the objective value and gradient.
    eval_fn: E,

    /// Orthantwise operations; `None` when OWL-QN is disabled.
    owlqn: Option<Orthantwise>,

    /// Whether `evaluate` has been called at least once.
    evaluated: bool,

    /// The number of evaluations performed so far.
    neval: usize,
}
225
226impl<'a, E> Problem<'a, E>
227where
228 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
229{
230 /// Initialize problem with array length n
231 pub fn new(x: &'a mut [f64], eval: E, owlqn: Option<Orthantwise>) -> Self {
232 let n = x.len();
233 Problem {
234 fx: 0.0,
235 gx: vec![0.0; n],
236 xp: vec![0.0; n],
237 gp: vec![0.0; n],
238 pg: vec![0.0; n],
239 d: vec![0.0; n],
240 evaluated: false,
241 neval: 0,
242 x,
243 eval_fn: eval,
244 owlqn,
245 }
246 }
247
248 /// Compute the initial gradient in the search direction.
249 pub fn dginit(&self) -> Result<f64> {
250 if self.owlqn.is_none() {
251 let dginit = self.gx.vecdot(&self.d);
252 if dginit > 0.0 {
253 warn!(
254 "The current search direction increases the objective function value. dginit = {:-0.4}",
255 dginit
256 );
257 }
258
259 Ok(dginit)
260 } else {
261 Ok(self.pg.vecdot(&self.d))
262 }
263 }
264
265 /// Update search direction using evaluated gradient.
266 pub fn update_search_direction(&mut self) {
267 if self.owlqn.is_some() {
268 self.d.vecncpy(&self.pg);
269 } else {
270 self.d.vecncpy(&self.gx);
271 }
272 }
273
274 /// Return a reference to current search direction vector
275 pub fn search_direction(&self) -> &[f64] {
276 &self.d
277 }
278
279 /// Return a mutable reference to current search direction vector
280 pub fn search_direction_mut(&mut self) -> &mut [f64] {
281 &mut self.d
282 }
283
284 /// Compute the gradient in the search direction without sign checking.
285 pub fn dg_unchecked(&self) -> f64 {
286 self.gx.vecdot(&self.d)
287 }
288
289 // FIXME: improve
290 pub fn evaluate(&mut self) -> Result<()> {
291 self.fx = (self.eval_fn)(&self.x, &mut self.gx)?;
292 // self.fx = self.eval_fn.evaluate(&self.x, &mut self.gx)?;
293
294 // Compute the L1 norm of the variables and add it to the object value.
295 if let Some(owlqn) = self.owlqn {
296 self.fx += owlqn.x1norm(&self.x)
297 }
298
299 // FIXME: to be better
300 // if self.orthantwise {
301 // Compute the L1 norm of the variable and add it to the object value.
302 // fx += self.owlqn.x1norm(x);
303 // self.owlqn.pseudo_gradient(&mut pg, &x, &g);
304
305 self.evaluated = true;
306 self.neval += 1;
307
308 Ok(())
309 }
310
311 /// Return total number of evaluations.
312 pub fn number_of_evaluation(&self) -> usize {
313 self.neval
314 }
315
316 /// Test if `Problem` has been evaluated or not
317 pub fn evaluated(&self) -> bool {
318 self.evaluated
319 }
320
321 /// Copies all elements from src into self.
322 pub fn clone_from(&mut self, src: &Problem<E>) {
323 self.x.clone_from_slice(&src.x);
324 self.gx.clone_from_slice(&src.gx);
325 self.fx = src.fx;
326 }
327
328 /// Take a line step along search direction.
329 ///
330 /// Compute the current value of x: x <- x + (*step) * d.
331 ///
332 pub fn take_line_step(&mut self, step: f64) {
333 self.x.veccpy(&self.xp);
334 self.x.vecadd(&self.d, step);
335
336 // Choose the orthant for the new point.
337 // The current point is projected onto the orthant.
338 if let Some(owlqn) = self.owlqn {
339 owlqn.project(&mut self.x, &self.xp, &self.gp);
340 }
341 }
342
343 /// Return gradient vector norm: ||gx||
344 pub fn gnorm(&self) -> f64 {
345 if self.owlqn.is_some() {
346 self.pg.vec2norm()
347 } else {
348 self.gx.vec2norm()
349 }
350 }
351
352 /// Return position vector norm: ||x||
353 pub fn xnorm(&self) -> f64 {
354 self.x.vec2norm()
355 }
356
357 pub fn orthantwise(&self) -> bool {
358 self.owlqn.is_some()
359 }
360
361 /// Revert to previous step
362 pub fn revert(&mut self) {
363 self.x.veccpy(&self.xp);
364 self.gx.veccpy(&self.gp);
365 }
366
367 /// Store the current position and gradient vectors.
368 pub fn save_state(&mut self) {
369 self.xp.veccpy(&self.x);
370 self.gp.veccpy(&self.gx);
371 }
372
373 /// Constrain the search direction for orthant-wise updates.
374 pub fn constrain_search_direction(&mut self) {
375 if let Some(owlqn) = self.owlqn {
376 owlqn.constrain(&mut self.d, &self.pg);
377 }
378 }
379
380 // FIXME
381 pub fn update_owlqn_gradient(&mut self) {
382 if let Some(owlqn) = self.owlqn {
383 owlqn.pseudo_gradient(&mut self.pg, &self.x, &self.gx);
384 }
385 }
386}
387// problem:1 ends here
388
389// [[file:../lbfgs.note::*progress][progress:1]]
/// Store optimization progress data, for progress monitor
#[repr(C)]
#[derive(Debug, Clone)]
pub struct Progress<'a> {
    /// The current values of variables.
    pub x: &'a [f64],

    /// The current gradient values of variables.
    pub gx: &'a [f64],

    /// The current value of the objective function.
    pub fx: f64,

    /// The Euclidean norm of the variables.
    pub xnorm: f64,

    /// The Euclidean norm of the gradients (pseudo-gradient under OWL-QN).
    pub gnorm: f64,

    /// The line-search step used for this iteration.
    pub step: f64,

    /// The iteration count.
    pub niter: usize,

    /// The total number of evaluations.
    pub neval: usize,

    /// The number of function evaluation calls in line search procedure.
    pub ncall: usize,
}
421
422impl<'a> Progress<'a> {
423 fn new<E>(prb: &'a Problem<E>, niter: usize, ncall: usize, step: f64) -> Self
424 where
425 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
426 {
427 Progress {
428 x: &prb.x,
429 gx: &prb.gx,
430 fx: prb.fx,
431 xnorm: prb.xnorm(),
432 gnorm: prb.gnorm(),
433 neval: prb.number_of_evaluation(),
434 ncall,
435 step,
436 niter,
437 }
438 }
439}
440
/// Final summary of a minimization run.
pub struct Report {
    /// The current value of the objective function.
    pub fx: f64,

    /// The Euclidean norm of the variables.
    pub xnorm: f64,

    /// The Euclidean norm of the gradients.
    pub gnorm: f64,

    /// The total number of evaluations.
    pub neval: usize,
}
454
455impl Report {
456 fn new<E>(prb: &Problem<E>) -> Self
457 where
458 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
459 {
460 Self {
461 fx: prb.fx,
462 xnorm: prb.xnorm(),
463 gnorm: prb.gnorm(),
464 neval: prb.number_of_evaluation(),
465 }
466 }
467}
468// progress:1 ends here
469
470// [[file:../lbfgs.note::*orthantwise][orthantwise:1]]
471use crate::base::Orthantwise;
472// orthantwise:1 ends here
473
474// [[file:../lbfgs.note::*builder][builder:1]]
#[derive(Default, Debug, Clone)]
pub struct Lbfgs {
    // Accumulated optimization parameters (see `LbfgsParam`).
    param: LbfgsParam,
}
479
480/// Create lbfgs optimizer with epsilon convergence
481impl Lbfgs {
482 /// Set scaled gradient norm for converence test
483 ///
484 /// This parameter determines the accuracy with which the solution is to be
485 /// found. A minimization terminates when
486 ///
487 /// ||g|| < epsilon * max(1, ||x||),
488 ///
489 /// where ||.|| denotes the Euclidean (L2) norm. The default value is 1e-5.
490 pub fn with_epsilon(mut self, epsilon: f64) -> Self {
491 assert!(epsilon.is_sign_positive(), "Invalid parameter epsilon specified.");
492
493 self.param.epsilon = epsilon;
494
495 self
496 }
497
498 /// Set initial step size for optimization. The default value is 1.0.
499 pub fn with_initial_step_size(mut self, b: f64) -> Self {
500 assert!(
501 b.is_sign_positive(),
502 "Invalid beta parameter for scaling the initial step size."
503 );
504
505 self.param.initial_inverse_hessian = b;
506
507 self
508 }
509
510 /// Set the maximum allowed step size for optimization. The default value is 1.0.
511 pub fn with_max_step_size(mut self, s: f64) -> Self {
512 assert!(s.is_sign_positive(), "Invalid max_step_size parameter.");
513
514 self.param.max_step_size = s;
515
516 self
517 }
518
519 /// Enable Powell damping.
520 pub fn with_damping(mut self, damped: bool) -> Self {
521 self.param.damping = damped;
522
523 self
524 }
525
526 /// Set orthantwise parameters
527 pub fn with_orthantwise(mut self, c: f64, start: usize, end: usize) -> Self {
528 assert!(
529 c.is_sign_positive(),
530 "Invalid parameter orthantwise c parameter specified."
531 );
532 warn!("Only the backtracking line search is available for OWL-QN algorithm.");
533
534 self.param.orthantwise = true;
535 self.param.owlqn.c = c;
536 self.param.owlqn.start = start as i32;
537 self.param.owlqn.end = end as i32;
538
539 self
540 }
541
542 /// A parameter to control the accuracy of the line search routine.
543 ///
544 /// The default value is 1e-4. This parameter should be greater
545 /// than zero and smaller than 0.5.
546 pub fn with_linesearch_ftol(mut self, ftol: f64) -> Self {
547 assert!(ftol >= 0.0, "Invalid parameter ftol specified.");
548 self.param.linesearch.ftol = ftol;
549
550 self
551 }
552
553 /// A parameter to control the accuracy of the line search routine.
554 ///
555 /// The default value is 0.9. If the function and gradient evaluations are
556 /// inexpensive with respect to the cost of the iteration (which is
557 /// sometimes the case when solving very large problems) it may be
558 /// advantageous to set this parameter to a small value. A typical small
559 /// value is 0.1. This parameter shuold be greater than the ftol parameter
560 /// (1e-4) and smaller than 1.0.
561 pub fn with_linesearch_gtol(mut self, gtol: f64) -> Self {
562 assert!(
563 gtol >= 0.0 && gtol < 1.0 && gtol > self.param.linesearch.ftol,
564 "Invalid parameter gtol specified."
565 );
566
567 self.param.linesearch.gtol = gtol;
568
569 self
570 }
571
572 /// Try to follow gradient only during optimization, by allowing object
573 /// value rises, which removes the sufficient decrease condition constrain
574 /// in line search. This option also implies Powell damping and
575 /// BacktrackingStrongWolfe line search for improving robustness.
576 pub fn with_gradient_only(mut self) -> Self {
577 self.param.linesearch.gradient_only = true;
578 self.param.damping = true;
579 self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe;
580
581 self
582 }
583
584 /// Set the max number of iterations for line search.
585 pub fn with_max_linesearch(mut self, n: usize) -> Self {
586 self.param.linesearch.max_linesearch = n;
587
588 self
589 }
590
591 /// xtol is a nonnegative input variable. termination occurs when the
592 /// relative width of the interval of uncertainty is at most xtol.
593 ///
594 /// The machine precision for floating-point values.
595 ///
596 /// This parameter must be a positive value set by a client program to
597 /// estimate the machine precision. The line search routine will terminate
598 /// with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width
599 /// of the interval of uncertainty is less than this parameter.
600 pub fn with_linesearch_xtol(mut self, xtol: f64) -> Self {
601 assert!(xtol >= 0.0, "Invalid parameter xtol specified.");
602
603 self.param.linesearch.xtol = xtol;
604 self
605 }
606
607 /// The minimum step of the line search routine.
608 ///
609 /// The default value is 1e-20. This value need not be modified unless the
610 /// exponents are too large for the machine being used, or unless the
611 /// problem is extremely badly scaled (in which case the exponents should be
612 /// increased).
613 pub fn with_linesearch_min_step(mut self, min_step: f64) -> Self {
614 assert!(min_step >= 0.0, "Invalid parameter min_step specified.");
615
616 self.param.linesearch.min_step = min_step;
617 self
618 }
619
620 /// Set the maximum number of iterations.
621 ///
622 /// The lbfgs optimization terminates when the iteration count exceedes this
623 /// parameter. Setting this parameter to zero continues an optimization
624 /// process until a convergence or error.
625 ///
626 /// The default value is 0.
627 pub fn with_max_iterations(mut self, niter: usize) -> Self {
628 self.param.max_iterations = niter;
629 self
630 }
631
632 /// The maximum allowed number of evaluations of function value and
633 /// gradients. This number could be larger than max_iterations since line
634 /// search procedure may involve one or more evaluations.
635 ///
636 /// Setting this parameter to zero continues an optimization process until a
637 /// convergence or error. The default value is 0.
638 pub fn with_max_evaluations(mut self, neval: usize) -> Self {
639 self.param.max_evaluations = neval;
640 self
641 }
642
643 /// This parameter determines the minimum rate of decrease of the objective
644 /// function. The library stops iterations when the following condition is
645 /// met: |f' - f| / f < delta, where f' is the objective value of past
646 /// iterations ago, and f is the objective value of the current iteration.
647 ///
648 /// If `past` is zero, the library does not perform the delta-based
649 /// convergence test.
650 ///
651 /// The default value of delta is 1e-5.
652 ///
653 pub fn with_fx_delta(mut self, delta: f64, past: usize) -> Self {
654 assert!(delta >= 0.0, "Invalid parameter delta specified.");
655
656 self.param.past = past;
657 self.param.delta = delta;
658 self
659 }
660
661 /// Select line search algorithm
662 ///
663 /// The default is "MoreThuente" line search algorithm.
664 pub fn with_linesearch_algorithm(mut self, algo: &str) -> Self {
665 match algo {
666 "MoreThuente" => self.param.linesearch.algorithm = LineSearchAlgorithm::MoreThuente,
667 "BacktrackingArmijo" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingArmijo,
668 "BacktrackingStrongWolfe" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe,
669 "BacktrackingWolfe" | "Backtracking" => {
670 self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingWolfe
671 }
672 _ => unimplemented!(),
673 }
674
675 self
676 }
677}
678// builder:1 ends here
679
680// [[file:../lbfgs.note::*hack][hack:1]]
681impl Lbfgs {
682 /// Start the L-BFGS optimization; this will invoke the callback functions evaluate
683 /// and progress.
684 ///
685 /// # Parameters
686 ///
687 /// * x : The array of input variables.
688 /// * evaluate: A closure for evaluating function value and gradient
689 /// * progress: A closure for monitor progress or defining stopping condition
690 ///
691 /// # Return
692 ///
693 /// * on success, return final evaluated `Problem`.
694 pub fn minimize<E, G>(self, x: &mut [f64], eval_fn: E, mut prgr_fn: G) -> Result<Report>
695 where
696 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
697 G: FnMut(&Progress) -> bool,
698 {
699 let mut state = self.build(x, eval_fn)?;
700 info!("start lbfgs loop...");
701 for _ in 0.. {
702 if state.is_converged() {
703 break;
704 }
705 let prgr = state.get_progress();
706 let cancel = prgr_fn(&prgr);
707 if cancel {
708 info!("The minimization process has been canceled.");
709 break;
710 }
711 state.propagate()?;
712 }
713
714 // Return the final value of the objective function.
715 Ok(state.report())
716 }
717}
718// hack:1 ends here
719
720// [[file:../lbfgs.note::*state][state:1]]
/// LBFGS optimization state allowing iterative propagation
pub struct LbfgsState<'a, E>
where
    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
{
    /// LBFGS parameters
    vars: LbfgsParam,

    /// The optimization problem (variables, gradient, evaluation callback).
    prbl: Option<Problem<'a, E>>,
    // Ring-buffer cursor into `lm_arr` (advanced by the two-loop recursion).
    end: usize,
    // Step size to try first in the next line search.
    step: f64,
    // Iteration counter, incremented by `propagate`.
    k: usize,
    // Stored correction pairs for the limited-memory update.
    lm_arr: Vec<IterationData>,
    // Previous objective values for the delta-based convergence test.
    // NOTE(review): initialized empty in `build` and apparently never
    // filled — confirm whether the delta test is actually wired up.
    pf: Vec<f64>,
    // Number of evaluation calls made by the most recent line search.
    ncall: usize,
}
738// state:1 ends here
739
740// [[file:../lbfgs.note::*build][build:1]]
741impl Lbfgs {
742 /// Build LBFGS state struct for iteration.
743 pub fn build<'a, E>(self, x: &'a mut [f64], eval_fn: E) -> Result<LbfgsState<'a, E>>
744 where
745 E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
746 {
747 // Initialize the limited memory.
748 let param = &self.param;
749 let lm_arr = (0..param.m).map(|_| IterationData::new(x.len())).collect();
750
751 // Allocate working space for LBFGS optimization
752 let owlqn = if param.orthantwise {
753 Some(param.owlqn.clone())
754 } else {
755 None
756 };
757 let mut problem = Problem::new(x, eval_fn, owlqn);
758
759 // Evaluate the function value and its gradient.
760 problem.evaluate()?;
761
762 // Compute the L1 norm of the variable and add it to the object value.
763 problem.update_owlqn_gradient();
764
765 // Compute the search direction with current gradient.
766 problem.update_search_direction();
767
768 // Compute the initial step:
769 let h0 = param.initial_inverse_hessian;
770 let step = problem.search_direction().vec2norminv() * h0;
771
772 // Apply Powell damping or not
773 let damping = param.damping;
774 if damping {
775 info!("Powell damping Enabled.");
776 }
777
778 let state = LbfgsState {
779 vars: self.param.clone(),
780 prbl: Some(problem),
781 end: 0,
782 step,
783 k: 0,
784 lm_arr,
785 pf: vec![],
786 ncall: 0,
787 };
788
789 Ok(state)
790 }
791}
792// build:1 ends here
793
794// [[file:../lbfgs.note::*propagate][propagate:1]]
impl<'a, E> LbfgsState<'a, E>
where
    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
{
    /// Check if stopping criteria are met. Panics if not initialized.
    pub fn is_converged(&mut self) -> bool {
        // Monitor the progress.
        let prgr = self.get_progress();
        let converged = satisfying_stop_conditions(&self.vars, prgr);
        converged
    }

    /// Report minimization progress. Panics if not initialized yet.
    pub fn report(&self) -> Report {
        Report::new(self.prbl.as_ref().expect("problem for report"))
    }

    /// Propagate in next LBFGS step. Return optimization progress on success.
    /// Panics if not initialized.
    pub fn propagate(&mut self) -> Result<Progress> {
        self.k += 1;

        // Special case: the very first call only reports the initial point
        // seeded by `build`; no line search is performed yet.
        if self.k == 1 {
            let progress = self.get_progress();
            return Ok(progress);
        }

        // Store the current position and gradient vectors.
        let problem = self.prbl.as_mut().expect("problem for propagate");
        problem.save_state();

        // Search for an optimal step along the current direction; `ncall` is
        // the number of evaluations the line search performed.
        self.ncall = self
            .vars
            .linesearch
            .find(problem, &mut self.step)
            .context("Failure during line search")?;
        problem.update_owlqn_gradient();

        // Update LBFGS iteration data (the correction pair at ring-buffer
        // slot `end`); `gamma` scales the initial inverse Hessian H_k^0.
        let it = &mut self.lm_arr[self.end];
        let gamma = it.update(
            &problem.x,
            &problem.xp,
            &problem.gx,
            &problem.gp,
            self.step,
            self.vars.damping,
        );

        // Compute the steepest direction
        problem.update_search_direction();
        let d = problem.search_direction_mut();

        // Apply LBFGS recursion procedure, turning d into the quasi-Newton
        // direction; this also advances the ring-buffer position.
        self.end = lbfgs_two_loop_recursion(&mut self.lm_arr, d, gamma, self.vars.m, self.k - 1, self.end);

        // Now the search direction d is ready. Constrains the step size to
        // prevent wild steps.
        let dnorm = d.vec2norm();
        self.step = self.vars.max_step_size.min(dnorm) / dnorm;

        // Constrain the search direction for orthant-wise updates.
        problem.constrain_search_direction();

        let progress = self.get_progress();
        Ok(progress)
    }

    /// Build a `Progress` snapshot from the current state.
    fn get_progress(&self) -> Progress {
        let problem = self.prbl.as_ref().expect("problem for progress");
        Progress::new(&problem, self.k, self.ncall, self.step)
    }
}
870// propagate:1 ends here
871
872// [[file:../lbfgs.note::*recursion][recursion:1]]
873/// Algorithm 7.4, in Nocedal, J.; Wright, S. Numerical Optimization; Springer Science & Business Media, 2006.
874fn lbfgs_two_loop_recursion(
875 lm_arr: &mut [IterationData],
876 d: &mut [f64], // search direction
877 gamma: f64, // H_k^{0} = \gamma I
878 m: usize,
879 k: usize,
880 end: usize,
881) -> usize {
882 let end = (end + 1) % m;
883 let mut j = end;
884 let bound = m.min(k);
885
886 // L-BFGS two-loop recursion, part1
887 for _ in 0..bound {
888 j = (j + m - 1) % m;
889 let it = &mut lm_arr[j as usize];
890
891 // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}.
892 it.alpha = it.s.vecdot(&d) / it.ys;
893 // q_{i} = q_{i+1} - \alpha_{i} y_{i}.
894 d.vecadd(&it.y, -it.alpha);
895 }
896 d.vecscale(gamma);
897
898 // L-BFGS two-loop recursion, part2
899 for _ in 0..bound {
900 let it = &mut lm_arr[j as usize];
901 // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}.
902 let beta = it.y.vecdot(d) / it.ys;
903 // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}.
904 d.vecadd(&it.s, it.alpha - beta);
905 j = (j + 1) % m;
906 }
907
908 end
909}
910// recursion:1 ends here
911
912// [[file:../lbfgs.note::*iteration data][iteration data:1]]
/// Internal iteration data (one correction pair) for L-BFGS.
#[derive(Clone)]
struct IterationData {
    // alpha_j from the two-loop recursion, cached between the two loops.
    alpha: f64,

    // Position difference: s_k = x_{k+1} - x_k.
    s: Vec<f64>,

    // Gradient difference: y_k = g_{k+1} - g_k.
    y: Vec<f64>,

    /// vecdot(y, s), i.e. 1/\rho_k.
    ys: f64,
}
925
926impl IterationData {
927 fn new(n: usize) -> Self {
928 IterationData {
929 alpha: 0.0,
930 ys: 0.0,
931 s: vec![0.0; n],
932 y: vec![0.0; n],
933 }
934 }
935
936 /// Updates L-BFGS correction pairs, returns Cholesky factor \gamma for
937 /// scaling the initial inverse Hessian matrix $H_k^0$
938 ///
939 /// # Arguments
940 ///
941 /// * x, xp: current position, and previous position
942 /// * gx, gp: current gradient and previous gradient
943 /// * step: step size along search direction
944 /// * damping: applying Powell damping to the gradient difference `y` helps
945 /// stabilize L-BFGS from numerical noise in function value and gradient
946 ///
947 fn update(&mut self, x: &[f64], xp: &[f64], gx: &[f64], gp: &[f64], step: f64, damping: bool) -> f64 {
948 // Update vectors s and y:
949 // s_{k} = x_{k+1} - x_{k} = \alpha * d_{k}.
950 // y_{k} = g_{k+1} - g_{k}.
951 self.s.vecdiff(x, xp);
952 self.y.vecdiff(gx, gp);
953
954 // Compute scalars ys and yy:
955 // ys = y^t \cdot s = 1 / \rho.
956 // yy = y^t \cdot y.
957 // Notice that yy is used for scaling the intial inverse hessian matrix H_0 (Cholesky factor).
958 let ys = self.y.vecdot(&self.s);
959 let yy = self.y.vecdot(&self.y);
960 self.ys = ys;
961
962 // Al-Baali2014JOTA: Damped Techniques for the Limited Memory BFGS
963 // Method for Large-Scale Optimization. J. Optim. Theory Appl. 2014,
964 // 161 (2), 688–699.
965 //
966 // Nocedal suggests an equivalent value of 0.8 for sigma2 (Damped BFGS
967 // updating)
968 let sigma2 = 0.6;
969 let sigma3 = 3.0;
970 if damping {
971 debug!("Applying Powell damping, sigma2 = {}, sigma3 = {}", sigma2, sigma3);
972
973 // B_k * Sk = B_k * (x_k + step*d_k - x_k) = B_k * step * d_k = -g_k * step
974 let mut bs = gp.to_vec();
975 bs.vecscale(-step);
976 // s_k^T * B_k * s_k
977 let sbs = self.s.vecdot(&bs);
978
979 if ys < (1.0 - sigma2) * sbs {
980 trace!("damping case1");
981 let theta = sigma2 * sbs / (sbs - ys);
982 bs.vecscale(1.0 - theta);
983 bs.vecadd(&self.y, theta);
984 self.y.veccpy(&bs);
985 } else if ys > (1.0 + sigma3) * sbs {
986 trace!("damping case2");
987 let theta = sigma3 * sbs / (ys - sbs);
988 bs.vecscale(1.0 - theta);
989 bs.vecadd(&self.y, theta);
990 } else {
991 trace!("damping case3");
992 // for theta = 1.0, yk = yk, so do nothing here.
993 }
994 }
995
996 ys / yy
997 }
998}
999// iteration data:1 ends here
1000
1001// [[file:../lbfgs.note::*stopping conditions][stopping conditions:1]]
1002/// test if progress satisfying stop condition
1003#[inline]
1004fn satisfying_stop_conditions(param: &LbfgsParam, prgr: Progress) -> bool {
1005 // Buildin tests for stopping conditions
1006 if satisfying_max_iterations(&prgr, param.max_iterations)
1007 || satisfying_max_evaluations(&prgr, param.max_evaluations)
1008 || satisfying_scaled_gnorm(&prgr, param.epsilon)
1009 // || satisfying_delta(&prgr, pf, param.delta)
1010 // || satisfying_max_gnorm(&prgr, self.param.max_gnorm)
1011 {
1012 return true;
1013 }
1014
1015 false
1016}
1017
1018/// The criterion is given by the following formula:
1019/// |g(x)| / \max(1, |x|) < \epsilon
1020#[inline]
1021fn satisfying_scaled_gnorm(prgr: &Progress, epsilon: f64) -> bool {
1022 if prgr.gnorm / prgr.xnorm.max(1.0) <= epsilon {
1023 // Convergence.
1024 info!("L-BFGS reaches convergence.");
1025 true
1026 } else {
1027 false
1028 }
1029}
1030
1031/// Maximum number of lbfgs iterations.
1032#[inline]
1033fn satisfying_max_iterations(prgr: &Progress, max_iterations: usize) -> bool {
1034 if max_iterations == 0 {
1035 false
1036 } else if prgr.niter >= max_iterations {
1037 warn!("max iterations reached!");
1038 true
1039 } else {
1040 false
1041 }
1042}
1043
1044/// Maximum number of function evaluations
1045#[inline]
1046fn satisfying_max_evaluations(prgr: &Progress, max_evaluations: usize) -> bool {
1047 if max_evaluations == 0 {
1048 false
1049 } else if prgr.neval >= max_evaluations {
1050 warn!("Max allowed evaluations reached!");
1051 true
1052 } else {
1053 false
1054 }
1055}
1056
/// True when the raw gradient norm drops to `max_gnorm` or below.
///
/// NOTE(review): currently unused — only referenced from a commented-out
/// line in `satisfying_stop_conditions`.
#[inline]
fn satisfying_max_gnorm(prgr: &Progress, max_gnorm: f64) -> bool {
    prgr.gx.vec2norm() <= max_gnorm
}
1061
1062/// Functiona value (fx) delta based stopping criterion
1063///
1064/// Test for stopping criterion.
1065/// The criterion is given by the following formula:
1066/// |f(past_x) - f(x)| / f(x) < delta
1067///
1068/// # Parameters
1069///
1070/// * pf: an array for storing previous values of the objective function.
1071/// * delta: max fx delta allowed
1072///
1073#[inline]
1074fn satisfying_delta<'a>(prgr: &Progress, pf: &'a mut [f64], delta: f64) -> bool {
1075 let k = prgr.niter;
1076 let fx = prgr.fx;
1077 let past = pf.len();
1078 if past < 1 {
1079 return false;
1080 }
1081
1082 // We don't test the stopping criterion while k < past.
1083 if past <= k {
1084 // Compute the relative improvement from the past.
1085 let rate = (pf[(k % past) as usize] - fx).abs() / fx;
1086 // The stopping criterion.
1087 if rate < delta {
1088 info!("The stopping criterion.");
1089 return true;
1090 }
1091 }
1092 // Store the current value of the objective function.
1093 pf[(k % past) as usize] = fx;
1094
1095 false
1096}
1097// stopping conditions:1 ends here