gosh_lbfgs/
lbfgs.rs

1// [[file:../lbfgs.note::*header][header:1]]
2//!       Limited memory BFGS (L-BFGS).
3//
4//  Copyright (c) 1990, Jorge Nocedal
5//  Copyright (c) 2007-2010 Naoaki Okazaki
6//  Copyright (c) 2018-2019 Wenping Guo
7//  All rights reserved.
8//
9//  Permission is hereby granted, free of charge, to any person obtaining a copy
10//  of this software and associated documentation files (the "Software"), to deal
11//  in the Software without restriction, including without limitation the rights
12//  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13//  copies of the Software, and to permit persons to whom the Software is
14//  furnished to do so, subject to the following conditions:
15//
16//  The above copyright notice and this permission notice shall be included in
17//  all copies or substantial portions of the Software.
18//
19//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24//  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25//  THE SOFTWARE.
26//
27// This library is a C port of the FORTRAN implementation of Limited-memory
28// Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
29// The original FORTRAN source code is available at:
30// http://www.ece.northwestern.edu/~nocedal/lbfgs.html
31//
32// The L-BFGS algorithm is described in:
33//     - Jorge Nocedal.
34//       Updating Quasi-Newton Matrices with Limited Storage.
35//       <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
36//     - Dong C. Liu and Jorge Nocedal.
37//       On the limited memory BFGS method for large scale optimization.
38//       <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
39//
40// The line search algorithms used in this implementation are described in:
41//     - John E. Dennis and Robert B. Schnabel.
42//       <i>Numerical Methods for Unconstrained Optimization and Nonlinear
43//       Equations</i>, Englewood Cliffs, 1983.
44//     - Jorge J. More and David J. Thuente.
45//       Line search algorithm with guaranteed sufficient decrease.
46//       <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
47//       pp. 286-307, 1994.
48//
49// This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
50// method presented in:
51//     - Galen Andrew and Jianfeng Gao.
52//       Scalable training of L1-regularized log-linear models.
53//       In <i>Proceedings of the 24th International Conference on Machine
54//       Learning (ICML 2007)</i>, pp. 33-40, 2007.
55
56// I would like to thank the original author, Jorge Nocedal, who has been
57// distributing the effieicnt and explanatory implementation in an open source
58// licence.
59// header:1 ends here
60
61// [[file:../lbfgs.note::*imports][imports:1]]
62use crate::core::*;
63
64use crate::math::LbfgsMath;
65use crate::line::*;
66// imports:1 ends here
67
68// [[file:../lbfgs.note::*parameters][parameters:1]]
69/// L-BFGS optimization parameters.
70///
71/// Call lbfgs_parameter_t::default() function to initialize parameters to the
72/// default values.
73#[derive(Copy, Clone, Debug)]
74#[repr(C)]
75pub struct LbfgsParam {
76    /// The number of corrections to approximate the inverse hessian matrix.
77    ///
78    /// The L-BFGS routine stores the computation results of previous \ref m
79    /// iterations to approximate the inverse hessian matrix of the current
80    /// iteration. This parameter controls the size of the limited memories
81    /// (corrections). The default value is 6. Values less than 3 are not
82    /// recommended. Large values will result in excessive computing time.
83    pub m: usize,
84
85    /// Epsilon for convergence test.
86    ///
87    /// This parameter determines the accuracy with which the solution is to be
88    /// found. A minimization terminates when
89    ///
90    /// ||g|| < epsilon * max(1, ||x||),
91    ///
92    /// where ||.|| denotes the Euclidean (L2) norm. The default value is \c
93    /// 1e-5.
94    pub epsilon: f64,
95
96    /// Distance for delta-based convergence test.
97    ///
98    /// This parameter determines the distance, in iterations, to compute the
99    /// rate of decrease of the objective function. If the value of this
100    /// parameter is zero, the library does not perform the delta-based
101    /// convergence test.
102    ///
103    /// The default value is 0.
104    pub past: usize,
105
106    /// Delta for convergence test.
107    ///
108    /// This parameter determines the minimum rate of decrease of the objective
109    /// function. The library stops iterations when the following condition is
110    /// met: |f' - f| / f < delta, where f' is the objective value of past
111    /// iterations ago, and f is the objective value of the current iteration.
112    /// The default value is 1e-5.
113    ///
114    pub delta: f64,
115
116    /// The maximum number of LBFGS iterations.
117    ///
118    /// The lbfgs optimization terminates when the iteration count exceedes this
119    /// parameter.
120    ///
121    /// Setting this parameter to zero continues an optimization process until a
122    /// convergence or error. The default value is 0.
123    pub max_iterations: usize,
124
125    /// The maximum allowed number of evaluations of function value and
126    /// gradients. This number could be larger than max_iterations since line
127    /// search procedure may involve one or more evaluations.
128    ///
129    /// Setting this parameter to zero continues an optimization process until a
130    /// convergence or error. The default value is 0.
131    pub max_evaluations: usize,
132
133    /// The line search options.
134    ///
135    ///  This parameter specifies a line search algorithm to be used by the
136    ///  L-BFGS routine.
137    ///
138    pub linesearch: LineSearch,
139
140    /// Enable OWL-QN regulation or not
141    pub orthantwise: bool,
142
143    // FIXME: better name
144    pub owlqn: Orthantwise,
145
146    /// A factor for scaling initial step size.
147    pub initial_inverse_hessian: f64,
148
149    /// The maximum allowed step size for each optimization step, useful for
150    /// preventing wild step.
151    pub max_step_size: f64,
152
153    /// Powell damping
154    pub damping: bool,
155}
156
157impl Default for LbfgsParam {
158    /// Initialize L-BFGS parameters to the default values.
159    ///
160    /// Call this function to fill a parameter structure with the default values
161    /// and overwrite parameter values if necessary.
162    fn default() -> Self {
163        LbfgsParam {
164            m: 6,
165            epsilon: 1e-5,
166            past: 0,
167            delta: 1e-5,
168            max_iterations: 0,
169            max_evaluations: 0,
170            orthantwise: false,
171            owlqn: Orthantwise::default(),
172            linesearch: LineSearch::default(),
173            initial_inverse_hessian: 1.0,
174            max_step_size: 1.0,
175            damping: false,
176        }
177    }
178}
179// parameters:1 ends here
180
181// [[file:../lbfgs.note::*problem][problem:1]]
182/// Represents an optimization problem.
183///
184/// `Problem` holds input variables `x`, gradient `gx` arrays, and function value `fx`.
185pub struct Problem<'a, E>
186where
187    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
188{
189    /// x is an array of length n. on input it must contain the base point for
190    /// the line search.
191    pub x: &'a mut [f64],
192
193    /// `fx` is a variable. It must contain the value of problem `f` at
194    /// x.
195    pub fx: f64,
196
197    /// `gx` is an array of length n. It must contain the gradient of `f` at
198    /// x.
199    pub gx: Vec<f64>,
200
201    /// Cached position vector of previous step.
202    xp: Vec<f64>,
203
204    /// Cached gradient vector of previous step.
205    gp: Vec<f64>,
206
207    /// Pseudo gradient for OrthantWise Limited-memory Quasi-Newton (owlqn) algorithm.
208    pg: Vec<f64>,
209
210    /// Search direction
211    d: Vec<f64>,
212
213    /// Store callback function for evaluating objective function.
214    eval_fn: E,
215
216    /// Orthantwise operations
217    owlqn: Option<Orthantwise>,
218
219    /// Evaluated or not
220    evaluated: bool,
221
222    /// The number of evaluation.
223    neval: usize,
224}
225
226impl<'a, E> Problem<'a, E>
227where
228    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
229{
230    /// Initialize problem with array length n
231    pub fn new(x: &'a mut [f64], eval: E, owlqn: Option<Orthantwise>) -> Self {
232        let n = x.len();
233        Problem {
234            fx: 0.0,
235            gx: vec![0.0; n],
236            xp: vec![0.0; n],
237            gp: vec![0.0; n],
238            pg: vec![0.0; n],
239            d: vec![0.0; n],
240            evaluated: false,
241            neval: 0,
242            x,
243            eval_fn: eval,
244            owlqn,
245        }
246    }
247
248    /// Compute the initial gradient in the search direction.
249    pub fn dginit(&self) -> Result<f64> {
250        if self.owlqn.is_none() {
251            let dginit = self.gx.vecdot(&self.d);
252            if dginit > 0.0 {
253                warn!(
254                    "The current search direction increases the objective function value. dginit = {:-0.4}",
255                    dginit
256                );
257            }
258
259            Ok(dginit)
260        } else {
261            Ok(self.pg.vecdot(&self.d))
262        }
263    }
264
265    /// Update search direction using evaluated gradient.
266    pub fn update_search_direction(&mut self) {
267        if self.owlqn.is_some() {
268            self.d.vecncpy(&self.pg);
269        } else {
270            self.d.vecncpy(&self.gx);
271        }
272    }
273
274    /// Return a reference to current search direction vector
275    pub fn search_direction(&self) -> &[f64] {
276        &self.d
277    }
278
279    /// Return a mutable reference to current search direction vector
280    pub fn search_direction_mut(&mut self) -> &mut [f64] {
281        &mut self.d
282    }
283
284    /// Compute the gradient in the search direction without sign checking.
285    pub fn dg_unchecked(&self) -> f64 {
286        self.gx.vecdot(&self.d)
287    }
288
289    // FIXME: improve
290    pub fn evaluate(&mut self) -> Result<()> {
291        self.fx = (self.eval_fn)(&self.x, &mut self.gx)?;
292        // self.fx = self.eval_fn.evaluate(&self.x, &mut self.gx)?;
293
294        // Compute the L1 norm of the variables and add it to the object value.
295        if let Some(owlqn) = self.owlqn {
296            self.fx += owlqn.x1norm(&self.x)
297        }
298
299        // FIXME: to be better
300        // if self.orthantwise {
301        // Compute the L1 norm of the variable and add it to the object value.
302        // fx += self.owlqn.x1norm(x);
303        // self.owlqn.pseudo_gradient(&mut pg, &x, &g);
304
305        self.evaluated = true;
306        self.neval += 1;
307
308        Ok(())
309    }
310
311    /// Return total number of evaluations.
312    pub fn number_of_evaluation(&self) -> usize {
313        self.neval
314    }
315
316    /// Test if `Problem` has been evaluated or not
317    pub fn evaluated(&self) -> bool {
318        self.evaluated
319    }
320
321    /// Copies all elements from src into self.
322    pub fn clone_from(&mut self, src: &Problem<E>) {
323        self.x.clone_from_slice(&src.x);
324        self.gx.clone_from_slice(&src.gx);
325        self.fx = src.fx;
326    }
327
328    /// Take a line step along search direction.
329    ///
330    /// Compute the current value of x: x <- x + (*step) * d.
331    ///
332    pub fn take_line_step(&mut self, step: f64) {
333        self.x.veccpy(&self.xp);
334        self.x.vecadd(&self.d, step);
335
336        // Choose the orthant for the new point.
337        // The current point is projected onto the orthant.
338        if let Some(owlqn) = self.owlqn {
339            owlqn.project(&mut self.x, &self.xp, &self.gp);
340        }
341    }
342
343    /// Return gradient vector norm: ||gx||
344    pub fn gnorm(&self) -> f64 {
345        if self.owlqn.is_some() {
346            self.pg.vec2norm()
347        } else {
348            self.gx.vec2norm()
349        }
350    }
351
352    /// Return position vector norm: ||x||
353    pub fn xnorm(&self) -> f64 {
354        self.x.vec2norm()
355    }
356
357    pub fn orthantwise(&self) -> bool {
358        self.owlqn.is_some()
359    }
360
361    /// Revert to previous step
362    pub fn revert(&mut self) {
363        self.x.veccpy(&self.xp);
364        self.gx.veccpy(&self.gp);
365    }
366
367    /// Store the current position and gradient vectors.
368    pub fn save_state(&mut self) {
369        self.xp.veccpy(&self.x);
370        self.gp.veccpy(&self.gx);
371    }
372
373    /// Constrain the search direction for orthant-wise updates.
374    pub fn constrain_search_direction(&mut self) {
375        if let Some(owlqn) = self.owlqn {
376            owlqn.constrain(&mut self.d, &self.pg);
377        }
378    }
379
380    // FIXME
381    pub fn update_owlqn_gradient(&mut self) {
382        if let Some(owlqn) = self.owlqn {
383            owlqn.pseudo_gradient(&mut self.pg, &self.x, &self.gx);
384        }
385    }
386}
387// problem:1 ends here
388
389// [[file:../lbfgs.note::*progress][progress:1]]
390/// Store optimization progress data, for progress monitor
391#[repr(C)]
392#[derive(Debug, Clone)]
393pub struct Progress<'a> {
394    /// The current values of variables
395    pub x: &'a [f64],
396
397    /// The current gradient values of variables.
398    pub gx: &'a [f64],
399
400    /// The current value of the objective function.
401    pub fx: f64,
402
403    /// The Euclidean norm of the variables
404    pub xnorm: f64,
405
406    /// The Euclidean norm of the gradients.
407    pub gnorm: f64,
408
409    /// The line-search step used for this iteration.
410    pub step: f64,
411
412    /// The iteration count.
413    pub niter: usize,
414
415    /// The total number of evaluations.
416    pub neval: usize,
417
418    /// The number of function evaluation calls in line search procedure
419    pub ncall: usize,
420}
421
422impl<'a> Progress<'a> {
423    fn new<E>(prb: &'a Problem<E>, niter: usize, ncall: usize, step: f64) -> Self
424    where
425        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
426    {
427        Progress {
428            x: &prb.x,
429            gx: &prb.gx,
430            fx: prb.fx,
431            xnorm: prb.xnorm(),
432            gnorm: prb.gnorm(),
433            neval: prb.number_of_evaluation(),
434            ncall,
435            step,
436            niter,
437        }
438    }
439}
440
441pub struct Report {
442    /// The current value of the objective function.
443    pub fx: f64,
444
445    /// The Euclidean norm of the variables
446    pub xnorm: f64,
447
448    /// The Euclidean norm of the gradients.
449    pub gnorm: f64,
450
451    /// The total number of evaluations.
452    pub neval: usize,
453}
454
455impl Report {
456    fn new<E>(prb: &Problem<E>) -> Self
457    where
458        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
459    {
460        Self {
461            fx: prb.fx,
462            xnorm: prb.xnorm(),
463            gnorm: prb.gnorm(),
464            neval: prb.number_of_evaluation(),
465        }
466    }
467}
468// progress:1 ends here
469
470// [[file:../lbfgs.note::*orthantwise][orthantwise:1]]
471use crate::base::Orthantwise;
472// orthantwise:1 ends here
473
474// [[file:../lbfgs.note::*builder][builder:1]]
475#[derive(Default, Debug, Clone)]
476pub struct Lbfgs {
477    param: LbfgsParam,
478}
479
480/// Create lbfgs optimizer with epsilon convergence
481impl Lbfgs {
482    /// Set scaled gradient norm for converence test
483    ///
484    /// This parameter determines the accuracy with which the solution is to be
485    /// found. A minimization terminates when
486    ///
487    /// ||g|| < epsilon * max(1, ||x||),
488    ///
489    /// where ||.|| denotes the Euclidean (L2) norm. The default value is 1e-5.
490    pub fn with_epsilon(mut self, epsilon: f64) -> Self {
491        assert!(epsilon.is_sign_positive(), "Invalid parameter epsilon specified.");
492
493        self.param.epsilon = epsilon;
494
495        self
496    }
497
498    /// Set initial step size for optimization. The default value is 1.0.
499    pub fn with_initial_step_size(mut self, b: f64) -> Self {
500        assert!(
501            b.is_sign_positive(),
502            "Invalid beta parameter for scaling the initial step size."
503        );
504
505        self.param.initial_inverse_hessian = b;
506
507        self
508    }
509
510    /// Set the maximum allowed step size for optimization. The default value is 1.0.
511    pub fn with_max_step_size(mut self, s: f64) -> Self {
512        assert!(s.is_sign_positive(), "Invalid max_step_size parameter.");
513
514        self.param.max_step_size = s;
515
516        self
517    }
518
519    /// Enable Powell damping.
520    pub fn with_damping(mut self, damped: bool) -> Self {
521        self.param.damping = damped;
522
523        self
524    }
525
526    /// Set orthantwise parameters
527    pub fn with_orthantwise(mut self, c: f64, start: usize, end: usize) -> Self {
528        assert!(
529            c.is_sign_positive(),
530            "Invalid parameter orthantwise c parameter specified."
531        );
532        warn!("Only the backtracking line search is available for OWL-QN algorithm.");
533
534        self.param.orthantwise = true;
535        self.param.owlqn.c = c;
536        self.param.owlqn.start = start as i32;
537        self.param.owlqn.end = end as i32;
538
539        self
540    }
541
542    /// A parameter to control the accuracy of the line search routine.
543    ///
544    /// The default value is 1e-4. This parameter should be greater
545    /// than zero and smaller than 0.5.
546    pub fn with_linesearch_ftol(mut self, ftol: f64) -> Self {
547        assert!(ftol >= 0.0, "Invalid parameter ftol specified.");
548        self.param.linesearch.ftol = ftol;
549
550        self
551    }
552
553    /// A parameter to control the accuracy of the line search routine.
554    ///
555    /// The default value is 0.9. If the function and gradient evaluations are
556    /// inexpensive with respect to the cost of the iteration (which is
557    /// sometimes the case when solving very large problems) it may be
558    /// advantageous to set this parameter to a small value. A typical small
559    /// value is 0.1. This parameter shuold be greater than the ftol parameter
560    /// (1e-4) and smaller than 1.0.
561    pub fn with_linesearch_gtol(mut self, gtol: f64) -> Self {
562        assert!(
563            gtol >= 0.0 && gtol < 1.0 && gtol > self.param.linesearch.ftol,
564            "Invalid parameter gtol specified."
565        );
566
567        self.param.linesearch.gtol = gtol;
568
569        self
570    }
571
572    /// Try to follow gradient only during optimization, by allowing object
573    /// value rises, which removes the sufficient decrease condition constrain
574    /// in line search. This option also implies Powell damping and
575    /// BacktrackingStrongWolfe line search for improving robustness.
576    pub fn with_gradient_only(mut self) -> Self {
577        self.param.linesearch.gradient_only = true;
578        self.param.damping = true;
579        self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe;
580
581        self
582    }
583
584    /// Set the max number of iterations for line search.
585    pub fn with_max_linesearch(mut self, n: usize) -> Self {
586        self.param.linesearch.max_linesearch = n;
587
588        self
589    }
590
591    /// xtol is a nonnegative input variable. termination occurs when the
592    /// relative width of the interval of uncertainty is at most xtol.
593    ///
594    /// The machine precision for floating-point values.
595    ///
596    ///  This parameter must be a positive value set by a client program to
597    ///  estimate the machine precision. The line search routine will terminate
598    ///  with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width
599    ///  of the interval of uncertainty is less than this parameter.
600    pub fn with_linesearch_xtol(mut self, xtol: f64) -> Self {
601        assert!(xtol >= 0.0, "Invalid parameter xtol specified.");
602
603        self.param.linesearch.xtol = xtol;
604        self
605    }
606
607    /// The minimum step of the line search routine.
608    ///
609    /// The default value is 1e-20. This value need not be modified unless the
610    /// exponents are too large for the machine being used, or unless the
611    /// problem is extremely badly scaled (in which case the exponents should be
612    /// increased).
613    pub fn with_linesearch_min_step(mut self, min_step: f64) -> Self {
614        assert!(min_step >= 0.0, "Invalid parameter min_step specified.");
615
616        self.param.linesearch.min_step = min_step;
617        self
618    }
619
620    /// Set the maximum number of iterations.
621    ///
622    /// The lbfgs optimization terminates when the iteration count exceedes this
623    /// parameter. Setting this parameter to zero continues an optimization
624    /// process until a convergence or error.
625    ///
626    /// The default value is 0.
627    pub fn with_max_iterations(mut self, niter: usize) -> Self {
628        self.param.max_iterations = niter;
629        self
630    }
631
632    /// The maximum allowed number of evaluations of function value and
633    /// gradients. This number could be larger than max_iterations since line
634    /// search procedure may involve one or more evaluations.
635    ///
636    /// Setting this parameter to zero continues an optimization process until a
637    /// convergence or error. The default value is 0.
638    pub fn with_max_evaluations(mut self, neval: usize) -> Self {
639        self.param.max_evaluations = neval;
640        self
641    }
642
643    /// This parameter determines the minimum rate of decrease of the objective
644    /// function. The library stops iterations when the following condition is
645    /// met: |f' - f| / f < delta, where f' is the objective value of past
646    /// iterations ago, and f is the objective value of the current iteration.
647    ///
648    /// If `past` is zero, the library does not perform the delta-based
649    /// convergence test.
650    ///
651    /// The default value of delta is 1e-5.
652    ///
653    pub fn with_fx_delta(mut self, delta: f64, past: usize) -> Self {
654        assert!(delta >= 0.0, "Invalid parameter delta specified.");
655
656        self.param.past = past;
657        self.param.delta = delta;
658        self
659    }
660
661    /// Select line search algorithm
662    ///
663    /// The default is "MoreThuente" line search algorithm.
664    pub fn with_linesearch_algorithm(mut self, algo: &str) -> Self {
665        match algo {
666            "MoreThuente" => self.param.linesearch.algorithm = LineSearchAlgorithm::MoreThuente,
667            "BacktrackingArmijo" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingArmijo,
668            "BacktrackingStrongWolfe" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe,
669            "BacktrackingWolfe" | "Backtracking" => {
670                self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingWolfe
671            }
672            _ => unimplemented!(),
673        }
674
675        self
676    }
677}
678// builder:1 ends here
679
680// [[file:../lbfgs.note::*hack][hack:1]]
681impl Lbfgs {
682    /// Start the L-BFGS optimization; this will invoke the callback functions evaluate
683    /// and progress.
684    /// 
685    /// # Parameters
686    /// 
687    /// * x       : The array of input variables.
688    /// * evaluate: A closure for evaluating function value and gradient
689    /// * progress: A closure for monitor progress or defining stopping condition
690    /// 
691    /// # Return
692    /// 
693    /// * on success, return final evaluated `Problem`.
694    pub fn minimize<E, G>(self, x: &mut [f64], eval_fn: E, mut prgr_fn: G) -> Result<Report>
695    where
696        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
697        G: FnMut(&Progress) -> bool,
698    {
699        let mut state = self.build(x, eval_fn)?;
700        info!("start lbfgs loop...");
701        for _ in 0.. {
702            if state.is_converged() {
703                break;
704            }
705            let prgr = state.get_progress();
706            let cancel = prgr_fn(&prgr);
707            if cancel {
708                info!("The minimization process has been canceled.");
709                break;
710            }
711            state.propagate()?;
712        }
713
714        // Return the final value of the objective function.
715        Ok(state.report())
716    }
717}
718// hack:1 ends here
719
720// [[file:../lbfgs.note::*state][state:1]]
721/// LBFGS optimization state allowing iterative propagation
722pub struct LbfgsState<'a, E>
723where
724    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
725{
726    /// LBFGS parameters
727    vars: LbfgsParam,
728
729    /// Define how to evaluate gradient and value
730    prbl: Option<Problem<'a, E>>,
731    end: usize,
732    step: f64,
733    k: usize,
734    lm_arr: Vec<IterationData>,
735    pf: Vec<f64>,
736    ncall: usize,
737}
738// state:1 ends here
739
740// [[file:../lbfgs.note::*build][build:1]]
741impl Lbfgs {
742    /// Build LBFGS state struct for iteration.
743    pub fn build<'a, E>(self, x: &'a mut [f64], eval_fn: E) -> Result<LbfgsState<'a, E>>
744    where
745        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
746    {
747        // Initialize the limited memory.
748        let param = &self.param;
749        let lm_arr = (0..param.m).map(|_| IterationData::new(x.len())).collect();
750
751        // Allocate working space for LBFGS optimization
752        let owlqn = if param.orthantwise {
753            Some(param.owlqn.clone())
754        } else {
755            None
756        };
757        let mut problem = Problem::new(x, eval_fn, owlqn);
758
759        // Evaluate the function value and its gradient.
760        problem.evaluate()?;
761
762        // Compute the L1 norm of the variable and add it to the object value.
763        problem.update_owlqn_gradient();
764
765        // Compute the search direction with current gradient.
766        problem.update_search_direction();
767
768        // Compute the initial step:
769        let h0 = param.initial_inverse_hessian;
770        let step = problem.search_direction().vec2norminv() * h0;
771
772        // Apply Powell damping or not
773        let damping = param.damping;
774        if damping {
775            info!("Powell damping Enabled.");
776        }
777
778        let state = LbfgsState {
779            vars: self.param.clone(),
780            prbl: Some(problem),
781            end: 0,
782            step,
783            k: 0,
784            lm_arr,
785            pf: vec![],
786            ncall: 0,
787        };
788
789        Ok(state)
790    }
791}
792// build:1 ends here
793
794// [[file:../lbfgs.note::*propagate][propagate:1]]
795impl<'a, E> LbfgsState<'a, E>
796where
797    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
798{
799    /// Check if stopping critera met. Panics if not initialized.
800    pub fn is_converged(&mut self) -> bool {
801        // Monitor the progress.
802        let prgr = self.get_progress();
803        let converged = satisfying_stop_conditions(&self.vars, prgr);
804        converged
805    }
806
807    /// Report minimization progress. Panics if not initialized yet.
808    pub fn report(&self) -> Report {
809        Report::new(self.prbl.as_ref().expect("problem for report"))
810    }
811
812    /// Propagate in next LBFGS step. Return optimization progress on success.
813    /// Panics if not initialized.
814    pub fn propagate(&mut self) -> Result<Progress> {
815        self.k += 1;
816
817        // special case: already converged at the first point
818        if self.k == 1 {
819            let progress = self.get_progress();
820            return Ok(progress);
821        }
822
823        // Store the current position and gradient vectors.
824        let problem = self.prbl.as_mut().expect("problem for propagate");
825        problem.save_state();
826
827        // Search for an optimal step.
828        self.ncall = self
829            .vars
830            .linesearch
831            .find(problem, &mut self.step)
832            .context("Failure during line search")?;
833        problem.update_owlqn_gradient();
834
835        // Update LBFGS iteration data.
836        let it = &mut self.lm_arr[self.end];
837        let gamma = it.update(
838            &problem.x,
839            &problem.xp,
840            &problem.gx,
841            &problem.gp,
842            self.step,
843            self.vars.damping,
844        );
845
846        // Compute the steepest direction
847        problem.update_search_direction();
848        let d = problem.search_direction_mut();
849
850        // Apply LBFGS recursion procedure.
851        self.end = lbfgs_two_loop_recursion(&mut self.lm_arr, d, gamma, self.vars.m, self.k - 1, self.end);
852
853        // Now the search direction d is ready. Constrains the step size to
854        // prevent wild steps.
855        let dnorm = d.vec2norm();
856        self.step = self.vars.max_step_size.min(dnorm) / dnorm;
857
858        // Constrain the search direction for orthant-wise updates.
859        problem.constrain_search_direction();
860
861        let progress = self.get_progress();
862        Ok(progress)
863    }
864
865    fn get_progress(&self) -> Progress {
866        let problem = self.prbl.as_ref().expect("problem for progress");
867        Progress::new(&problem, self.k, self.ncall, self.step)
868    }
869}
870// propagate:1 ends here
871
872// [[file:../lbfgs.note::*recursion][recursion:1]]
873/// Algorithm 7.4, in Nocedal, J.; Wright, S. Numerical Optimization; Springer Science & Business Media, 2006.
874fn lbfgs_two_loop_recursion(
875    lm_arr: &mut [IterationData],
876    d: &mut [f64], // search direction
877    gamma: f64,    // H_k^{0} = \gamma I
878    m: usize,
879    k: usize,
880    end: usize,
881) -> usize {
882    let end = (end + 1) % m;
883    let mut j = end;
884    let bound = m.min(k);
885
886    // L-BFGS two-loop recursion, part1
887    for _ in 0..bound {
888        j = (j + m - 1) % m;
889        let it = &mut lm_arr[j as usize];
890
891        // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}.
892        it.alpha = it.s.vecdot(&d) / it.ys;
893        // q_{i} = q_{i+1} - \alpha_{i} y_{i}.
894        d.vecadd(&it.y, -it.alpha);
895    }
896    d.vecscale(gamma);
897
898    // L-BFGS two-loop recursion, part2
899    for _ in 0..bound {
900        let it = &mut lm_arr[j as usize];
901        // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}.
902        let beta = it.y.vecdot(d) / it.ys;
903        // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}.
904        d.vecadd(&it.s, it.alpha - beta);
905        j = (j + 1) % m;
906    }
907
908    end
909}
910// recursion:1 ends here
911
912// [[file:../lbfgs.note::*iteration data][iteration data:1]]
913/// Internal iternation data for L-BFGS
914#[derive(Clone)]
915struct IterationData {
916    alpha: f64,
917
918    s: Vec<f64>,
919
920    y: Vec<f64>,
921
922    /// vecdot(y, s)
923    ys: f64,
924}
925
926impl IterationData {
927    fn new(n: usize) -> Self {
928        IterationData {
929            alpha: 0.0,
930            ys: 0.0,
931            s: vec![0.0; n],
932            y: vec![0.0; n],
933        }
934    }
935
936    /// Updates L-BFGS correction pairs, returns Cholesky factor \gamma for
937    /// scaling the initial inverse Hessian matrix $H_k^0$
938    ///
939    /// # Arguments
940    ///
941    /// * x, xp: current position, and previous position
942    /// * gx, gp: current gradient and previous gradient
943    /// * step: step size along search direction
944    /// * damping: applying Powell damping to the gradient difference `y` helps
945    ///   stabilize L-BFGS from numerical noise in function value and gradient
946    ///
947    fn update(&mut self, x: &[f64], xp: &[f64], gx: &[f64], gp: &[f64], step: f64, damping: bool) -> f64 {
948        // Update vectors s and y:
949        // s_{k} = x_{k+1} - x_{k} = \alpha * d_{k}.
950        // y_{k} = g_{k+1} - g_{k}.
951        self.s.vecdiff(x, xp);
952        self.y.vecdiff(gx, gp);
953
954        // Compute scalars ys and yy:
955        // ys = y^t \cdot s = 1 / \rho.
956        // yy = y^t \cdot y.
957        // Notice that yy is used for scaling the intial inverse hessian matrix H_0 (Cholesky factor).
958        let ys = self.y.vecdot(&self.s);
959        let yy = self.y.vecdot(&self.y);
960        self.ys = ys;
961
962        // Al-Baali2014JOTA: Damped Techniques for the Limited Memory BFGS
963        // Method for Large-Scale Optimization. J. Optim. Theory Appl. 2014,
964        // 161 (2), 688–699.
965        //
966        // Nocedal suggests an equivalent value of 0.8 for sigma2 (Damped BFGS
967        // updating)
968        let sigma2 = 0.6;
969        let sigma3 = 3.0;
970        if damping {
971            debug!("Applying Powell damping, sigma2 = {}, sigma3 = {}", sigma2, sigma3);
972
973            // B_k * Sk = B_k * (x_k + step*d_k - x_k) = B_k * step * d_k = -g_k * step
974            let mut bs = gp.to_vec();
975            bs.vecscale(-step);
976            // s_k^T * B_k * s_k
977            let sbs = self.s.vecdot(&bs);
978
979            if ys < (1.0 - sigma2) * sbs {
980                trace!("damping case1");
981                let theta = sigma2 * sbs / (sbs - ys);
982                bs.vecscale(1.0 - theta);
983                bs.vecadd(&self.y, theta);
984                self.y.veccpy(&bs);
985            } else if ys > (1.0 + sigma3) * sbs {
986                trace!("damping case2");
987                let theta = sigma3 * sbs / (ys - sbs);
988                bs.vecscale(1.0 - theta);
989                bs.vecadd(&self.y, theta);
990            } else {
991                trace!("damping case3");
992                // for theta = 1.0, yk = yk, so do nothing here.
993            }
994        }
995
996        ys / yy
997    }
998}
999// iteration data:1 ends here
1000
1001// [[file:../lbfgs.note::*stopping conditions][stopping conditions:1]]
1002/// test if progress satisfying stop condition
1003#[inline]
1004fn satisfying_stop_conditions(param: &LbfgsParam, prgr: Progress) -> bool {
1005    // Buildin tests for stopping conditions
1006    if satisfying_max_iterations(&prgr, param.max_iterations)
1007        || satisfying_max_evaluations(&prgr, param.max_evaluations)
1008        || satisfying_scaled_gnorm(&prgr, param.epsilon)
1009    // || satisfying_delta(&prgr, pf, param.delta)
1010    // || satisfying_max_gnorm(&prgr, self.param.max_gnorm)
1011    {
1012        return true;
1013    }
1014
1015    false
1016}
1017
1018/// The criterion is given by the following formula:
1019///     |g(x)| / \max(1, |x|) < \epsilon
1020#[inline]
1021fn satisfying_scaled_gnorm(prgr: &Progress, epsilon: f64) -> bool {
1022    if prgr.gnorm / prgr.xnorm.max(1.0) <= epsilon {
1023        // Convergence.
1024        info!("L-BFGS reaches convergence.");
1025        true
1026    } else {
1027        false
1028    }
1029}
1030
1031/// Maximum number of lbfgs iterations.
1032#[inline]
1033fn satisfying_max_iterations(prgr: &Progress, max_iterations: usize) -> bool {
1034    if max_iterations == 0 {
1035        false
1036    } else if prgr.niter >= max_iterations {
1037        warn!("max iterations reached!");
1038        true
1039    } else {
1040        false
1041    }
1042}
1043
1044/// Maximum number of function evaluations
1045#[inline]
1046fn satisfying_max_evaluations(prgr: &Progress, max_evaluations: usize) -> bool {
1047    if max_evaluations == 0 {
1048        false
1049    } else if prgr.neval >= max_evaluations {
1050        warn!("Max allowed evaluations reached!");
1051        true
1052    } else {
1053        false
1054    }
1055}
1056
1057#[inline]
1058fn satisfying_max_gnorm(prgr: &Progress, max_gnorm: f64) -> bool {
1059    prgr.gx.vec2norm() <= max_gnorm
1060}
1061
1062/// Functiona value (fx) delta based stopping criterion
1063///
1064/// Test for stopping criterion.
1065/// The criterion is given by the following formula:
1066///    |f(past_x) - f(x)| / f(x) < delta
1067///
1068/// # Parameters
1069///
1070/// * pf: an array for storing previous values of the objective function.
1071/// * delta: max fx delta allowed
1072///
1073#[inline]
1074fn satisfying_delta<'a>(prgr: &Progress, pf: &'a mut [f64], delta: f64) -> bool {
1075    let k = prgr.niter;
1076    let fx = prgr.fx;
1077    let past = pf.len();
1078    if past < 1 {
1079        return false;
1080    }
1081
1082    // We don't test the stopping criterion while k < past.
1083    if past <= k {
1084        // Compute the relative improvement from the past.
1085        let rate = (pf[(k % past) as usize] - fx).abs() / fx;
1086        // The stopping criterion.
1087        if rate < delta {
1088            info!("The stopping criterion.");
1089            return true;
1090        }
1091    }
1092    // Store the current value of the objective function.
1093    pf[(k % past) as usize] = fx;
1094
1095    false
1096}
1097// stopping conditions:1 ends here