liblbfgs/
lbfgs.rs

1// [[file:../lbfgs.note::*header][header:1]]
2//!       Limited memory BFGS (L-BFGS).
3//
4//  Copyright (c) 1990, Jorge Nocedal
5//  Copyright (c) 2007-2010 Naoaki Okazaki
6//  Copyright (c) 2018-2019 Wenping Guo
7//  All rights reserved.
8//
9//  Permission is hereby granted, free of charge, to any person obtaining a copy
10//  of this software and associated documentation files (the "Software"), to deal
11//  in the Software without restriction, including without limitation the rights
12//  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13//  copies of the Software, and to permit persons to whom the Software is
14//  furnished to do so, subject to the following conditions:
15//
16//  The above copyright notice and this permission notice shall be included in
17//  all copies or substantial portions of the Software.
18//
19//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24//  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25//  THE SOFTWARE.
26//
27// This library is a C port of the FORTRAN implementation of Limited-memory
28// Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
29// The original FORTRAN source code is available at:
30// http://www.ece.northwestern.edu/~nocedal/lbfgs.html
31//
32// The L-BFGS algorithm is described in:
33//     - Jorge Nocedal.
34//       Updating Quasi-Newton Matrices with Limited Storage.
35//       <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
36//     - Dong C. Liu and Jorge Nocedal.
37//       On the limited memory BFGS method for large scale optimization.
38//       <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
39//
40// The line search algorithms used in this implementation are described in:
41//     - John E. Dennis and Robert B. Schnabel.
42//       <i>Numerical Methods for Unconstrained Optimization and Nonlinear
43//       Equations</i>, Englewood Cliffs, 1983.
44//     - Jorge J. More and David J. Thuente.
45//       Line search algorithm with guaranteed sufficient decrease.
46//       <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
47//       pp. 286-307, 1994.
48//
49// This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
50// method presented in:
51//     - Galen Andrew and Jianfeng Gao.
52//       Scalable training of L1-regularized log-linear models.
53//       In <i>Proceedings of the 24th International Conference on Machine
54//       Learning (ICML 2007)</i>, pp. 33-40, 2007.
55
56// I would like to thank the original author, Jorge Nocedal, who has been
57// distributing the effieicnt and explanatory implementation in an open source
58// licence.
59// header:1 ends here
60
61// [[file:../lbfgs.note::*imports][imports:1]]
62use crate::core::*;
63
64use crate::math::LbfgsMath;
65use crate::line::*;
66// imports:1 ends here
67
68// [[file:../lbfgs.note::*parameters][parameters:1]]
69/// L-BFGS optimization parameters.
70///
71/// Call lbfgs_parameter_t::default() function to initialize parameters to the
72/// default values.
73#[derive(Copy, Clone, Debug)]
74#[repr(C)]
75pub struct LbfgsParam {
76    /// The number of corrections to approximate the inverse hessian matrix.
77    ///
78    /// The L-BFGS routine stores the computation results of previous \ref m
79    /// iterations to approximate the inverse hessian matrix of the current
80    /// iteration. This parameter controls the size of the limited memories
81    /// (corrections). The default value is 6. Values less than 3 are not
82    /// recommended. Large values will result in excessive computing time.
83    pub m: usize,
84
85    /// Epsilon for convergence test.
86    ///
87    /// This parameter determines the accuracy with which the solution is to be
88    /// found. A minimization terminates when
89    ///
90    /// ||g|| < epsilon * max(1, ||x||),
91    ///
92    /// where ||.|| denotes the Euclidean (L2) norm. The default value is \c
93    /// 1e-5.
94    pub epsilon: f64,
95
96    /// Distance for delta-based convergence test.
97    ///
98    /// This parameter determines the distance, in iterations, to compute the
99    /// rate of decrease of the objective function. If the value of this
100    /// parameter is zero, the library does not perform the delta-based
101    /// convergence test.
102    ///
103    /// The default value is 0.
104    pub past: usize,
105
106    /// Delta for convergence test.
107    ///
108    /// This parameter determines the minimum rate of decrease of the objective
109    /// function. The library stops iterations when the following condition is
110    /// met: |f' - f| / f < delta, where f' is the objective value of past
111    /// iterations ago, and f is the objective value of the current iteration.
112    /// The default value is 1e-5.
113    ///
114    pub delta: f64,
115
116    /// The maximum number of LBFGS iterations.
117    ///
118    /// The lbfgs optimization terminates when the iteration count exceedes this
119    /// parameter.
120    ///
121    /// Setting this parameter to zero continues an optimization process until a
122    /// convergence or error. The default value is 0.
123    pub max_iterations: usize,
124
125    /// The maximum allowed number of evaluations of function value and
126    /// gradients. This number could be larger than max_iterations since line
127    /// search procedure may involve one or more evaluations.
128    ///
129    /// Setting this parameter to zero continues an optimization process until a
130    /// convergence or error. The default value is 0.
131    pub max_evaluations: usize,
132
133    /// The line search options.
134    ///
135    ///  This parameter specifies a line search algorithm to be used by the
136    ///  L-BFGS routine.
137    ///
138    pub linesearch: LineSearch,
139
140    /// Enable OWL-QN regulation or not
141    pub orthantwise: bool,
142
143    // FIXME: better name
144    pub owlqn: Orthantwise,
145
146    /// A factor for scaling initial step size.
147    pub initial_inverse_hessian: f64,
148
149    /// The maximum allowed step size for each optimization step, useful for
150    /// preventing wild step.
151    pub max_step_size: f64,
152
153    /// Powell damping
154    pub damping: bool,
155}
156
157impl Default for LbfgsParam {
158    /// Initialize L-BFGS parameters to the default values.
159    ///
160    /// Call this function to fill a parameter structure with the default values
161    /// and overwrite parameter values if necessary.
162    fn default() -> Self {
163        LbfgsParam {
164            m: 6,
165            epsilon: 1e-5,
166            past: 0,
167            delta: 1e-5,
168            max_iterations: 0,
169            max_evaluations: 0,
170            orthantwise: false,
171            owlqn: Orthantwise::default(),
172            linesearch: LineSearch::default(),
173            initial_inverse_hessian: 1.0,
174            max_step_size: 1.0,
175            damping: false,
176        }
177    }
178}
179// parameters:1 ends here
180
181// [[file:../lbfgs.note::*problem][problem:1]]
182/// Represents an optimization problem.
183///
184/// `Problem` holds input variables `x`, gradient `gx` arrays, and function value `fx`.
185pub struct Problem<'a, E>
186where
187    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
188{
189    /// x is an array of length n. on input it must contain the base point for
190    /// the line search.
191    pub x: &'a mut [f64],
192
193    /// `fx` is a variable. It must contain the value of problem `f` at
194    /// x.
195    pub fx: f64,
196
197    /// `gx` is an array of length n. It must contain the gradient of `f` at
198    /// x.
199    pub gx: Vec<f64>,
200
201    /// Cached position vector of previous step.
202    xp: Vec<f64>,
203
204    /// Cached gradient vector of previous step.
205    gp: Vec<f64>,
206
207    /// Pseudo gradient for OrthantWise Limited-memory Quasi-Newton (owlqn) algorithm.
208    pg: Vec<f64>,
209
210    /// Search direction
211    d: Vec<f64>,
212
213    /// Store callback function for evaluating objective function.
214    eval_fn: E,
215
216    /// Orthantwise operations
217    owlqn: Option<Orthantwise>,
218
219    /// Evaluated or not
220    evaluated: bool,
221
222    /// The number of evaluation.
223    neval: usize,
224}
225
226impl<'a, E> Problem<'a, E>
227where
228    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
229{
230    /// Initialize problem with array length n
231    pub fn new(x: &'a mut [f64], eval: E, owlqn: Option<Orthantwise>) -> Self {
232        let n = x.len();
233        Problem {
234            fx: 0.0,
235            gx: vec![0.0; n],
236            xp: vec![0.0; n],
237            gp: vec![0.0; n],
238            pg: vec![0.0; n],
239            d: vec![0.0; n],
240            evaluated: false,
241            neval: 0,
242            x,
243            eval_fn: eval,
244            owlqn,
245        }
246    }
247
248    /// Compute the initial gradient in the search direction.
249    pub fn dginit(&self) -> Result<f64> {
250        if self.owlqn.is_none() {
251            let dginit = self.gx.vecdot(&self.d);
252            if dginit > 0.0 {
253                warn!(
254                    "The current search direction increases the objective function value. dginit = {:-0.4}",
255                    dginit
256                );
257            }
258
259            Ok(dginit)
260        } else {
261            Ok(self.pg.vecdot(&self.d))
262        }
263    }
264
265    /// Update search direction using evaluated gradient.
266    pub fn update_search_direction(&mut self) {
267        if self.owlqn.is_some() {
268            self.d.vecncpy(&self.pg);
269        } else {
270            self.d.vecncpy(&self.gx);
271        }
272    }
273
274    /// Return a reference to current search direction vector
275    pub fn search_direction(&self) -> &[f64] {
276        &self.d
277    }
278
279    /// Return a mutable reference to current search direction vector
280    pub fn search_direction_mut(&mut self) -> &mut [f64] {
281        &mut self.d
282    }
283
284    /// Compute the gradient in the search direction without sign checking.
285    pub fn dg_unchecked(&self) -> f64 {
286        self.gx.vecdot(&self.d)
287    }
288
289    // FIXME: improve
290    pub fn evaluate(&mut self) -> Result<()> {
291        self.fx = (self.eval_fn)(&self.x, &mut self.gx)?;
292        // self.fx = self.eval_fn.evaluate(&self.x, &mut self.gx)?;
293
294        // Compute the L1 norm of the variables and add it to the object value.
295        if let Some(owlqn) = self.owlqn {
296            self.fx += owlqn.x1norm(&self.x)
297        }
298
299        // FIXME: to be better
300        // if self.orthantwise {
301        // Compute the L1 norm of the variable and add it to the object value.
302        // fx += self.owlqn.x1norm(x);
303        // self.owlqn.pseudo_gradient(&mut pg, &x, &g);
304
305        self.evaluated = true;
306        self.neval += 1;
307
308        Ok(())
309    }
310
311    /// Return total number of evaluations.
312    pub fn number_of_evaluation(&self) -> usize {
313        self.neval
314    }
315
316    /// Test if `Problem` has been evaluated or not
317    pub fn evaluated(&self) -> bool {
318        self.evaluated
319    }
320
321    /// Copies all elements from src into self.
322    pub fn clone_from(&mut self, src: &Problem<E>) {
323        self.x.clone_from_slice(&src.x);
324        self.gx.clone_from_slice(&src.gx);
325        self.fx = src.fx;
326    }
327
328    /// Take a line step along search direction.
329    ///
330    /// Compute the current value of x: x <- x + (*step) * d.
331    ///
332    pub fn take_line_step(&mut self, step: f64) {
333        self.x.veccpy(&self.xp);
334        self.x.vecadd(&self.d, step);
335
336        // Choose the orthant for the new point.
337        // The current point is projected onto the orthant.
338        if let Some(owlqn) = self.owlqn {
339            owlqn.project(&mut self.x, &self.xp, &self.gp);
340        }
341    }
342
343    /// Return gradient vector norm: ||gx||
344    pub fn gnorm(&self) -> f64 {
345        if self.owlqn.is_some() {
346            self.pg.vec2norm()
347        } else {
348            self.gx.vec2norm()
349        }
350    }
351
352    /// Return position vector norm: ||x||
353    pub fn xnorm(&self) -> f64 {
354        self.x.vec2norm()
355    }
356
357    pub fn orthantwise(&self) -> bool {
358        self.owlqn.is_some()
359    }
360
361    /// Revert to previous step
362    pub fn revert(&mut self) {
363        self.x.veccpy(&self.xp);
364        self.gx.veccpy(&self.gp);
365    }
366
367    /// Store the current position and gradient vectors.
368    pub fn save_state(&mut self) {
369        self.xp.veccpy(&self.x);
370        self.gp.veccpy(&self.gx);
371    }
372
373    /// Constrain the search direction for orthant-wise updates.
374    pub fn constrain_search_direction(&mut self) {
375        if let Some(owlqn) = self.owlqn {
376            owlqn.constrain(&mut self.d, &self.pg);
377        }
378    }
379
380    // FIXME
381    pub fn update_owlqn_gradient(&mut self) {
382        if let Some(owlqn) = self.owlqn {
383            owlqn.pseudo_gradient(&mut self.pg, &self.x, &self.gx);
384        }
385    }
386}
387// problem:1 ends here
388
389// [[file:../lbfgs.note::*progress][progress:1]]
390/// Store optimization progress data, for progress monitor
391#[repr(C)]
392#[derive(Debug, Clone)]
393pub struct Progress<'a> {
394    /// The current values of variables
395    pub x: &'a [f64],
396
397    /// The current gradient values of variables.
398    pub gx: &'a [f64],
399
400    /// The current value of the objective function.
401    pub fx: f64,
402
403    /// The Euclidean norm of the variables
404    pub xnorm: f64,
405
406    /// The Euclidean norm of the gradients.
407    pub gnorm: f64,
408
409    /// The line-search step used for this iteration.
410    pub step: f64,
411
412    /// The iteration count.
413    pub niter: usize,
414
415    /// The total number of evaluations.
416    pub neval: usize,
417
418    /// The number of function evaluation calls in line search procedure
419    pub ncall: usize,
420}
421
422impl<'a> Progress<'a> {
423    fn new<E>(prb: &'a Problem<E>, niter: usize, ncall: usize, step: f64) -> Self
424    where
425        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
426    {
427        Progress {
428            x: &prb.x,
429            gx: &prb.gx,
430            fx: prb.fx,
431            xnorm: prb.xnorm(),
432            gnorm: prb.gnorm(),
433            neval: prb.number_of_evaluation(),
434            ncall,
435            step,
436            niter,
437        }
438    }
439}
440
441pub struct Report {
442    /// The current value of the objective function.
443    pub fx: f64,
444
445    /// The Euclidean norm of the variables
446    pub xnorm: f64,
447
448    /// The Euclidean norm of the gradients.
449    pub gnorm: f64,
450
451    /// The total number of evaluations.
452    pub neval: usize,
453}
454
455impl Report {
456    fn new<E>(prb: &Problem<E>) -> Self
457    where
458        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
459    {
460        Self {
461            fx: prb.fx,
462            xnorm: prb.xnorm(),
463            gnorm: prb.gnorm(),
464            neval: prb.number_of_evaluation(),
465        }
466    }
467}
468// progress:1 ends here
469
470// [[file:../lbfgs.note::*orthantwise][orthantwise:1]]
471/// Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) algorithm
472#[derive(Copy, Clone, Debug)]
473pub struct Orthantwise {
474    /// Coeefficient for the L1 norm of variables.
475    ///
476    ///  Setting this parameter to a positive value activates Orthant-Wise
477    ///  Limited-memory Quasi-Newton (OWL-QN) method, which minimizes the
478    ///  objective function F(x) combined with the L1 norm |x| of the variables,
479    ///  {F(x) + C |x|}. This parameter is the coeefficient for the |x|, i.e.,
480    ///  C. As the L1 norm |x| is not differentiable at zero, the library
481    ///  modifies function and gradient evaluations from a client program
482    ///  suitably; a client program thus have only to return the function value
483    ///  F(x) and gradients G(x) as usual. The default value is 1.
484    pub c: f64,
485
486    /// Start index for computing L1 norm of the variables.
487    ///
488    /// This parameter is valid only for OWL-QN method (i.e., orthantwise_c !=
489    /// 0). This parameter b (0 <= b < N) specifies the index number from which
490    /// the library computes the L1 norm of the variables x,
491    ///
492    /// |x| := |x_{b}| + |x_{b+1}| + ... + |x_{N}| .
493    ///
494    /// In other words, variables x_1, ..., x_{b-1} are not used for computing
495    /// the L1 norm. Setting b (0 < b < N), one can protect variables, x_1, ...,
496    /// x_{b-1} (e.g., a bias term of logistic regression) from being
497    /// regularized. The default value is zero.
498    pub start: i32,
499
500    /// End index for computing L1 norm of the variables.
501    ///
502    /// This parameter is valid only for OWL-QN method (i.e., \ref orthantwise_c
503    /// != 0). This parameter e (0 < e <= N) specifies the index number at which
504    /// the library stops computing the L1 norm of the variables x,
505    pub end: i32,
506}
507
508impl Default for Orthantwise {
509    fn default() -> Self {
510        Orthantwise {
511            c: 1.0,
512            start: 0,
513            end: -1,
514        }
515    }
516}
517
518impl Orthantwise {
519    // FIXME: remove
520    // a dirty wrapper for start and end
521    fn start_end(&self, x: &[f64]) -> (usize, usize) {
522        let start = self.start as usize;
523        let end = if self.end < 0 {
524            x.len()
525        } else {
526            self.end as usize
527        };
528
529        (start, end)
530    }
531
532    /// Compute the L1 norm of the variables.
533    fn x1norm(&self, x: &[f64]) -> f64 {
534        let (start, end) = self.start_end(x);
535
536        let mut s = 0.0;
537        for i in start..end {
538            s += self.c * x[i].abs();
539        }
540
541        s
542    }
543
544    /// Compute the psuedo-gradients.
545    fn pseudo_gradient(&self, pg: &mut [f64], x: &[f64], g: &[f64]) {
546        let (start, end) = self.start_end(x);
547        let c = self.c;
548
549        // Compute the negative of gradients.
550        for i in 0..start {
551            pg[i] = g[i];
552        }
553
554        // Compute the psuedo-gradients.
555        for i in start..end {
556            if x[i] < 0.0 {
557                // Differentiable.
558                pg[i] = g[i] - c;
559            } else if 0.0 < x[i] {
560                pg[i] = g[i] + c;
561            } else {
562                if g[i] < -c {
563                    // Take the right partial derivative.
564                    pg[i] = g[i] + c;
565                } else if c < g[i] {
566                    // Take the left partial derivative.
567                    pg[i] = g[i] - c;
568                } else {
569                    pg[i] = 0.;
570                }
571            }
572        }
573
574        for i in end..g.len() {
575            pg[i] = g[i];
576        }
577    }
578
579    /// Choose the orthant for the new point.
580    ///
581    /// During the line search, each search point is projected onto the orthant
582    /// of the previous point.
583    fn project(&self, x: &mut [f64], xp: &[f64], gp: &[f64]) {
584        let (start, end) = self.start_end(xp);
585
586        for i in start..end {
587            let sign = if xp[i] == 0.0 { -gp[i] } else { xp[i] };
588            if x[i] * sign <= 0.0 {
589                x[i] = 0.0
590            }
591        }
592    }
593
594    fn constrain(&self, d: &mut [f64], pg: &[f64]) {
595        let (start, end) = self.start_end(pg);
596
597        for i in start..end {
598            if d[i] * pg[i] >= 0.0 {
599                d[i] = 0.0;
600            }
601        }
602    }
603}
604// orthantwise:1 ends here
605
606// [[file:../lbfgs.note::*builder][builder:1]]
607#[derive(Default, Debug, Clone)]
608pub struct Lbfgs {
609    param: LbfgsParam,
610}
611
612/// Create lbfgs optimizer with epsilon convergence
613impl Lbfgs {
614    /// Set scaled gradient norm for converence test
615    ///
616    /// This parameter determines the accuracy with which the solution is to be
617    /// found. A minimization terminates when
618    ///
619    /// ||g|| < epsilon * max(1, ||x||),
620    ///
621    /// where ||.|| denotes the Euclidean (L2) norm. The default value is 1e-5.
622    pub fn with_epsilon(mut self, epsilon: f64) -> Self {
623        assert!(epsilon.is_sign_positive(), "Invalid parameter epsilon specified.");
624
625        self.param.epsilon = epsilon;
626
627        self
628    }
629
630    /// Set initial step size for optimization. The default value is 1.0.
631    pub fn with_initial_step_size(mut self, b: f64) -> Self {
632        assert!(
633            b.is_sign_positive(),
634            "Invalid beta parameter for scaling the initial step size."
635        );
636
637        self.param.initial_inverse_hessian = b;
638
639        self
640    }
641
642    /// Set the maximum allowed step size for optimization. The default value is 1.0.
643    pub fn with_max_step_size(mut self, s: f64) -> Self {
644        assert!(s.is_sign_positive(), "Invalid max_step_size parameter.");
645
646        self.param.max_step_size = s;
647
648        self
649    }
650
651    /// Enable Powell damping.
652    pub fn with_damping(mut self, damped: bool) -> Self {
653        self.param.damping = damped;
654
655        self
656    }
657
658    /// Set orthantwise parameters
659    pub fn with_orthantwise(mut self, c: f64, start: usize, end: usize) -> Self {
660        assert!(
661            c.is_sign_positive(),
662            "Invalid parameter orthantwise c parameter specified."
663        );
664        warn!("Only the backtracking line search is available for OWL-QN algorithm.");
665
666        self.param.orthantwise = true;
667        self.param.owlqn.c = c;
668        self.param.owlqn.start = start as i32;
669        self.param.owlqn.end = end as i32;
670
671        self
672    }
673
674    /// A parameter to control the accuracy of the line search routine.
675    ///
676    /// The default value is 1e-4. This parameter should be greater
677    /// than zero and smaller than 0.5.
678    pub fn with_linesearch_ftol(mut self, ftol: f64) -> Self {
679        assert!(ftol >= 0.0, "Invalid parameter ftol specified.");
680        self.param.linesearch.ftol = ftol;
681
682        self
683    }
684
685    /// A parameter to control the accuracy of the line search routine.
686    ///
687    /// The default value is 0.9. If the function and gradient evaluations are
688    /// inexpensive with respect to the cost of the iteration (which is
689    /// sometimes the case when solving very large problems) it may be
690    /// advantageous to set this parameter to a small value. A typical small
691    /// value is 0.1. This parameter shuold be greater than the ftol parameter
692    /// (1e-4) and smaller than 1.0.
693    pub fn with_linesearch_gtol(mut self, gtol: f64) -> Self {
694        assert!(
695            gtol >= 0.0 && gtol < 1.0 && gtol > self.param.linesearch.ftol,
696            "Invalid parameter gtol specified."
697        );
698
699        self.param.linesearch.gtol = gtol;
700
701        self
702    }
703
704    /// Try to follow gradient only during optimization, by allowing object
705    /// value rises, which removes the sufficient decrease condition constrain
706    /// in line search. This option also implies Powell damping and
707    /// BacktrackingStrongWolfe line search for improving robustness.
708    pub fn with_gradient_only(mut self) -> Self {
709        self.param.linesearch.gradient_only = true;
710        self.param.damping = true;
711        self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe;
712
713        self
714    }
715
716    /// Set the max number of iterations for line search.
717    pub fn with_max_linesearch(mut self, n: usize) -> Self {
718        self.param.linesearch.max_linesearch = n;
719
720        self
721    }
722
723    /// xtol is a nonnegative input variable. termination occurs when the
724    /// relative width of the interval of uncertainty is at most xtol.
725    ///
726    /// The machine precision for floating-point values.
727    ///
728    ///  This parameter must be a positive value set by a client program to
729    ///  estimate the machine precision. The line search routine will terminate
730    ///  with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width
731    ///  of the interval of uncertainty is less than this parameter.
732    pub fn with_linesearch_xtol(mut self, xtol: f64) -> Self {
733        assert!(xtol >= 0.0, "Invalid parameter xtol specified.");
734
735        self.param.linesearch.xtol = xtol;
736        self
737    }
738
739    /// The minimum step of the line search routine.
740    ///
741    /// The default value is 1e-20. This value need not be modified unless the
742    /// exponents are too large for the machine being used, or unless the
743    /// problem is extremely badly scaled (in which case the exponents should be
744    /// increased).
745    pub fn with_linesearch_min_step(mut self, min_step: f64) -> Self {
746        assert!(min_step >= 0.0, "Invalid parameter min_step specified.");
747
748        self.param.linesearch.min_step = min_step;
749        self
750    }
751
752    /// Set the maximum number of iterations.
753    ///
754    /// The lbfgs optimization terminates when the iteration count exceedes this
755    /// parameter. Setting this parameter to zero continues an optimization
756    /// process until a convergence or error.
757    ///
758    /// The default value is 0.
759    pub fn with_max_iterations(mut self, niter: usize) -> Self {
760        self.param.max_iterations = niter;
761        self
762    }
763
764    /// The maximum allowed number of evaluations of function value and
765    /// gradients. This number could be larger than max_iterations since line
766    /// search procedure may involve one or more evaluations.
767    ///
768    /// Setting this parameter to zero continues an optimization process until a
769    /// convergence or error. The default value is 0.
770    pub fn with_max_evaluations(mut self, neval: usize) -> Self {
771        self.param.max_evaluations = neval;
772        self
773    }
774
775    /// This parameter determines the minimum rate of decrease of the objective
776    /// function. The library stops iterations when the following condition is
777    /// met: |f' - f| / f < delta, where f' is the objective value of past
778    /// iterations ago, and f is the objective value of the current iteration.
779    ///
780    /// If `past` is zero, the library does not perform the delta-based
781    /// convergence test.
782    ///
783    /// The default value of delta is 1e-5.
784    ///
785    pub fn with_fx_delta(mut self, delta: f64, past: usize) -> Self {
786        assert!(delta >= 0.0, "Invalid parameter delta specified.");
787
788        self.param.past = past;
789        self.param.delta = delta;
790        self
791    }
792
793    /// Select line search algorithm
794    ///
795    /// The default is "MoreThuente" line search algorithm.
796    pub fn with_linesearch_algorithm(mut self, algo: &str) -> Self {
797        match algo {
798            "MoreThuente" => self.param.linesearch.algorithm = LineSearchAlgorithm::MoreThuente,
799            "BacktrackingArmijo" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingArmijo,
800            "BacktrackingStrongWolfe" => self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingStrongWolfe,
801            "BacktrackingWolfe" | "Backtracking" => {
802                self.param.linesearch.algorithm = LineSearchAlgorithm::BacktrackingWolfe
803            }
804            _ => unimplemented!(),
805        }
806
807        self
808    }
809}
810// builder:1 ends here
811
812// [[file:../lbfgs.note::*hack][hack:1]]
813impl Lbfgs {
814    /// Start the L-BFGS optimization; this will invoke the callback functions evaluate
815    /// and progress.
816    /// 
817    /// # Parameters
818    /// 
819    /// * x       : The array of input variables.
820    /// * evaluate: A closure for evaluating function value and gradient
821    /// * progress: A closure for monitor progress or defining stopping condition
822    /// 
823    /// # Return
824    /// 
825    /// * on success, return final evaluated `Problem`.
826    pub fn minimize<E, G>(self, x: &mut [f64], eval_fn: E, mut prgr_fn: G) -> Result<Report>
827    where
828        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
829        G: FnMut(&Progress) -> bool,
830    {
831        let mut state = self.build(x, eval_fn)?;
832        info!("start lbfgs loop...");
833        for _ in 0.. {
834            if state.is_converged() {
835                break;
836            }
837            let prgr = state.get_progress();
838            let cancel = prgr_fn(&prgr);
839            if cancel {
840                info!("The minimization process has been canceled.");
841                break;
842            }
843            state.propagate()?;
844        }
845
846        // Return the final value of the objective function.
847        Ok(state.report())
848    }
849}
850// hack:1 ends here
851
852// [[file:../lbfgs.note::*state][state:1]]
853/// LBFGS optimization state allowing iterative propagation
854pub struct LbfgsState<'a, E>
855where
856    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
857{
858    /// LBFGS parameters
859    vars: LbfgsParam,
860
861    /// Define how to evaluate gradient and value
862    prbl: Option<Problem<'a, E>>,
863    end: usize,
864    step: f64,
865    k: usize,
866    lm_arr: Vec<IterationData>,
867    pf: Vec<f64>,
868    ncall: usize,
869}
870// state:1 ends here
871
872// [[file:../lbfgs.note::*build][build:1]]
873impl Lbfgs {
874    /// Build LBFGS state struct for iteration.
875    pub fn build<'a, E>(self, x: &'a mut [f64], eval_fn: E) -> Result<LbfgsState<'a, E>>
876    where
877        E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
878    {
879        // Initialize the limited memory.
880        let param = &self.param;
881        let lm_arr = (0..param.m).map(|_| IterationData::new(x.len())).collect();
882
883        // Allocate working space for LBFGS optimization
884        let owlqn = if param.orthantwise {
885            Some(param.owlqn.clone())
886        } else {
887            None
888        };
889        let mut problem = Problem::new(x, eval_fn, owlqn);
890
891        // Evaluate the function value and its gradient.
892        problem.evaluate()?;
893
894        // Compute the L1 norm of the variable and add it to the object value.
895        problem.update_owlqn_gradient();
896
897        // Compute the search direction with current gradient.
898        problem.update_search_direction();
899
900        // Compute the initial step:
901        let h0 = param.initial_inverse_hessian;
902        let step = problem.search_direction().vec2norminv() * h0;
903
904        // Apply Powell damping or not
905        let damping = param.damping;
906        if damping {
907            info!("Powell damping Enabled.");
908        }
909
910        let state = LbfgsState {
911            vars: self.param.clone(),
912            prbl: Some(problem),
913            end: 0,
914            step,
915            k: 0,
916            lm_arr,
917            pf: vec![],
918            ncall: 0,
919        };
920
921        Ok(state)
922    }
923}
924// build:1 ends here
925
926// [[file:../lbfgs.note::*propagate][propagate:1]]
927impl<'a, E> LbfgsState<'a, E>
928where
929    E: FnMut(&[f64], &mut [f64]) -> Result<f64>,
930{
931    /// Check if stopping critera met. Panics if not initialized.
932    pub fn is_converged(&mut self) -> bool {
933        // Monitor the progress.
934        let prgr = self.get_progress();
935        let converged = satisfying_stop_conditions(&self.vars, prgr);
936        converged
937    }
938
939    /// Report minimization progress. Panics if not initialized yet.
940    pub fn report(&self) -> Report {
941        Report::new(self.prbl.as_ref().expect("problem for report"))
942    }
943
944    /// Propagate in next LBFGS step. Return optimization progress on success.
945    /// Panics if not initialized.
946    pub fn propagate(&mut self) -> Result<Progress> {
947        self.k += 1;
948
949        // special case: already converged at the first point
950        if self.k == 1 {
951            let progress = self.get_progress();
952            return Ok(progress);
953        }
954
955        // Store the current position and gradient vectors.
956        let problem = self.prbl.as_mut().expect("problem for propagate");
957        problem.save_state();
958
959        // Search for an optimal step.
960        self.ncall = self
961            .vars
962            .linesearch
963            .find(problem, &mut self.step)
964            .context("Failure during line search")?;
965        problem.update_owlqn_gradient();
966
967        // Update LBFGS iteration data.
968        let it = &mut self.lm_arr[self.end];
969        let gamma = it.update(
970            &problem.x,
971            &problem.xp,
972            &problem.gx,
973            &problem.gp,
974            self.step,
975            self.vars.damping,
976        );
977
978        // Compute the steepest direction
979        problem.update_search_direction();
980        let d = problem.search_direction_mut();
981
982        // Apply LBFGS recursion procedure.
983        self.end = lbfgs_two_loop_recursion(&mut self.lm_arr, d, gamma, self.vars.m, self.k - 1, self.end);
984
985        // Now the search direction d is ready. Constrains the step size to
986        // prevent wild steps.
987        let dnorm = d.vec2norm();
988        self.step = self.vars.max_step_size.min(dnorm) / dnorm;
989
990        // Constrain the search direction for orthant-wise updates.
991        problem.constrain_search_direction();
992
993        let progress = self.get_progress();
994        Ok(progress)
995    }
996
997    fn get_progress(&self) -> Progress {
998        let problem = self.prbl.as_ref().expect("problem for progress");
999        Progress::new(&problem, self.k, self.ncall, self.step)
1000    }
1001}
1002// propagate:1 ends here
1003
1004// [[file:../lbfgs.note::*recursion][recursion:1]]
1005/// Algorithm 7.4, in Nocedal, J.; Wright, S. Numerical Optimization; Springer Science & Business Media, 2006.
1006fn lbfgs_two_loop_recursion(
1007    lm_arr: &mut [IterationData],
1008    d: &mut [f64], // search direction
1009    gamma: f64,    // H_k^{0} = \gamma I
1010    m: usize,
1011    k: usize,
1012    end: usize,
1013) -> usize {
1014    let end = (end + 1) % m;
1015    let mut j = end;
1016    let bound = m.min(k);
1017
1018    // L-BFGS two-loop recursion, part1
1019    for _ in 0..bound {
1020        j = (j + m - 1) % m;
1021        let it = &mut lm_arr[j as usize];
1022
1023        // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}.
1024        it.alpha = it.s.vecdot(&d) / it.ys;
1025        // q_{i} = q_{i+1} - \alpha_{i} y_{i}.
1026        d.vecadd(&it.y, -it.alpha);
1027    }
1028    d.vecscale(gamma);
1029
1030    // L-BFGS two-loop recursion, part2
1031    for _ in 0..bound {
1032        let it = &mut lm_arr[j as usize];
1033        // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}.
1034        let beta = it.y.vecdot(d) / it.ys;
1035        // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}.
1036        d.vecadd(&it.s, it.alpha - beta);
1037        j = (j + 1) % m;
1038    }
1039
1040    end
1041}
1042// recursion:1 ends here
1043
1044// [[file:../lbfgs.note::*iteration data][iteration data:1]]
1045/// Internal iternation data for L-BFGS
1046#[derive(Clone)]
1047struct IterationData {
1048    alpha: f64,
1049
1050    s: Vec<f64>,
1051
1052    y: Vec<f64>,
1053
1054    /// vecdot(y, s)
1055    ys: f64,
1056}
1057
1058impl IterationData {
1059    fn new(n: usize) -> Self {
1060        IterationData {
1061            alpha: 0.0,
1062            ys: 0.0,
1063            s: vec![0.0; n],
1064            y: vec![0.0; n],
1065        }
1066    }
1067
1068    /// Updates L-BFGS correction pairs, returns Cholesky factor \gamma for
1069    /// scaling the initial inverse Hessian matrix $H_k^0$
1070    ///
1071    /// # Arguments
1072    ///
1073    /// * x, xp: current position, and previous position
1074    /// * gx, gp: current gradient and previous gradient
1075    /// * step: step size along search direction
1076    /// * damping: applying Powell damping to the gradient difference `y` helps
1077    ///   stabilize L-BFGS from numerical noise in function value and gradient
1078    ///
1079    fn update(&mut self, x: &[f64], xp: &[f64], gx: &[f64], gp: &[f64], step: f64, damping: bool) -> f64 {
1080        // Update vectors s and y:
1081        // s_{k} = x_{k+1} - x_{k} = \alpha * d_{k}.
1082        // y_{k} = g_{k+1} - g_{k}.
1083        self.s.vecdiff(x, xp);
1084        self.y.vecdiff(gx, gp);
1085
1086        // Compute scalars ys and yy:
1087        // ys = y^t \cdot s = 1 / \rho.
1088        // yy = y^t \cdot y.
1089        // Notice that yy is used for scaling the intial inverse hessian matrix H_0 (Cholesky factor).
1090        let ys = self.y.vecdot(&self.s);
1091        let yy = self.y.vecdot(&self.y);
1092        self.ys = ys;
1093
1094        // Al-Baali2014JOTA: Damped Techniques for the Limited Memory BFGS
1095        // Method for Large-Scale Optimization. J. Optim. Theory Appl. 2014,
1096        // 161 (2), 688–699.
1097        //
1098        // Nocedal suggests an equivalent value of 0.8 for sigma2 (Damped BFGS
1099        // updating)
1100        let sigma2 = 0.6;
1101        let sigma3 = 3.0;
1102        if damping {
1103            debug!("Applying Powell damping, sigma2 = {}, sigma3 = {}", sigma2, sigma3);
1104
1105            // B_k * Sk = B_k * (x_k + step*d_k - x_k) = B_k * step * d_k = -g_k * step
1106            let mut bs = gp.to_vec();
1107            bs.vecscale(-step);
1108            // s_k^T * B_k * s_k
1109            let sbs = self.s.vecdot(&bs);
1110
1111            if ys < (1.0 - sigma2) * sbs {
1112                trace!("damping case1");
1113                let theta = sigma2 * sbs / (sbs - ys);
1114                bs.vecscale(1.0 - theta);
1115                bs.vecadd(&self.y, theta);
1116                self.y.veccpy(&bs);
1117            } else if ys > (1.0 + sigma3) * sbs {
1118                trace!("damping case2");
1119                let theta = sigma3 * sbs / (ys - sbs);
1120                bs.vecscale(1.0 - theta);
1121                bs.vecadd(&self.y, theta);
1122            } else {
1123                trace!("damping case3");
1124                // for theta = 1.0, yk = yk, so do nothing here.
1125            }
1126        }
1127
1128        ys / yy
1129    }
1130}
1131// iteration data:1 ends here
1132
1133// [[file:../lbfgs.note::*stopping conditions][stopping conditions:1]]
1134/// test if progress satisfying stop condition
1135#[inline]
1136fn satisfying_stop_conditions(param: &LbfgsParam, prgr: Progress) -> bool {
1137    // Buildin tests for stopping conditions
1138    if satisfying_max_iterations(&prgr, param.max_iterations)
1139        || satisfying_max_evaluations(&prgr, param.max_evaluations)
1140        || satisfying_scaled_gnorm(&prgr, param.epsilon)
1141    // || satisfying_delta(&prgr, pf, param.delta)
1142    // || satisfying_max_gnorm(&prgr, self.param.max_gnorm)
1143    {
1144        return true;
1145    }
1146
1147    false
1148}
1149
1150/// The criterion is given by the following formula:
1151///     |g(x)| / \max(1, |x|) < \epsilon
1152#[inline]
1153fn satisfying_scaled_gnorm(prgr: &Progress, epsilon: f64) -> bool {
1154    if prgr.gnorm / prgr.xnorm.max(1.0) <= epsilon {
1155        // Convergence.
1156        info!("L-BFGS reaches convergence.");
1157        true
1158    } else {
1159        false
1160    }
1161}
1162
1163/// Maximum number of lbfgs iterations.
1164#[inline]
1165fn satisfying_max_iterations(prgr: &Progress, max_iterations: usize) -> bool {
1166    if max_iterations == 0 {
1167        false
1168    } else if prgr.niter >= max_iterations {
1169        warn!("max iterations reached!");
1170        true
1171    } else {
1172        false
1173    }
1174}
1175
1176/// Maximum number of function evaluations
1177#[inline]
1178fn satisfying_max_evaluations(prgr: &Progress, max_evaluations: usize) -> bool {
1179    if max_evaluations == 0 {
1180        false
1181    } else if prgr.neval >= max_evaluations {
1182        warn!("Max allowed evaluations reached!");
1183        true
1184    } else {
1185        false
1186    }
1187}
1188
1189#[inline]
1190fn satisfying_max_gnorm(prgr: &Progress, max_gnorm: f64) -> bool {
1191    prgr.gx.vec2norm() <= max_gnorm
1192}
1193
1194/// Functiona value (fx) delta based stopping criterion
1195///
1196/// Test for stopping criterion.
1197/// The criterion is given by the following formula:
1198///    |f(past_x) - f(x)| / f(x) < delta
1199///
1200/// # Parameters
1201///
1202/// * pf: an array for storing previous values of the objective function.
1203/// * delta: max fx delta allowed
1204///
1205#[inline]
1206fn satisfying_delta<'a>(prgr: &Progress, pf: &'a mut [f64], delta: f64) -> bool {
1207    let k = prgr.niter;
1208    let fx = prgr.fx;
1209    let past = pf.len();
1210    if past < 1 {
1211        return false;
1212    }
1213
1214    // We don't test the stopping criterion while k < past.
1215    if past <= k {
1216        // Compute the relative improvement from the past.
1217        let rate = (pf[(k % past) as usize] - fx).abs() / fx;
1218        // The stopping criterion.
1219        if rate < delta {
1220            info!("The stopping criterion.");
1221            return true;
1222        }
1223    }
1224    // Store the current value of the objective function.
1225    pf[(k % past) as usize] = fx;
1226
1227    false
1228}
1229// stopping conditions:1 ends here
liblbfgs/lbfgs.rs

liblbfgs/
lbfgs.rs