optimization_solvers/steepest_descent/pnorm_descent.rs

use super::*;

// All the algorithms in the steepest descent family differ only in how they compute the
// descent direction, i.e. in the norm whose unit ball is the constraint set over which we
// search for the direction that minimizes the directional derivative at the current iterate.
// That minimizer is typically a unit vector, but any positively scaled version of it works
// (the line search adjusts the step length afterwards), so we return whichever rescaling is
// cheapest to compute.

// The steepest descent family has (at most) a linear convergence rate. This can be seen by
// tracking an upper bound on the log-suboptimality ln(f(x_k) - p^*), where p^* is the optimal
// value of the problem: the bound decreases by a fixed amount per iteration, and that amount
// degrades badly when the condition number of the Hessian is large (solve the bound for the
// iteration count k to see it). For a symmetric positive definite Hessian H the condition
// number is the ratio between its maximal and minimal eigenvalues. Geometrically, the
// ellipsoid {x : x^T H x <= 1} has semi-axes 1/sqrt(lambda_i), so a large condition number
// means a very elongated ellipsoid; its volume is gamma_n / sqrt(det H), where gamma_n is the
// volume of the Euclidean unit ball. The p-norm descent tackles this by picking a matrix P
// that proxies the Hessian well (i.e. {x : x^T P x <= 1} is a good approximation of the
// sublevel sets of the function): this change of metric reduces the condition number of
// P^{-1/2} H P^{-1/2}, which then resembles (more or less) the identity matrix. Newton and
// quasi-Newton methods become much clearer from this intuition.
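//
// A minimal worked instance of this bound (a sketch, assuming f is strongly convex with
// m*I <= Hessian <= M*I everywhere and exact line search; see e.g. Boyd & Vandenberghe,
// Convex Optimization, ch. 9):
//
//   f(x_{k+1}) - p^*  <=  (1 - m/M) * (f(x_k) - p^*)
//   =>  ln(f(x_k) - p^*)  <=  k * ln(1 - m/M) + ln(f(x_0) - p^*)
//
// Reaching accuracy eps therefore takes about ln((f(x_0) - p^*) / eps) / (-ln(1 - m/M))
// iterations, and since -ln(1 - m/M) is roughly m/M when M/m is large, the iteration count
// grows about linearly in the condition number bound M/m.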

// Notice that the p-norm descent with P equal to the identity matrix is just plain gradient
// descent.
// This approach finds the steepest descent direction by minimizing the directional derivative
// at the current iterate over the ellipsoid {d : d^T P d <= 1}, which is the unit ball of the
// quadratic norm ||d||_P = ||P^{1/2} d||_2.
// The best choice is a matrix P (whose inverse we then precompute) that approximates the
// Hessian of the function well: this keeps the condition number of the rescaled problem under
// control and improves the convergence rate of the algorithm. Newton and quasi-Newton methods
// are born from this very rationale.
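//
// A short derivation sketch of the direction used below (assuming P is symmetric positive
// definite, and writing g = grad f(x_k)): minimizing g^T d subject to d^T P d <= 1 via the
// Lagrangian condition g + 2 * nu * P * d = 0 gives d proportional to -P^{-1} g, normalized as
// d* = -P^{-1} g / sqrt(g^T P^{-1} g). Since the line search rescales the step anyway, the
// code returns the cheaper unnormalized vector -P^{-1} g. For the quadratic in the tests
// below, f(x) = 0.5 * (x_1^2 + gamma * x_2^2), choosing P equal to the (constant) Hessian
// gives -P^{-1} g = -x, i.e. the direction points straight at the minimizer.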

#[derive(derive_getters::Getters)]
pub struct PnormDescent {
    pub grad_tol: Floating,
    pub x: DVector<Floating>,
    pub k: usize,
    pub inverse_p: DMatrix<Floating>,
}

impl PnormDescent {
    pub fn new(grad_tol: Floating, x0: DVector<Floating>, inverse_p: DMatrix<Floating>) -> Self {
        PnormDescent {
            grad_tol,
            x: x0,
            k: 0,
            inverse_p,
        }
    }
}

impl ComputeDirection for PnormDescent {
    fn compute_direction(
        &mut self,
        eval: &FuncEvalMultivariate,
    ) -> Result<DVector<Floating>, SolverError> {
        // Unnormalized steepest descent direction in the P-norm: -P^{-1} * grad f(x_k).
        Ok(-&self.inverse_p * eval.g())
    }
}

impl LineSearchSolver for PnormDescent {
    fn xk(&self) -> &DVector<Floating> {
        &self.x
    }
    fn xk_mut(&mut self) -> &mut DVector<Floating> {
        &mut self.x
    }
    fn k(&self) -> &usize {
        &self.k
    }
    fn k_mut(&mut self) -> &mut usize {
        &mut self.k
    }
    fn has_converged(&self, eval: &FuncEvalMultivariate) -> bool {
        // we verify that the norm of the gradient is below the tolerance.
        let grad = eval.g();
        // we compute the infinity norm of the gradient
        grad.iter()
            .fold(Floating::NEG_INFINITY, |acc, x| x.abs().max(acc))
            < self.grad_tol
    }

    fn update_next_iterate<LS: LineSearch>(
        &mut self,
        line_search: &mut LS,
        eval_x_k: &FuncEvalMultivariate,
        oracle: &mut impl FnMut(&DVector<Floating>) -> FuncEvalMultivariate,
        direction: &DVector<Floating>,
        max_iter_line_search: usize,
    ) -> Result<(), SolverError> {
        let step = line_search.compute_step_len(
            self.xk(),
            eval_x_k,
            direction,
            oracle,
            max_iter_line_search,
        );

        debug!(target: "pnorm_descent", "ITERATE: {} + {} * {} = {}", self.xk(), step, direction, self.xk() + step * direction);

        let next_iterate = self.xk() + step * direction;

        *self.xk_mut() = next_iterate;

        Ok(())
    }
}

#[cfg(test)]
mod pnorm_descent_test {
    use super::*;

    #[test]
    pub fn pnorm_morethuente() {
        std::env::set_var("RUST_LOG", "info");

        let _ = Tracer::default()
            .with_stdout_layer(Some(LogFormat::Normal))
            .build();
        let gamma = 90.0;
        let f_and_g = |x: &DVector<Floating>| -> FuncEvalMultivariate {
            let f = 0.5 * (x[0].powi(2) + gamma * x[1].powi(2));
            let g = DVector::from(vec![x[0], gamma * x[1]]);
            (f, g).into()
        };
        // We compute the inverse Hessian of the function. Notice that the Hessian is constant since the objective function is quadratic.
        let inv_hessian = DMatrix::from_iterator(2, 2, vec![1.0, 0.0, 0.0, 1.0 / gamma]);
        // let inv_hessian = DMatrix::identity(2, 2);

        // Linesearch builder
        let mut ls = MoreThuente::default();

        // pnorm descent builder
        let tol = 1e-12;
        let x_0 = DVector::from(vec![180.0, 152.0]);
        let mut gd = PnormDescent::new(tol, x_0, inv_hessian);

        // Minimization
        let max_iter_solver = 1000;
        let max_iter_line_search = 100;

        gd.minimize(
            &mut ls,
            f_and_g,
            max_iter_solver,
            max_iter_line_search,
            None,
        )
        .unwrap();

        println!("Iterate: {:?}", gd.xk());

        let eval = f_and_g(gd.xk());
        println!("Function eval: {:?}", eval);
        println!("Gradient norm: {:?}", eval.g().norm());
        println!("tol: {:?}", tol);

        let convergence = gd.has_converged(&eval);
        println!("Convergence: {:?}", convergence);

        assert!((eval.f() - 0.0).abs() < 1e-6);
    }

    #[test]
    pub fn pnorm_backtracking() {
        std::env::set_var("RUST_LOG", "info");

        let _ = Tracer::default()
            .with_stdout_layer(Some(LogFormat::Normal))
            .build();
        let gamma = 90.0;
        let f_and_g = |x: &DVector<Floating>| -> FuncEvalMultivariate {
            let f = 0.5 * (x[0].powi(2) + gamma * x[1].powi(2));
            let g = DVector::from(vec![x[0], gamma * x[1]]);
            (f, g).into()
        };
        // We compute the inverse Hessian of the function. Notice that the Hessian is constant since the objective function is quadratic.
        let inv_hessian = DMatrix::from_iterator(2, 2, vec![1.0, 0.0, 0.0, 1.0 / gamma]);
        // let inv_hessian = DMatrix::identity(2, 2);

        // Linesearch builder
        let alpha = 1e-4;
        let beta = 0.5;
        let mut ls = BackTracking::new(alpha, beta);

        // pnorm descent builder
        let tol = 1e-12;
        let x_0 = DVector::from(vec![180.0, 152.0]);
        let mut gd = PnormDescent::new(tol, x_0, inv_hessian);

        // Minimization
        let max_iter_solver = 1000;
        let max_iter_line_search = 100;

        gd.minimize(
            &mut ls,
            f_and_g,
            max_iter_solver,
            max_iter_line_search,
            None,
        )
        .unwrap();

        println!("Iterate: {:?}", gd.xk());

        let eval = f_and_g(gd.xk());
        println!("Function eval: {:?}", eval);
        println!("Gradient norm: {:?}", eval.g().norm());
        println!("tol: {:?}", tol);

        let convergence = gd.has_converged(&eval);
        println!("Convergence: {:?}", convergence);

        assert!((eval.f() - 0.0).abs() < 1e-6);
    }
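
    // A minimal sketch illustrating the note at the top of this file: with P = identity the
    // p-norm direction collapses to -grad f, i.e. plain gradient descent. The setup mirrors
    // the tests above; the tolerance is looser and the iteration budget larger because,
    // without the Hessian proxy, convergence on this ill-conditioned quadratic is much slower.
    #[test]
    pub fn pnorm_identity_is_gradient_descent() {
        let gamma = 90.0;
        let f_and_g = |x: &DVector<Floating>| -> FuncEvalMultivariate {
            let f = 0.5 * (x[0].powi(2) + gamma * x[1].powi(2));
            let g = DVector::from(vec![x[0], gamma * x[1]]);
            (f, g).into()
        };
        // P = I, hence inverse_p = I and compute_direction returns -g.
        let inverse_p = DMatrix::identity(2, 2);

        // Linesearch builder
        let mut ls = MoreThuente::default();

        // pnorm descent builder
        let tol = 1e-6;
        let x_0 = DVector::from(vec![180.0, 152.0]);
        let mut gd = PnormDescent::new(tol, x_0, inverse_p);

        // Minimization
        let max_iter_solver = 10000;
        let max_iter_line_search = 100;

        gd.minimize(
            &mut ls,
            f_and_g,
            max_iter_solver,
            max_iter_line_search,
            None,
        )
        .unwrap();

        let eval = f_and_g(gd.xk());
        assert!((eval.f() - 0.0).abs() < 1e-6);
    }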
}