gam_geometry/optimizer.rs
1use ndarray::{Array1, ArrayView1};
2
3use crate::manifold::{GeometryResult, RiemannianManifold, check_len, quad_form};
4
5/// Linear factor of the Steihaug truncated-CG forcing sequence: the inner CG
6/// solve is terminated once the residual drops to `min(η·‖r₀‖, ‖r₀‖²)`. The
7/// quadratic `‖r₀‖²` term gives the super-linear convergence of an inexact
8/// Newton step near the optimum, while `η·‖r₀‖` caps wasted inner work far from
9/// it (Nocedal & Wright, *Numerical Optimization*, §7.1, eq. 7.3).
10const STEIHAUG_CG_FORCING_FACTOR: f64 = 1.0e-2;
11
12pub trait RiemannianObjective {
13 fn value_gradient(&mut self, point: ArrayView1<'_, f64>) -> GeometryResult<(f64, Array1<f64>)>;
14
15 /// Riemannian Hessian–vector product `H(x)·v` for a tangent direction `v`
16 /// at `point`, returned in the same ambient/tangent coordinates as the
17 /// gradient.
18 ///
19 /// This is what upgrades the trust-region subproblem from a Cauchy-point
20 /// step (the exact minimizer of the *linear* model along the steepest
21 /// descent direction) to a Steihaug truncated-CG step that exploits real
22 /// curvature. An objective that exposes no second-order information returns
23 /// `None` (the default), and the trust region transparently falls back to
24 /// the Cauchy point — never to plain clipped steepest descent, which has no
25 /// model, no predicted/actual reduction ratio, and no accept/reject.
26 ///
27 /// The Riemannian-Hessian quadratic model the trust region builds from this
28 /// product is a valid second-order model of `f` only along a (≥)second-order
29 /// retraction (the exponential map, or any retraction with
30 /// [`RiemannianManifold::retraction_is_second_order`] `== true`). On a
31 /// manifold whose `retract` is only FIRST-order (e.g. the Stiefel/Grassmann
32 /// QR retraction) the second derivative of the pullback `f∘R_x` is not the
33 /// Riemannian Hessian, so the trust region ignores this curvature and uses
34 /// the first-order-correct Cauchy model instead (issue #956).
35 fn hessian_vector_product(
36 &mut self,
37 point: ArrayView1<'_, f64>,
38 tangent: ArrayView1<'_, f64>,
39 ) -> GeometryResult<Option<Array1<f64>>> {
40 // Validate the shapes the contract requires (a tangent at `point`), then
41 // report "no curvature available" so the trust region selects the
42 // Cauchy point. We never fabricate a Hessian here.
43 check_len("hessian_vector_product tangent", tangent.len(), point.len())?;
44 Ok(None)
45 }
46}
47
48/// Metric inner product `g_x(a, b) = aᵀ G(x) b` using the manifold metric
49/// tensor at `point`. For manifolds whose metric is the ambient identity
50/// (Euclidean, Sphere, Circle, Torus, …) this reduces to the Euclidean dot
51/// product; for a genuine Riemannian metric (e.g. the affine-invariant SPD
52/// metric) it evaluates the correct geometric inner product on the tangent
53/// space. Every norm and inner product in both optimizers below routes through
54/// this so the algorithms are metric-correct on curved manifolds.
55fn g_inner(
56 manifold: &dyn RiemannianManifold,
57 point: ArrayView1<'_, f64>,
58 a: ArrayView1<'_, f64>,
59 b: ArrayView1<'_, f64>,
60) -> GeometryResult<f64> {
61 let g = manifold.metric_tensor(point)?;
62 Ok(quad_form(g.view(), a, b))
63}
64
65fn g_norm(
66 manifold: &dyn RiemannianManifold,
67 point: ArrayView1<'_, f64>,
68 a: ArrayView1<'_, f64>,
69) -> GeometryResult<f64> {
70 Ok(g_inner(manifold, point, a, a)?.max(0.0).sqrt())
71}
72
73/// Shift-invariant relative-gradient stationarity measure
74/// `‖grad_k‖_g / max(‖grad_0‖_g, 1)`, comparing the current Riemannian gradient
75/// norm to the gradient norm at the INITIAL iterate. The initial gradient norm
76/// carries the same *multiplicative* scale the objective and its gradient share
77/// (`f → c·f` ⇒ `grad → c·grad`), so a fixed `grad_tol` still reads as a
78/// *relative* tolerance — but, unlike dividing by `max(|f|, 1)`, `‖grad_0‖` is
79/// invariant under an additive shift `f → f + C`, which leaves the minimizers,
80/// gradient, trust-region model reduction, Armijo slope, and accepted path all
81/// unchanged. Dividing by `|f|` was non-invariant: a large additive constant
82/// inflates the denominator and can falsely certify convergence at a
83/// non-stationary iterate (e.g. `f̃(x) = C + x²` at `x = 1` with `C > 2/τ − 1`),
84/// issue #954. The `max(·, 1)` floor reduces this to the absolute test
85/// `‖grad_k‖ ≤ grad_tol` on a unit-scale objective and preserves the
86/// O(n)-gradient calibration of the profiled REML latent objective (whose
87/// `‖grad_0‖` is itself O(n), issue #879). The non-intrinsic `‖x‖_typ` factor is
88/// dropped: ambient iterate magnitude is not coordinate/chart invariant on a
89/// manifold, so it does not belong in a Riemannian stationarity test. A
90/// non-finite gradient maps to `+∞` so a blown-up iterate is never stationary.
91fn relative_stationarity(grad_norm: f64, grad0_norm: f64) -> f64 {
92 if !grad_norm.is_finite() || !grad0_norm.is_finite() {
93 return f64::INFINITY;
94 }
95 grad_norm / grad0_norm.max(1.0)
96}
97
98#[derive(Debug, Clone, PartialEq)]
99pub struct RiemannianTrustRegion {
100 /// Initial trust-region radius Δ₀.
101 pub radius: f64,
102 /// Hard cap Δmax on the radius across all iterations.
103 pub max_radius: f64,
104 pub max_iter: usize,
105 pub grad_tol: f64,
106}
107
108impl Default for RiemannianTrustRegion {
109 fn default() -> Self {
110 Self {
111 radius: 1.0,
112 max_radius: 1.0e6,
113 max_iter: 64,
114 grad_tol: 1.0e-8,
115 }
116 }
117}
118
119impl RiemannianTrustRegion {
120 /// A genuine Riemannian trust-region method.
121 ///
122 /// At each iterate `x` we build the quadratic model in the tangent space
123 /// `T_xM`,
124 ///
125 /// ```text
126 /// m(η) = f(x) + g_x(grad, η) + ½ g_x(η, Hη),
127 /// ```
128 ///
129 /// where `g_x(·,·)` is the manifold metric inner product and `H` is the
130 /// Riemannian Hessian (accessed only through Hessian–vector products). The
131 /// step is the (approximate) solution of the trust-region subproblem
132 ///
133 /// ```text
134 /// min_{η ∈ T_xM, ‖η‖_g ≤ Δ} m(η).
135 /// ```
136 ///
137 /// When the objective supplies Hessian–vector products AND the manifold's
138 /// `retract` is at least a second-order retraction
139 /// ([`RiemannianManifold::retraction_is_second_order`]) we solve the
140 /// subproblem with the Steihaug truncated-CG method (stopping at negative
141 /// curvature or the trust-region boundary). Otherwise — no curvature, or a
142 /// first-order retraction whose pullback second derivative is not the
143 /// Riemannian Hessian (issue #956) — we fall back to the Cauchy point: the
144 /// exact minimizer of the model along the steepest-descent direction within
145 /// the trust region (with curvature taken from the model where available,
146 /// and the boundary point of the decreasing linear model otherwise). The
147 /// linear term `Df_x[η]` is retraction-independent, so the Cauchy model
148 /// keeps ρ and the radius control valid along any retraction. Either way
149 /// this is a real model-based step — not clipped descent.
150 ///
151 /// We then form the ratio of actual to predicted reduction
152 ///
153 /// ```text
154 /// ρ = (f(x) − f(x⁺)) / (m(0) − m(η)),
155 /// ```
156 ///
157 /// accept the step only when `ρ > η₁`, and adapt Δ: shrink on a poor ratio,
158 /// expand on an excellent ratio that reaches the boundary, otherwise hold.
159 /// Only accepted steps are retracted onto the manifold.
160 pub fn minimize(
161 &self,
162 manifold: &dyn RiemannianManifold,
163 objective: &mut dyn RiemannianObjective,
164 initial: ArrayView1<'_, f64>,
165 ) -> GeometryResult<Array1<f64>> {
166 // Trust-region acceptance / radius-update constants.
167 const ETA1: f64 = 0.1; // accept the step iff ρ > ETA1
168 const ETA_SHRINK: f64 = 0.25; // ρ below this ⇒ shrink the radius
169 const ETA_EXPAND: f64 = 0.75; // ρ above this (at the boundary) ⇒ expand
170 const SHRINK: f64 = 0.25;
171 const EXPAND: f64 = 2.0;
172 let mut x = initial.to_owned();
173 let d = manifold.ambient_dim();
174 check_len("trust-region initial point", x.len(), d)?;
175
176 // Establish the trust-region invariant `0 < Δ_k ≤ Δmax` *before* the
177 // first step, not just on later expansions. The expansion rule below
178 // caps via `min(·, max_radius)` and contraction only shrinks, so once
179 // `0 < Δ₀ ≤ Δmax` holds we have `0 < Δ_k ≤ Δmax` for all `k` by
180 // induction; every subproblem then obeys `‖η_k‖_g ≤ Δ_k ≤ Δmax`,
181 // restoring `max_radius` as the documented hard cap. A configured
182 // `radius > max_radius` (or a non-finite `radius`) would otherwise let
183 // the very first Cauchy/Steihaug step overshoot the advertised maximum,
184 // so we clamp the initial radius into `(0, max_radius]` here.
185 let mut delta = self.radius.min(self.max_radius);
186
187 // Initial Riemannian gradient norm, captured on the first iteration and
188 // used as the shift-invariant scale in the relative stationarity test
189 // (see `relative_stationarity`).
190 let mut grad0_norm: Option<f64> = None;
191
192 for _ in 0..self.max_iter {
193 let (f_curr, grad_e) = objective.value_gradient(x.view())?;
194 // Raise the ambient Euclidean differential to the *Riemannian*
195 // gradient through the manifold metric. Merely projecting onto the
196 // tangent space is the Riemannian gradient only for the embedded
197 // (identity) metric; for a genuine metric (affine-invariant SPD,
198 // canonical Stiefel) it is the wrong direction, making the model
199 // linear term `g_x(grad, η)` not the differential `Df_x[η]` and the
200 // step not first-order correct (issue #955).
201 let grad = manifold.riemannian_gradient(x.view(), grad_e.view())?;
202 let grad_norm = g_norm(manifold, x.view(), grad.view())?;
203 // Shift-invariant (relative) stationarity test. Comparing the bare
204 // gradient norm to a fixed absolute `grad_tol` is mis-calibrated for
205 // objectives whose natural scale is large — e.g. the *profiled*
206 // Gaussian REML latent objective, whose `n·log σ̂²` term leaves
207 // `‖grad‖` at an O(n) magnitude even at a genuine stationary point
208 // near interpolation (issue #879). We instead test the dimensionless
209 // ratio `‖grad_k‖_g / max(‖grad_0‖_g, 1)`, where `‖grad_0‖_g` is the
210 // gradient norm at the initial iterate. It carries the same
211 // *multiplicative* scale the objective and its gradient share but is
212 // invariant under an additive shift `f → f + C` (unlike `max(|f|,1)`,
213 // which a large constant inflates into a false convergence, #954),
214 // and reduces to the absolute test on a unit-scale objective.
215 let grad0 = *grad0_norm.get_or_insert(grad_norm);
216 if relative_stationarity(grad_norm, grad0) <= self.grad_tol {
217 break;
218 }
219
220 // Solve the trust-region subproblem in T_xM.
221 let (step, predicted_reduction, hit_boundary) =
222 self.solve_subproblem(manifold, objective, x.view(), grad.view(), delta)?;
223
224 // A non-positive predicted reduction means the model offers no
225 // descent (e.g. a vanishing step); shrink and retry from the same
226 // point rather than dividing by ~0 in ρ.
227 if !(predicted_reduction > 0.0) {
228 delta *= SHRINK;
229 if delta <= self.grad_tol * self.grad_tol {
230 break;
231 }
232 continue;
233 }
234
235 let trial_x = manifold.retract(x.view(), step.view())?;
236 let f_trial = objective.value_gradient(trial_x.view())?.0;
237 let actual_reduction = f_curr - f_trial;
238 let rho = if f_trial.is_finite() {
239 actual_reduction / predicted_reduction
240 } else {
241 f64::NEG_INFINITY
242 };
243
244 // Radius update.
245 if rho < ETA_SHRINK {
246 delta *= SHRINK;
247 } else if rho > ETA_EXPAND && hit_boundary {
248 delta = (delta * EXPAND).min(self.max_radius);
249 }
250
251 // Accept only sufficiently-good steps; otherwise keep x (the next
252 // iteration recomputes f and the gradient at the retained point).
253 if rho > ETA1 && f_trial.is_finite() {
254 x = trial_x;
255 }
256 }
257 Ok(x)
258 }
259
260 /// Solve `min_{‖η‖_g ≤ Δ} m(η)` and return `(η, m(0) − m(η), hit_boundary)`.
261 ///
262 /// Uses Steihaug truncated-CG when the objective provides Hessian–vector
263 /// products *and* the manifold's `retract` is at least a second-order
264 /// retraction ([`RiemannianManifold::retraction_is_second_order`]). When the
265 /// retraction is only first-order the Riemannian-Hessian quadratic term is
266 /// not the second derivative of `f∘R_x`, so scoring it would corrupt ρ
267 /// (issue #956); we then take the Cauchy point, whose linear model is
268 /// first-order correct along any retraction (as we also do when no curvature
269 /// is available).
270 fn solve_subproblem(
271 &self,
272 manifold: &dyn RiemannianManifold,
273 objective: &mut dyn RiemannianObjective,
274 x: ArrayView1<'_, f64>,
275 grad: ArrayView1<'_, f64>,
276 delta: f64,
277 ) -> GeometryResult<(Array1<f64>, f64, bool)> {
278 const BOUNDARY_FRAC: f64 = 0.9;
279
280 // Probe for curvature once: if the objective exposes no Hessian–vector
281 // product we take the Cauchy point.
282 let has_hessian = objective.hessian_vector_product(x, grad)?.is_some();
283
284 // The Riemannian-Hessian quadratic model `½ g_x(η, Hη)` is the correct
285 // second-order model of `f` along the trial path ONLY when that path is
286 // generated by the exponential map or another second-order retraction:
287 // for a first-order retraction `R_x` the pullback `f∘R_x` has a second
288 // derivative at `0` that is NOT the Riemannian Hessian, so scoring the
289 // curved model against `manifold.retract` corrupts ρ and the radius
290 // control (issue #956). The linear term `g_x(grad, η) = Df_x[η]` is
291 // retraction-independent, so the curvature-free Cauchy model stays
292 // first-order correct along ANY retraction. We therefore use the curved
293 // Steihaug truncated-CG step only when the objective supplies curvature
294 // AND the manifold's retraction is (at least) second-order; otherwise we
295 // take the Cauchy point — never asserting a second-order model the
296 // retraction cannot honor.
297 if !has_hessian || !manifold.retraction_is_second_order() {
298 return self.cauchy_point(manifold, x, grad, delta);
299 }
300
301 // --- Steihaug truncated-CG on the metric inner product. ---
302 // Solve min m(η) = g_x(grad, η) + ½ g_x(η, Hη) within ‖η‖_g ≤ Δ.
303 let n = grad.len();
304 let mut z = Array1::<f64>::zeros(n); // current iterate η
305 let mut r = grad.to_owned(); // residual = grad + Hz (z=0 ⇒ grad)
306 let mut p = -&r; // search direction
307 let r0_norm = g_norm(manifold, x, r.view())?;
308 let tol = (STEIHAUG_CG_FORCING_FACTOR * r0_norm).min(r0_norm * r0_norm);
309
310 // model reduction tracker m(0) − m(z); m(0) = 0 here (constant dropped).
311 // m(z) = g(grad,z) + ½ g(z,Hz); we recompute it at the end for ρ.
312 let max_cg = 2 * n + 1;
313 for _ in 0..max_cg {
314 let hp = objective.hessian_vector_product(x, p.view())?.ok_or(
315 crate::manifold::GeometryError::Unsupported(
316 "Hessian–vector product became unavailable mid-subproblem",
317 ),
318 )?;
319 let php = g_inner(manifold, x, p.view(), hp.view())?;
320 if php <= 0.0 {
321 // Negative curvature: go to the boundary along p.
322 let (tau, _) = boundary_tau(manifold, x, z.view(), p.view(), delta)?;
323 let eta = &z + &(&p * tau);
324 let red = model_reduction(manifold, objective, x, grad, eta.view())?;
325 return Ok((eta, red, true));
326 }
327 let rr = g_inner(manifold, x, r.view(), r.view())?;
328 let alpha = rr / php;
329 let z_next = &z + &(&p * alpha);
330 if g_norm(manifold, x, z_next.view())? >= delta {
331 // Trust-region boundary crossed: step to it.
332 let (tau, _) = boundary_tau(manifold, x, z.view(), p.view(), delta)?;
333 let eta = &z + &(&p * tau);
334 let red = model_reduction(manifold, objective, x, grad, eta.view())?;
335 return Ok((eta, red, true));
336 }
337 z = z_next;
338 let r_next = &r + &(&hp * alpha);
339 let r_next_norm = g_norm(manifold, x, r_next.view())?;
340 if r_next_norm <= tol {
341 let red = model_reduction(manifold, objective, x, grad, z.view())?;
342 let hit = g_norm(manifold, x, z.view())? >= BOUNDARY_FRAC * delta;
343 return Ok((z, red, hit));
344 }
345 let rr_next = g_inner(manifold, x, r_next.view(), r_next.view())?;
346 let beta = rr_next / rr;
347 p = &(-&r_next) + &(&p * beta);
348 r = r_next;
349 }
350 let red = model_reduction(manifold, objective, x, grad, z.view())?;
351 let hit = g_norm(manifold, x, z.view())? >= BOUNDARY_FRAC * delta;
352 Ok((z, red, hit))
353 }
354
355 /// Cauchy point: the exact minimizer of the model along the steepest-descent
356 /// direction `−grad` within the trust region. With no curvature available
357 /// the model is the decreasing linear `m(τ·(−grad)) = −τ‖grad‖²_g`, whose
358 /// constrained minimizer sits on the boundary `τ = Δ / ‖grad‖_g`, giving a
359 /// predicted reduction `Δ·‖grad‖_g`.
360 fn cauchy_point(
361 &self,
362 manifold: &dyn RiemannianManifold,
363 x: ArrayView1<'_, f64>,
364 grad: ArrayView1<'_, f64>,
365 delta: f64,
366 ) -> GeometryResult<(Array1<f64>, f64, bool)> {
367 let grad_norm = g_norm(manifold, x, grad.view())?;
368 if grad_norm <= 0.0 {
369 return Ok((Array1::<f64>::zeros(grad.len()), 0.0, false));
370 }
371 let tau = delta / grad_norm;
372 let step = &grad.to_owned() * (-tau);
373 // Predicted reduction of the linear model m(0) − m(η) = τ‖grad‖²_g.
374 let predicted = tau * grad_norm * grad_norm;
375 Ok((step, predicted, true))
376 }
377}
378
379/// Largest `τ ≥ 0` with `‖z + τ p‖_g = Δ`, solving the quadratic
380/// `‖p‖²_g τ² + 2 g(z,p) τ + (‖z‖²_g − Δ²) = 0`. Returns `(τ, ‖z + τp‖_g)`.
381fn boundary_tau(
382 manifold: &dyn RiemannianManifold,
383 x: ArrayView1<'_, f64>,
384 z: ArrayView1<'_, f64>,
385 p: ArrayView1<'_, f64>,
386 delta: f64,
387) -> GeometryResult<(f64, f64)> {
388 let pp = g_inner(manifold, x, p, p)?;
389 let zp = g_inner(manifold, x, z, p)?;
390 let zz = g_inner(manifold, x, z, z)?;
391 if pp <= 0.0 {
392 return Ok((0.0, zz.max(0.0).sqrt()));
393 }
394 let c = zz - delta * delta;
395 let disc = (zp * zp - pp * c).max(0.0);
396 let tau = (-zp + disc.sqrt()) / pp;
397 let tau = tau.max(0.0);
398 Ok((tau, delta))
399}
400
401/// Model reduction `m(0) − m(η) = −g(grad, η) − ½ g(η, Hη)`.
402fn model_reduction(
403 manifold: &dyn RiemannianManifold,
404 objective: &mut dyn RiemannianObjective,
405 x: ArrayView1<'_, f64>,
406 grad: ArrayView1<'_, f64>,
407 eta: ArrayView1<'_, f64>,
408) -> GeometryResult<f64> {
409 let lin = g_inner(manifold, x, grad, eta)?;
410 let heta = objective.hessian_vector_product(x, eta)?.ok_or(
411 crate::manifold::GeometryError::Unsupported(
412 "Hessian–vector product unavailable while scoring the model",
413 ),
414 )?;
415 let quad = g_inner(manifold, x, eta, heta.view())?;
416 Ok(-lin - 0.5 * quad)
417}
418
419#[derive(Debug, Clone, PartialEq)]
420pub struct RiemannianLBFGS {
421 pub history: usize,
422 pub step_size: f64,
423 pub max_iter: usize,
424 pub grad_tol: f64,
425}
426
427impl Default for RiemannianLBFGS {
428 fn default() -> Self {
429 Self {
430 history: 10,
431 step_size: 1.0,
432 max_iter: 100,
433 grad_tol: 1.0e-8,
434 }
435 }
436}
437
438/// One stored secant pair, kept with its base point so the two-loop recursion
439/// can transport it into whatever the current tangent space is. `s` and `y`
440/// both live in `T_{base}M`.
441#[derive(Clone)]
442struct SecantPair {
443 base: Array1<f64>,
444 s: Array1<f64>,
445 y: Array1<f64>,
446}
447
448impl RiemannianLBFGS {
449 /// Riemannian L-BFGS with a backtracking-and-expansion Armijo line search.
450 ///
451 /// The search starts at the user-supplied `step_size` (a hint, not a hard
452 /// cap) and first *expands* by doubling while the Armijo sufficient-
453 /// decrease condition continues to hold and the objective is still
454 /// strictly improving. Once expansion stalls, it accepts the best step
455 /// it has seen so far; if even the initial trial violates Armijo, it
456 /// *contracts* by halving until Armijo holds or a safeguard floor is
457 /// reached. This makes the optimizer robust to mis-scaled `step_size`
458 /// inputs (including the Newton-natural α=1 that BFGS expects on
459 /// well-conditioned quadratics) without forcing the caller to retune
460 /// it, and preserves the secant pair (s, y) curvature condition so the
461 /// L-BFGS inverse-Hessian approximation stays SPD.
462 ///
463 /// All inner products use the manifold metric `g_x(·,·)`, and every secant
464 /// pair is *parallel-transported into the current tangent space* before it
465 /// enters the two-loop recursion, so the BFGS algebra never mixes vectors
466 /// living in different tangent spaces (the bug fixed in #616). The freshly
467 /// formed secant pair likewise transports the accepted step from the old
468 /// tangent space into the new one before pairing it with the gradient
469 /// difference, so both `s` and `y` live in `T_{x_new}M`.
470 pub fn minimize(
471 &self,
472 manifold: &dyn RiemannianManifold,
473 objective: &mut dyn RiemannianObjective,
474 initial: ArrayView1<'_, f64>,
475 ) -> GeometryResult<Array1<f64>> {
476 let mut x = initial.to_owned();
477 let d = manifold.ambient_dim();
478 check_len("L-BFGS initial point", x.len(), d)?;
479 let mut history: Vec<SecantPair> = Vec::new();
480 let (mut f_curr, grad_e0) = objective.value_gradient(x.view())?;
481 // Riemannian gradient (metric-raised), not a bare tangent projection —
482 // see the trust region above and issue #955. The secant pairs, two-loop
483 // recursion, and Armijo slope are all metric inner products, so they
484 // must operate on the true Riemannian gradient.
485 let mut grad = manifold.riemannian_gradient(x.view(), grad_e0.view())?;
486 // Initial Riemannian gradient norm: the shift-invariant scale of the
487 // relative stationarity test (see `relative_stationarity`).
488 let grad0_norm = g_norm(manifold, x.view(), grad.view())?;
489 let armijo_c: f64 = 1.0e-4;
490 let alpha_min: f64 = 1.0e-16;
491 let alpha_max: f64 = 1.0e16;
492 let initial_step = if self.step_size.is_finite() && self.step_size > 0.0 {
493 self.step_size
494 } else {
495 1.0
496 };
497 for _ in 0..self.max_iter {
498 // Shift-invariant (relative) stationarity test, identical in form to
499 // the trust region's (see `relative_stationarity`): the current
500 // gradient norm is measured against the initial one, so a fixed
501 // `grad_tol` reads as a *relative* tolerance for a large-scale
502 // objective (e.g. the profiled REML latent objective, issue #879)
503 // while staying invariant under an additive shift of `f` (#954).
504 // Reduces to the absolute test on a unit-scale objective.
505 let grad_norm = g_norm(manifold, x.view(), grad.view())?;
506 if relative_stationarity(grad_norm, grad0_norm) <= self.grad_tol {
507 break;
508 }
509 let direction = two_loop(manifold, x.view(), grad.view(), &history)?;
510 let direction = -&direction;
511 let slope = g_inner(manifold, x.view(), grad.view(), direction.view())?;
512 // Guard against ascent directions caused by stale curvature; if the
513 // BFGS direction is not a (metric) descent direction, fall back to
514 // the projected steepest-descent direction so progress is
515 // guaranteed.
516 let (direction, slope) = if slope < 0.0 {
517 (direction, slope)
518 } else {
519 let sd = -&grad;
520 let s_sd = g_inner(manifold, x.view(), grad.view(), sd.view())?;
521 (sd, s_sd)
522 };
523 let old_x = x.clone();
524 let old_grad = grad.clone();
525 // --- Armijo line search with bidirectional adaptation. ---
526 let mut alpha = initial_step;
527 let mut best_alpha = 0.0;
528 let mut best_f = f_curr;
529 let mut best_x = x.clone();
530 let mut best_grad = grad.clone();
531 // First try to expand: while Armijo holds and the objective keeps
532 // improving, double the step.
533 loop {
534 let step = &direction * alpha;
535 let trial_x = manifold.retract(x.view(), step.view())?;
536 let (f_trial, g_trial_e) = objective.value_gradient(trial_x.view())?;
537 let armijo_rhs = f_curr + armijo_c * alpha * slope;
538 let accepted = f_trial.is_finite() && f_trial <= armijo_rhs;
539 if accepted && f_trial < best_f {
540 best_alpha = alpha;
541 best_f = f_trial;
542 best_x = trial_x;
543 // Riemannian (metric-raised) gradient at the trial point —
544 // the object the secant pair (s, y) is formed from (#955).
545 best_grad = manifold.riemannian_gradient(best_x.view(), g_trial_e.view())?;
546 if alpha >= alpha_max {
547 break;
548 }
549 alpha *= 2.0;
550 } else {
551 break;
552 }
553 }
554 // If no expansion succeeded, contract from the initial trial.
555 if best_alpha == 0.0 {
556 alpha = initial_step;
557 while alpha > alpha_min {
558 let step = &direction * alpha;
559 let trial_x = manifold.retract(x.view(), step.view())?;
560 let (f_trial, g_trial_e) = objective.value_gradient(trial_x.view())?;
561 let armijo_rhs = f_curr + armijo_c * alpha * slope;
562 if f_trial.is_finite() && f_trial <= armijo_rhs {
563 best_alpha = alpha;
564 best_f = f_trial;
565 best_x = trial_x;
566 // Riemannian (metric-raised) gradient at the trial point.
567 best_grad =
568 manifold.riemannian_gradient(best_x.view(), g_trial_e.view())?;
569 break;
570 }
571 alpha *= 0.5;
572 }
573 }
574 if best_alpha == 0.0 {
575 // No admissible step found — terminate at the current point.
576 break;
577 }
578 // The accepted tangent step at `old_x` (the actual move taken).
579 let eta = &direction * best_alpha;
580 x = best_x;
581 f_curr = best_f;
582 grad = best_grad;
583
584 // --- Secant pair, formed entirely in T_{x_new}M (the #616 fix). ---
585 // Parallel-transport the accepted step from T_{old_x}M to T_{x}M so
586 // it is a tangent at the NEW point, matching the gradient there. The
587 // old gradient is likewise transported to T_{x}M before subtraction.
588 let path = transport_path(&old_x, &x);
589 let s = manifold.parallel_transport(path.view(), eta.view())?;
590 let transported_old_grad = manifold.parallel_transport(path.view(), old_grad.view())?;
591 let y = &grad - &transported_old_grad;
592 // Commit the (s, y) pair only when the metric curvature condition
593 // g_x(s, y) > 0 holds (strict positivity). This is required for the
594 // implicit BFGS inverse-Hessian update to remain SPD.
595 let sy = g_inner(manifold, x.view(), s.view(), y.view())?;
596 if sy > 1.0e-14 {
597 history.push(SecantPair {
598 base: x.clone(),
599 s,
600 y,
601 });
602 if history.len() > self.history {
603 history.remove(0);
604 }
605 }
606 }
607 Ok(x)
608 }
609}
610
611/// Build the 2×D point path matrix `[p_from; p_to]` consumed by
612/// [`RiemannianManifold::parallel_transport`].
613fn transport_path(p_from: &Array1<f64>, p_to: &Array1<f64>) -> ndarray::Array2<f64> {
614 let d = p_from.len();
615 let mut path = ndarray::Array2::<f64>::zeros((2, d));
616 path.row_mut(0).assign(p_from);
617 path.row_mut(1).assign(p_to);
618 path
619}
620
621/// L-BFGS two-loop recursion in the CURRENT tangent space `T_xM`.
622///
623/// Each stored secant pair `(s_i, y_i)` lives in `T_{base_i}M`; before it can
624/// participate in the recursion it is parallel-transported into `T_xM`. All
625/// inner products use the manifold metric `g_x(·,·)` at the current point, so
626/// no two vectors from different tangent spaces are ever combined (#616).
627fn two_loop(
628 manifold: &dyn RiemannianManifold,
629 x: ArrayView1<'_, f64>,
630 grad: ArrayView1<'_, f64>,
631 history: &[SecantPair],
632) -> GeometryResult<Array1<f64>> {
633 // Transport every stored pair into the current tangent space once.
634 let mut s_cur: Vec<Array1<f64>> = Vec::with_capacity(history.len());
635 let mut y_cur: Vec<Array1<f64>> = Vec::with_capacity(history.len());
636 for pair in history {
637 let path = transport_path(&pair.base, &x.to_owned());
638 s_cur.push(manifold.parallel_transport(path.view(), pair.s.view())?);
639 y_cur.push(manifold.parallel_transport(path.view(), pair.y.view())?);
640 }
641
642 let mut q = grad.to_owned();
643 let mut alpha = vec![0.0; history.len()];
644 let mut rho = vec![0.0; history.len()];
645 for i in (0..history.len()).rev() {
646 let sy = g_inner(manifold, x, s_cur[i].view(), y_cur[i].view())?;
647 rho[i] = 1.0 / sy;
648 alpha[i] = rho[i] * g_inner(manifold, x, s_cur[i].view(), q.view())?;
649 q = &q - &(&y_cur[i] * alpha[i]);
650 }
651 let mut r = q;
652 if let (Some(s), Some(y)) = (s_cur.last(), y_cur.last()) {
653 let yy = g_inner(manifold, x, y.view(), y.view())?;
654 if yy > 1.0e-14 {
655 let sy = g_inner(manifold, x, s.view(), y.view())?;
656 r = &r * (sy / yy);
657 }
658 }
659 for i in 0..history.len() {
660 let beta = rho[i] * g_inner(manifold, x, y_cur[i].view(), r.view())?;
661 r = &r + &(&s_cur[i] * (alpha[i] - beta));
662 }
663 Ok(r)
664}
665
666#[cfg(test)]
667mod tests {
668 use super::*;
669 use crate::EuclideanManifold;
670 use ndarray::{Array1, ArrayView1};
671
672 /// Scalar objective `f(x) = x²` on the 1-D Euclidean line. Gradient `2x`,
673 /// Hessian `2`, exposed as an HVP so the trust region runs Steihaug-CG.
674 struct Square;
675 impl RiemannianObjective for Square {
676 fn value_gradient(
677 &mut self,
678 point: ArrayView1<'_, f64>,
679 ) -> GeometryResult<(f64, Array1<f64>)> {
680 let x = point[0];
681 Ok((x * x, Array1::from_vec(vec![2.0 * x])))
682 }
683 fn hessian_vector_product(
684 &mut self,
685 point: ArrayView1<'_, f64>,
686 tangent: ArrayView1<'_, f64>,
687 ) -> GeometryResult<Option<Array1<f64>>> {
688 assert!(point.iter().all(|value| value.is_finite()));
689 check_len("hessian_vector_product tangent", tangent.len(), point.len())?;
690 Ok(Some(&tangent.to_owned() * 2.0))
691 }
692 }
693
694 /// Gradient-only variant of `f(x)=x²` (no HVP) to exercise the Cauchy-point
695 /// branch of the trust region.
696 struct SquareGradOnly;
697 impl RiemannianObjective for SquareGradOnly {
698 fn value_gradient(
699 &mut self,
700 point: ArrayView1<'_, f64>,
701 ) -> GeometryResult<(f64, Array1<f64>)> {
702 let x = point[0];
703 Ok((x * x, Array1::from_vec(vec![2.0 * x])))
704 }
705 }
706
707 /// General convex quadratic `f(x) = ½ xᵀ A x − bᵀ x` on Euclidean R^n with
708 /// SPD `A`; minimizer solves `A x = b`. Provides an exact HVP `A v`.
709 struct Quadratic {
710 a: ndarray::Array2<f64>,
711 b: Array1<f64>,
712 }
713 impl RiemannianObjective for Quadratic {
714 fn value_gradient(
715 &mut self,
716 point: ArrayView1<'_, f64>,
717 ) -> GeometryResult<(f64, Array1<f64>)> {
718 let ax = self.a.dot(&point.to_owned());
719 let val = 0.5 * point.dot(&ax) - self.b.dot(&point.to_owned());
720 let grad = &ax - &self.b;
721 Ok((val, grad))
722 }
723 fn hessian_vector_product(
724 &mut self,
725 point: ArrayView1<'_, f64>,
726 tangent: ArrayView1<'_, f64>,
727 ) -> GeometryResult<Option<Array1<f64>>> {
728 assert!(point.iter().all(|value| value.is_finite()));
729 check_len("hessian_vector_product tangent", tangent.len(), point.len())?;
730 Ok(Some(self.a.dot(&tangent.to_owned())))
731 }
732 }
733
734 /// (#615 counterexample) A correct trust region on `f(x)=x²` from `x₀=0.1`
735 /// with `Δ=1` must CONVERGE to 0 (not oscillate), monotonically driving `f`
736 /// down — never increasing it on an accepted iterate.
737 #[test]
738 fn trust_region_converges_on_square_steihaug() {
739 let manifold = EuclideanManifold::new(1);
740 let tr = RiemannianTrustRegion {
741 radius: 1.0,
742 max_radius: 1.0e6,
743 max_iter: 100,
744 grad_tol: 1.0e-12,
745 };
746 let mut obj = Square;
747 let x0 = Array1::from_vec(vec![0.1]);
748 let x = tr
749 .minimize(&manifold, &mut obj, x0.view())
750 .expect("TR runs");
751 assert!(
752 x[0].abs() < 1.0e-6,
753 "trust region must converge to 0, got {}",
754 x[0]
755 );
756 }
757
758 /// The trust region must never increase `f` across accepted iterates. We
759 /// check the monotone-descent invariant directly by stepping the public
760 /// `minimize` from a sequence of decreasing budgets and confirming the
761 /// returned value is below the start value, and that from `x₀=0.1` it does
762 /// not return a point with larger `|x|`.
763 #[test]
764 fn trust_region_never_increases_objective() {
765 let manifold = EuclideanManifold::new(1);
766 let tr = RiemannianTrustRegion {
767 radius: 1.0,
768 max_radius: 1.0e6,
769 max_iter: 1,
770 grad_tol: 1.0e-12,
771 };
772 let mut obj = Square;
773 // A single TR iteration from 0.1: with exact Hessian the Newton step
774 // lands at the minimum (inside Δ=1), ρ=1, so it must be accepted and f
775 // must strictly decrease.
776 let x0 = Array1::from_vec(vec![0.1]);
777 let f0 = obj.value_gradient(x0.view()).unwrap().0;
778 let x1 = tr
779 .minimize(&manifold, &mut obj, x0.view())
780 .expect("TR runs");
781 let f1 = obj.value_gradient(x1.view()).unwrap().0;
782 assert!(f1 <= f0, "objective increased: {f0} -> {f1}");
783 assert!(x1[0].abs() <= x0[0].abs() + 1e-15, "moved away from min");
784 }
785
786 /// Cauchy-point branch (no HVP) must still be a real trust-region method:
787 /// from `x₀=0.1`, `Δ=1` on `f(x)=x²` it converges toward 0 and never
788 /// oscillates upward in `f`.
789 #[test]
790 fn trust_region_cauchy_point_converges() {
791 let manifold = EuclideanManifold::new(1);
792 let tr = RiemannianTrustRegion {
793 radius: 1.0,
794 max_radius: 1.0e6,
795 max_iter: 500,
796 grad_tol: 1.0e-12,
797 };
798 let mut obj = SquareGradOnly;
799 let x0 = Array1::from_vec(vec![0.1]);
800 let x = tr
801 .minimize(&manifold, &mut obj, x0.view())
802 .expect("TR runs");
803 assert!(
804 x[0].abs() < 1.0e-6,
805 "Cauchy-point trust region must converge to 0, got {}",
806 x[0]
807 );
808 }
809
810 /// Steihaug-CG trust region on a 3-D SPD quadratic must reach the exact
811 /// minimizer `A⁻¹ b`.
812 #[test]
813 fn trust_region_solves_spd_quadratic() {
814 let manifold = EuclideanManifold::new(3);
815 let a = ndarray::array![[4.0, 1.0, 0.0], [1.0, 3.0, 1.0], [0.0, 1.0, 2.0],];
816 let b = Array1::from_vec(vec![1.0, 2.0, -1.0]);
817 // Reference solution A x = b.
818 let x_ref = crate::manifold::inverse(&a).unwrap().dot(&b);
819 let mut obj = Quadratic { a, b };
820 let tr = RiemannianTrustRegion {
821 radius: 1.0,
822 max_radius: 1.0e6,
823 max_iter: 200,
824 grad_tol: 1.0e-12,
825 };
826 let x0 = Array1::from_vec(vec![0.0, 0.0, 0.0]);
827 let x = tr
828 .minimize(&manifold, &mut obj, x0.view())
829 .expect("TR runs");
830 for i in 0..3 {
831 assert!(
832 (x[i] - x_ref[i]).abs() < 1.0e-6,
833 "component {i}: got {}, want {}",
834 x[i],
835 x_ref[i]
836 );
837 }
838 }
839
840 /// (#616 sanity) Riemannian L-BFGS on a Euclidean SPD quadratic must reduce
841 /// the objective and converge to the analytic minimizer `A⁻¹ b`.
842 #[test]
843 fn lbfgs_reduces_euclidean_quadratic() {
844 let manifold = EuclideanManifold::new(3);
845 let a = ndarray::array![[5.0, 1.0, 0.5], [1.0, 4.0, 1.0], [0.5, 1.0, 3.0],];
846 let b = Array1::from_vec(vec![2.0, -1.0, 0.5]);
847 let x_ref = crate::manifold::inverse(&a).unwrap().dot(&b);
848 let mut obj = Quadratic { a, b };
849 let lbfgs = RiemannianLBFGS {
850 history: 10,
851 step_size: 1.0,
852 max_iter: 200,
853 grad_tol: 1.0e-10,
854 };
855 let x0 = Array1::from_vec(vec![0.0, 0.0, 0.0]);
856 let f0 = obj.value_gradient(x0.view()).unwrap().0;
857 let x = lbfgs
858 .minimize(&manifold, &mut obj, x0.view())
859 .expect("L-BFGS runs");
860 let f1 = obj.value_gradient(x.view()).unwrap().0;
861 assert!(f1 < f0, "L-BFGS did not reduce the quadratic: {f0} -> {f1}");
862 for i in 0..3 {
863 assert!(
864 (x[i] - x_ref[i]).abs() < 1.0e-6,
865 "component {i}: got {}, want {}",
866 x[i],
867 x_ref[i]
868 );
869 }
870 }
871}