Skip to main content

gam_geometry/
optimizer.rs

1use ndarray::{Array1, ArrayView1};
2
3use crate::manifold::{GeometryResult, RiemannianManifold, check_len, quad_form};
4
5/// Linear factor of the Steihaug truncated-CG forcing sequence: the inner CG
6/// solve is terminated once the residual drops to `min(η·‖r₀‖, ‖r₀‖²)`. The
7/// quadratic `‖r₀‖²` term gives the super-linear convergence of an inexact
8/// Newton step near the optimum, while `η·‖r₀‖` caps wasted inner work far from
9/// it (Nocedal & Wright, *Numerical Optimization*, §7.1, eq. 7.3).
10const STEIHAUG_CG_FORCING_FACTOR: f64 = 1.0e-2;
11
12pub trait RiemannianObjective {
13    fn value_gradient(&mut self, point: ArrayView1<'_, f64>) -> GeometryResult<(f64, Array1<f64>)>;
14
15    /// Riemannian Hessian–vector product `H(x)·v` for a tangent direction `v`
16    /// at `point`, returned in the same ambient/tangent coordinates as the
17    /// gradient.
18    ///
19    /// This is what upgrades the trust-region subproblem from a Cauchy-point
20    /// step (the exact minimizer of the *linear* model along the steepest
21    /// descent direction) to a Steihaug truncated-CG step that exploits real
22    /// curvature. An objective that exposes no second-order information returns
23    /// `None` (the default), and the trust region transparently falls back to
24    /// the Cauchy point — never to plain clipped steepest descent, which has no
25    /// model, no predicted/actual reduction ratio, and no accept/reject.
26    ///
27    /// The Riemannian-Hessian quadratic model the trust region builds from this
28    /// product is a valid second-order model of `f` only along a (≥)second-order
29    /// retraction (the exponential map, or any retraction with
30    /// [`RiemannianManifold::retraction_is_second_order`] `== true`). On a
31    /// manifold whose `retract` is only FIRST-order (e.g. the Stiefel/Grassmann
32    /// QR retraction) the second derivative of the pullback `f∘R_x` is not the
33    /// Riemannian Hessian, so the trust region ignores this curvature and uses
34    /// the first-order-correct Cauchy model instead (issue #956).
35    fn hessian_vector_product(
36        &mut self,
37        point: ArrayView1<'_, f64>,
38        tangent: ArrayView1<'_, f64>,
39    ) -> GeometryResult<Option<Array1<f64>>> {
40        // Validate the shapes the contract requires (a tangent at `point`), then
41        // report "no curvature available" so the trust region selects the
42        // Cauchy point. We never fabricate a Hessian here.
43        check_len("hessian_vector_product tangent", tangent.len(), point.len())?;
44        Ok(None)
45    }
46}
47
48/// Metric inner product `g_x(a, b) = aᵀ G(x) b` using the manifold metric
49/// tensor at `point`. For manifolds whose metric is the ambient identity
50/// (Euclidean, Sphere, Circle, Torus, …) this reduces to the Euclidean dot
51/// product; for a genuine Riemannian metric (e.g. the affine-invariant SPD
52/// metric) it evaluates the correct geometric inner product on the tangent
53/// space. Every norm and inner product in both optimizers below routes through
54/// this so the algorithms are metric-correct on curved manifolds.
55fn g_inner(
56    manifold: &dyn RiemannianManifold,
57    point: ArrayView1<'_, f64>,
58    a: ArrayView1<'_, f64>,
59    b: ArrayView1<'_, f64>,
60) -> GeometryResult<f64> {
61    let g = manifold.metric_tensor(point)?;
62    Ok(quad_form(g.view(), a, b))
63}
64
65fn g_norm(
66    manifold: &dyn RiemannianManifold,
67    point: ArrayView1<'_, f64>,
68    a: ArrayView1<'_, f64>,
69) -> GeometryResult<f64> {
70    Ok(g_inner(manifold, point, a, a)?.max(0.0).sqrt())
71}
72
73/// Shift-invariant relative-gradient stationarity measure
74/// `‖grad_k‖_g / max(‖grad_0‖_g, 1)`, comparing the current Riemannian gradient
75/// norm to the gradient norm at the INITIAL iterate. The initial gradient norm
76/// carries the same *multiplicative* scale the objective and its gradient share
77/// (`f → c·f` ⇒ `grad → c·grad`), so a fixed `grad_tol` still reads as a
78/// *relative* tolerance — but, unlike dividing by `max(|f|, 1)`, `‖grad_0‖` is
79/// invariant under an additive shift `f → f + C`, which leaves the minimizers,
80/// gradient, trust-region model reduction, Armijo slope, and accepted path all
81/// unchanged. Dividing by `|f|` was non-invariant: a large additive constant
82/// inflates the denominator and can falsely certify convergence at a
83/// non-stationary iterate (e.g. `f̃(x) = C + x²` at `x = 1` with `C > 2/τ − 1`),
84/// issue #954. The `max(·, 1)` floor reduces this to the absolute test
85/// `‖grad_k‖ ≤ grad_tol` on a unit-scale objective and preserves the
86/// O(n)-gradient calibration of the profiled REML latent objective (whose
87/// `‖grad_0‖` is itself O(n), issue #879). The non-intrinsic `‖x‖_typ` factor is
88/// dropped: ambient iterate magnitude is not coordinate/chart invariant on a
89/// manifold, so it does not belong in a Riemannian stationarity test. A
90/// non-finite gradient maps to `+∞` so a blown-up iterate is never stationary.
91fn relative_stationarity(grad_norm: f64, grad0_norm: f64) -> f64 {
92    if !grad_norm.is_finite() || !grad0_norm.is_finite() {
93        return f64::INFINITY;
94    }
95    grad_norm / grad0_norm.max(1.0)
96}
97
98#[derive(Debug, Clone, PartialEq)]
99pub struct RiemannianTrustRegion {
100    /// Initial trust-region radius Δ₀.
101    pub radius: f64,
102    /// Hard cap Δmax on the radius across all iterations.
103    pub max_radius: f64,
104    pub max_iter: usize,
105    pub grad_tol: f64,
106}
107
108impl Default for RiemannianTrustRegion {
109    fn default() -> Self {
110        Self {
111            radius: 1.0,
112            max_radius: 1.0e6,
113            max_iter: 64,
114            grad_tol: 1.0e-8,
115        }
116    }
117}
118
119impl RiemannianTrustRegion {
120    /// A genuine Riemannian trust-region method.
121    ///
122    /// At each iterate `x` we build the quadratic model in the tangent space
123    /// `T_xM`,
124    ///
125    /// ```text
126    ///   m(η) = f(x) + g_x(grad, η) + ½ g_x(η, Hη),
127    /// ```
128    ///
129    /// where `g_x(·,·)` is the manifold metric inner product and `H` is the
130    /// Riemannian Hessian (accessed only through Hessian–vector products). The
131    /// step is the (approximate) solution of the trust-region subproblem
132    ///
133    /// ```text
134    ///   min_{η ∈ T_xM, ‖η‖_g ≤ Δ}  m(η).
135    /// ```
136    ///
137    /// When the objective supplies Hessian–vector products AND the manifold's
138    /// `retract` is at least a second-order retraction
139    /// ([`RiemannianManifold::retraction_is_second_order`]) we solve the
140    /// subproblem with the Steihaug truncated-CG method (stopping at negative
141    /// curvature or the trust-region boundary). Otherwise — no curvature, or a
142    /// first-order retraction whose pullback second derivative is not the
143    /// Riemannian Hessian (issue #956) — we fall back to the Cauchy point: the
144    /// exact minimizer of the model along the steepest-descent direction within
145    /// the trust region (with curvature taken from the model where available,
146    /// and the boundary point of the decreasing linear model otherwise). The
147    /// linear term `Df_x[η]` is retraction-independent, so the Cauchy model
148    /// keeps ρ and the radius control valid along any retraction. Either way
149    /// this is a real model-based step — not clipped descent.
150    ///
151    /// We then form the ratio of actual to predicted reduction
152    ///
153    /// ```text
154    ///   ρ = (f(x) − f(x⁺)) / (m(0) − m(η)),
155    /// ```
156    ///
157    /// accept the step only when `ρ > η₁`, and adapt Δ: shrink on a poor ratio,
158    /// expand on an excellent ratio that reaches the boundary, otherwise hold.
159    /// Only accepted steps are retracted onto the manifold.
160    pub fn minimize(
161        &self,
162        manifold: &dyn RiemannianManifold,
163        objective: &mut dyn RiemannianObjective,
164        initial: ArrayView1<'_, f64>,
165    ) -> GeometryResult<Array1<f64>> {
166        // Trust-region acceptance / radius-update constants.
167        const ETA1: f64 = 0.1; // accept the step iff ρ > ETA1
168        const ETA_SHRINK: f64 = 0.25; // ρ below this ⇒ shrink the radius
169        const ETA_EXPAND: f64 = 0.75; // ρ above this (at the boundary) ⇒ expand
170        const SHRINK: f64 = 0.25;
171        const EXPAND: f64 = 2.0;
172        let mut x = initial.to_owned();
173        let d = manifold.ambient_dim();
174        check_len("trust-region initial point", x.len(), d)?;
175
176        // Establish the trust-region invariant `0 < Δ_k ≤ Δmax` *before* the
177        // first step, not just on later expansions. The expansion rule below
178        // caps via `min(·, max_radius)` and contraction only shrinks, so once
179        // `0 < Δ₀ ≤ Δmax` holds we have `0 < Δ_k ≤ Δmax` for all `k` by
180        // induction; every subproblem then obeys `‖η_k‖_g ≤ Δ_k ≤ Δmax`,
181        // restoring `max_radius` as the documented hard cap. A configured
182        // `radius > max_radius` (or a non-finite `radius`) would otherwise let
183        // the very first Cauchy/Steihaug step overshoot the advertised maximum,
184        // so we clamp the initial radius into `(0, max_radius]` here.
185        let mut delta = self.radius.min(self.max_radius);
186
187        // Initial Riemannian gradient norm, captured on the first iteration and
188        // used as the shift-invariant scale in the relative stationarity test
189        // (see `relative_stationarity`).
190        let mut grad0_norm: Option<f64> = None;
191
192        for _ in 0..self.max_iter {
193            let (f_curr, grad_e) = objective.value_gradient(x.view())?;
194            // Raise the ambient Euclidean differential to the *Riemannian*
195            // gradient through the manifold metric. Merely projecting onto the
196            // tangent space is the Riemannian gradient only for the embedded
197            // (identity) metric; for a genuine metric (affine-invariant SPD,
198            // canonical Stiefel) it is the wrong direction, making the model
199            // linear term `g_x(grad, η)` not the differential `Df_x[η]` and the
200            // step not first-order correct (issue #955).
201            let grad = manifold.riemannian_gradient(x.view(), grad_e.view())?;
202            let grad_norm = g_norm(manifold, x.view(), grad.view())?;
203            // Shift-invariant (relative) stationarity test. Comparing the bare
204            // gradient norm to a fixed absolute `grad_tol` is mis-calibrated for
205            // objectives whose natural scale is large — e.g. the *profiled*
206            // Gaussian REML latent objective, whose `n·log σ̂²` term leaves
207            // `‖grad‖` at an O(n) magnitude even at a genuine stationary point
208            // near interpolation (issue #879). We instead test the dimensionless
209            // ratio `‖grad_k‖_g / max(‖grad_0‖_g, 1)`, where `‖grad_0‖_g` is the
210            // gradient norm at the initial iterate. It carries the same
211            // *multiplicative* scale the objective and its gradient share but is
212            // invariant under an additive shift `f → f + C` (unlike `max(|f|,1)`,
213            // which a large constant inflates into a false convergence, #954),
214            // and reduces to the absolute test on a unit-scale objective.
215            let grad0 = *grad0_norm.get_or_insert(grad_norm);
216            if relative_stationarity(grad_norm, grad0) <= self.grad_tol {
217                break;
218            }
219
220            // Solve the trust-region subproblem in T_xM.
221            let (step, predicted_reduction, hit_boundary) =
222                self.solve_subproblem(manifold, objective, x.view(), grad.view(), delta)?;
223
224            // A non-positive predicted reduction means the model offers no
225            // descent (e.g. a vanishing step); shrink and retry from the same
226            // point rather than dividing by ~0 in ρ.
227            if !(predicted_reduction > 0.0) {
228                delta *= SHRINK;
229                if delta <= self.grad_tol * self.grad_tol {
230                    break;
231                }
232                continue;
233            }
234
235            let trial_x = manifold.retract(x.view(), step.view())?;
236            let f_trial = objective.value_gradient(trial_x.view())?.0;
237            let actual_reduction = f_curr - f_trial;
238            let rho = if f_trial.is_finite() {
239                actual_reduction / predicted_reduction
240            } else {
241                f64::NEG_INFINITY
242            };
243
244            // Radius update.
245            if rho < ETA_SHRINK {
246                delta *= SHRINK;
247            } else if rho > ETA_EXPAND && hit_boundary {
248                delta = (delta * EXPAND).min(self.max_radius);
249            }
250
251            // Accept only sufficiently-good steps; otherwise keep x (the next
252            // iteration recomputes f and the gradient at the retained point).
253            if rho > ETA1 && f_trial.is_finite() {
254                x = trial_x;
255            }
256        }
257        Ok(x)
258    }
259
260    /// Solve `min_{‖η‖_g ≤ Δ} m(η)` and return `(η, m(0) − m(η), hit_boundary)`.
261    ///
262    /// Uses Steihaug truncated-CG when the objective provides Hessian–vector
263    /// products *and* the manifold's `retract` is at least a second-order
264    /// retraction ([`RiemannianManifold::retraction_is_second_order`]). When the
265    /// retraction is only first-order the Riemannian-Hessian quadratic term is
266    /// not the second derivative of `f∘R_x`, so scoring it would corrupt ρ
267    /// (issue #956); we then take the Cauchy point, whose linear model is
268    /// first-order correct along any retraction (as we also do when no curvature
269    /// is available).
270    fn solve_subproblem(
271        &self,
272        manifold: &dyn RiemannianManifold,
273        objective: &mut dyn RiemannianObjective,
274        x: ArrayView1<'_, f64>,
275        grad: ArrayView1<'_, f64>,
276        delta: f64,
277    ) -> GeometryResult<(Array1<f64>, f64, bool)> {
278        const BOUNDARY_FRAC: f64 = 0.9;
279
280        // Probe for curvature once: if the objective exposes no Hessian–vector
281        // product we take the Cauchy point.
282        let has_hessian = objective.hessian_vector_product(x, grad)?.is_some();
283
284        // The Riemannian-Hessian quadratic model `½ g_x(η, Hη)` is the correct
285        // second-order model of `f` along the trial path ONLY when that path is
286        // generated by the exponential map or another second-order retraction:
287        // for a first-order retraction `R_x` the pullback `f∘R_x` has a second
288        // derivative at `0` that is NOT the Riemannian Hessian, so scoring the
289        // curved model against `manifold.retract` corrupts ρ and the radius
290        // control (issue #956). The linear term `g_x(grad, η) = Df_x[η]` is
291        // retraction-independent, so the curvature-free Cauchy model stays
292        // first-order correct along ANY retraction. We therefore use the curved
293        // Steihaug truncated-CG step only when the objective supplies curvature
294        // AND the manifold's retraction is (at least) second-order; otherwise we
295        // take the Cauchy point — never asserting a second-order model the
296        // retraction cannot honor.
297        if !has_hessian || !manifold.retraction_is_second_order() {
298            return self.cauchy_point(manifold, x, grad, delta);
299        }
300
301        // --- Steihaug truncated-CG on the metric inner product. ---
302        // Solve min m(η) = g_x(grad, η) + ½ g_x(η, Hη) within ‖η‖_g ≤ Δ.
303        let n = grad.len();
304        let mut z = Array1::<f64>::zeros(n); // current iterate η
305        let mut r = grad.to_owned(); // residual = grad + Hz (z=0 ⇒ grad)
306        let mut p = -&r; // search direction
307        let r0_norm = g_norm(manifold, x, r.view())?;
308        let tol = (STEIHAUG_CG_FORCING_FACTOR * r0_norm).min(r0_norm * r0_norm);
309
310        // model reduction tracker m(0) − m(z); m(0) = 0 here (constant dropped).
311        // m(z) = g(grad,z) + ½ g(z,Hz); we recompute it at the end for ρ.
312        let max_cg = 2 * n + 1;
313        for _ in 0..max_cg {
314            let hp = objective.hessian_vector_product(x, p.view())?.ok_or(
315                crate::manifold::GeometryError::Unsupported(
316                    "Hessian–vector product became unavailable mid-subproblem",
317                ),
318            )?;
319            let php = g_inner(manifold, x, p.view(), hp.view())?;
320            if php <= 0.0 {
321                // Negative curvature: go to the boundary along p.
322                let (tau, _) = boundary_tau(manifold, x, z.view(), p.view(), delta)?;
323                let eta = &z + &(&p * tau);
324                let red = model_reduction(manifold, objective, x, grad, eta.view())?;
325                return Ok((eta, red, true));
326            }
327            let rr = g_inner(manifold, x, r.view(), r.view())?;
328            let alpha = rr / php;
329            let z_next = &z + &(&p * alpha);
330            if g_norm(manifold, x, z_next.view())? >= delta {
331                // Trust-region boundary crossed: step to it.
332                let (tau, _) = boundary_tau(manifold, x, z.view(), p.view(), delta)?;
333                let eta = &z + &(&p * tau);
334                let red = model_reduction(manifold, objective, x, grad, eta.view())?;
335                return Ok((eta, red, true));
336            }
337            z = z_next;
338            let r_next = &r + &(&hp * alpha);
339            let r_next_norm = g_norm(manifold, x, r_next.view())?;
340            if r_next_norm <= tol {
341                let red = model_reduction(manifold, objective, x, grad, z.view())?;
342                let hit = g_norm(manifold, x, z.view())? >= BOUNDARY_FRAC * delta;
343                return Ok((z, red, hit));
344            }
345            let rr_next = g_inner(manifold, x, r_next.view(), r_next.view())?;
346            let beta = rr_next / rr;
347            p = &(-&r_next) + &(&p * beta);
348            r = r_next;
349        }
350        let red = model_reduction(manifold, objective, x, grad, z.view())?;
351        let hit = g_norm(manifold, x, z.view())? >= BOUNDARY_FRAC * delta;
352        Ok((z, red, hit))
353    }
354
355    /// Cauchy point: the exact minimizer of the model along the steepest-descent
356    /// direction `−grad` within the trust region. With no curvature available
357    /// the model is the decreasing linear `m(τ·(−grad)) = −τ‖grad‖²_g`, whose
358    /// constrained minimizer sits on the boundary `τ = Δ / ‖grad‖_g`, giving a
359    /// predicted reduction `Δ·‖grad‖_g`.
360    fn cauchy_point(
361        &self,
362        manifold: &dyn RiemannianManifold,
363        x: ArrayView1<'_, f64>,
364        grad: ArrayView1<'_, f64>,
365        delta: f64,
366    ) -> GeometryResult<(Array1<f64>, f64, bool)> {
367        let grad_norm = g_norm(manifold, x, grad.view())?;
368        if grad_norm <= 0.0 {
369            return Ok((Array1::<f64>::zeros(grad.len()), 0.0, false));
370        }
371        let tau = delta / grad_norm;
372        let step = &grad.to_owned() * (-tau);
373        // Predicted reduction of the linear model m(0) − m(η) = τ‖grad‖²_g.
374        let predicted = tau * grad_norm * grad_norm;
375        Ok((step, predicted, true))
376    }
377}
378
379/// Largest `τ ≥ 0` with `‖z + τ p‖_g = Δ`, solving the quadratic
380/// `‖p‖²_g τ² + 2 g(z,p) τ + (‖z‖²_g − Δ²) = 0`. Returns `(τ, ‖z + τp‖_g)`.
381fn boundary_tau(
382    manifold: &dyn RiemannianManifold,
383    x: ArrayView1<'_, f64>,
384    z: ArrayView1<'_, f64>,
385    p: ArrayView1<'_, f64>,
386    delta: f64,
387) -> GeometryResult<(f64, f64)> {
388    let pp = g_inner(manifold, x, p, p)?;
389    let zp = g_inner(manifold, x, z, p)?;
390    let zz = g_inner(manifold, x, z, z)?;
391    if pp <= 0.0 {
392        return Ok((0.0, zz.max(0.0).sqrt()));
393    }
394    let c = zz - delta * delta;
395    let disc = (zp * zp - pp * c).max(0.0);
396    let tau = (-zp + disc.sqrt()) / pp;
397    let tau = tau.max(0.0);
398    Ok((tau, delta))
399}
400
401/// Model reduction `m(0) − m(η) = −g(grad, η) − ½ g(η, Hη)`.
402fn model_reduction(
403    manifold: &dyn RiemannianManifold,
404    objective: &mut dyn RiemannianObjective,
405    x: ArrayView1<'_, f64>,
406    grad: ArrayView1<'_, f64>,
407    eta: ArrayView1<'_, f64>,
408) -> GeometryResult<f64> {
409    let lin = g_inner(manifold, x, grad, eta)?;
410    let heta = objective.hessian_vector_product(x, eta)?.ok_or(
411        crate::manifold::GeometryError::Unsupported(
412            "Hessian–vector product unavailable while scoring the model",
413        ),
414    )?;
415    let quad = g_inner(manifold, x, eta, heta.view())?;
416    Ok(-lin - 0.5 * quad)
417}
418
419#[derive(Debug, Clone, PartialEq)]
420pub struct RiemannianLBFGS {
421    pub history: usize,
422    pub step_size: f64,
423    pub max_iter: usize,
424    pub grad_tol: f64,
425}
426
427impl Default for RiemannianLBFGS {
428    fn default() -> Self {
429        Self {
430            history: 10,
431            step_size: 1.0,
432            max_iter: 100,
433            grad_tol: 1.0e-8,
434        }
435    }
436}
437
438/// One stored secant pair, kept with its base point so the two-loop recursion
439/// can transport it into whatever the current tangent space is. `s` and `y`
440/// both live in `T_{base}M`.
441#[derive(Clone)]
442struct SecantPair {
443    base: Array1<f64>,
444    s: Array1<f64>,
445    y: Array1<f64>,
446}
447
448impl RiemannianLBFGS {
449    /// Riemannian L-BFGS with a backtracking-and-expansion Armijo line search.
450    ///
451    /// The search starts at the user-supplied `step_size` (a hint, not a hard
452    /// cap) and first *expands* by doubling while the Armijo sufficient-
453    /// decrease condition continues to hold and the objective is still
454    /// strictly improving. Once expansion stalls, it accepts the best step
455    /// it has seen so far; if even the initial trial violates Armijo, it
456    /// *contracts* by halving until Armijo holds or a safeguard floor is
457    /// reached. This makes the optimizer robust to mis-scaled `step_size`
458    /// inputs (including the Newton-natural α=1 that BFGS expects on
459    /// well-conditioned quadratics) without forcing the caller to retune
460    /// it, and preserves the secant pair (s, y) curvature condition so the
461    /// L-BFGS inverse-Hessian approximation stays SPD.
462    ///
463    /// All inner products use the manifold metric `g_x(·,·)`, and every secant
464    /// pair is *parallel-transported into the current tangent space* before it
465    /// enters the two-loop recursion, so the BFGS algebra never mixes vectors
466    /// living in different tangent spaces (the bug fixed in #616). The freshly
467    /// formed secant pair likewise transports the accepted step from the old
468    /// tangent space into the new one before pairing it with the gradient
469    /// difference, so both `s` and `y` live in `T_{x_new}M`.
470    pub fn minimize(
471        &self,
472        manifold: &dyn RiemannianManifold,
473        objective: &mut dyn RiemannianObjective,
474        initial: ArrayView1<'_, f64>,
475    ) -> GeometryResult<Array1<f64>> {
476        let mut x = initial.to_owned();
477        let d = manifold.ambient_dim();
478        check_len("L-BFGS initial point", x.len(), d)?;
479        let mut history: Vec<SecantPair> = Vec::new();
480        let (mut f_curr, grad_e0) = objective.value_gradient(x.view())?;
481        // Riemannian gradient (metric-raised), not a bare tangent projection —
482        // see the trust region above and issue #955. The secant pairs, two-loop
483        // recursion, and Armijo slope are all metric inner products, so they
484        // must operate on the true Riemannian gradient.
485        let mut grad = manifold.riemannian_gradient(x.view(), grad_e0.view())?;
486        // Initial Riemannian gradient norm: the shift-invariant scale of the
487        // relative stationarity test (see `relative_stationarity`).
488        let grad0_norm = g_norm(manifold, x.view(), grad.view())?;
489        let armijo_c: f64 = 1.0e-4;
490        let alpha_min: f64 = 1.0e-16;
491        let alpha_max: f64 = 1.0e16;
492        let initial_step = if self.step_size.is_finite() && self.step_size > 0.0 {
493            self.step_size
494        } else {
495            1.0
496        };
497        for _ in 0..self.max_iter {
498            // Shift-invariant (relative) stationarity test, identical in form to
499            // the trust region's (see `relative_stationarity`): the current
500            // gradient norm is measured against the initial one, so a fixed
501            // `grad_tol` reads as a *relative* tolerance for a large-scale
502            // objective (e.g. the profiled REML latent objective, issue #879)
503            // while staying invariant under an additive shift of `f` (#954).
504            // Reduces to the absolute test on a unit-scale objective.
505            let grad_norm = g_norm(manifold, x.view(), grad.view())?;
506            if relative_stationarity(grad_norm, grad0_norm) <= self.grad_tol {
507                break;
508            }
509            let direction = two_loop(manifold, x.view(), grad.view(), &history)?;
510            let direction = -&direction;
511            let slope = g_inner(manifold, x.view(), grad.view(), direction.view())?;
512            // Guard against ascent directions caused by stale curvature; if the
513            // BFGS direction is not a (metric) descent direction, fall back to
514            // the projected steepest-descent direction so progress is
515            // guaranteed.
516            let (direction, slope) = if slope < 0.0 {
517                (direction, slope)
518            } else {
519                let sd = -&grad;
520                let s_sd = g_inner(manifold, x.view(), grad.view(), sd.view())?;
521                (sd, s_sd)
522            };
523            let old_x = x.clone();
524            let old_grad = grad.clone();
525            // --- Armijo line search with bidirectional adaptation. ---
526            let mut alpha = initial_step;
527            let mut best_alpha = 0.0;
528            let mut best_f = f_curr;
529            let mut best_x = x.clone();
530            let mut best_grad = grad.clone();
531            // First try to expand: while Armijo holds and the objective keeps
532            // improving, double the step.
533            loop {
534                let step = &direction * alpha;
535                let trial_x = manifold.retract(x.view(), step.view())?;
536                let (f_trial, g_trial_e) = objective.value_gradient(trial_x.view())?;
537                let armijo_rhs = f_curr + armijo_c * alpha * slope;
538                let accepted = f_trial.is_finite() && f_trial <= armijo_rhs;
539                if accepted && f_trial < best_f {
540                    best_alpha = alpha;
541                    best_f = f_trial;
542                    best_x = trial_x;
543                    // Riemannian (metric-raised) gradient at the trial point —
544                    // the object the secant pair (s, y) is formed from (#955).
545                    best_grad = manifold.riemannian_gradient(best_x.view(), g_trial_e.view())?;
546                    if alpha >= alpha_max {
547                        break;
548                    }
549                    alpha *= 2.0;
550                } else {
551                    break;
552                }
553            }
554            // If no expansion succeeded, contract from the initial trial.
555            if best_alpha == 0.0 {
556                alpha = initial_step;
557                while alpha > alpha_min {
558                    let step = &direction * alpha;
559                    let trial_x = manifold.retract(x.view(), step.view())?;
560                    let (f_trial, g_trial_e) = objective.value_gradient(trial_x.view())?;
561                    let armijo_rhs = f_curr + armijo_c * alpha * slope;
562                    if f_trial.is_finite() && f_trial <= armijo_rhs {
563                        best_alpha = alpha;
564                        best_f = f_trial;
565                        best_x = trial_x;
566                        // Riemannian (metric-raised) gradient at the trial point.
567                        best_grad =
568                            manifold.riemannian_gradient(best_x.view(), g_trial_e.view())?;
569                        break;
570                    }
571                    alpha *= 0.5;
572                }
573            }
574            if best_alpha == 0.0 {
575                // No admissible step found — terminate at the current point.
576                break;
577            }
578            // The accepted tangent step at `old_x` (the actual move taken).
579            let eta = &direction * best_alpha;
580            x = best_x;
581            f_curr = best_f;
582            grad = best_grad;
583
584            // --- Secant pair, formed entirely in T_{x_new}M (the #616 fix). ---
585            // Parallel-transport the accepted step from T_{old_x}M to T_{x}M so
586            // it is a tangent at the NEW point, matching the gradient there. The
587            // old gradient is likewise transported to T_{x}M before subtraction.
588            let path = transport_path(&old_x, &x);
589            let s = manifold.parallel_transport(path.view(), eta.view())?;
590            let transported_old_grad = manifold.parallel_transport(path.view(), old_grad.view())?;
591            let y = &grad - &transported_old_grad;
592            // Commit the (s, y) pair only when the metric curvature condition
593            // g_x(s, y) > 0 holds (strict positivity). This is required for the
594            // implicit BFGS inverse-Hessian update to remain SPD.
595            let sy = g_inner(manifold, x.view(), s.view(), y.view())?;
596            if sy > 1.0e-14 {
597                history.push(SecantPair {
598                    base: x.clone(),
599                    s,
600                    y,
601                });
602                if history.len() > self.history {
603                    history.remove(0);
604                }
605            }
606        }
607        Ok(x)
608    }
609}
610
611/// Build the 2×D point path matrix `[p_from; p_to]` consumed by
612/// [`RiemannianManifold::parallel_transport`].
613fn transport_path(p_from: &Array1<f64>, p_to: &Array1<f64>) -> ndarray::Array2<f64> {
614    let d = p_from.len();
615    let mut path = ndarray::Array2::<f64>::zeros((2, d));
616    path.row_mut(0).assign(p_from);
617    path.row_mut(1).assign(p_to);
618    path
619}
620
621/// L-BFGS two-loop recursion in the CURRENT tangent space `T_xM`.
622///
623/// Each stored secant pair `(s_i, y_i)` lives in `T_{base_i}M`; before it can
624/// participate in the recursion it is parallel-transported into `T_xM`. All
625/// inner products use the manifold metric `g_x(·,·)` at the current point, so
626/// no two vectors from different tangent spaces are ever combined (#616).
627fn two_loop(
628    manifold: &dyn RiemannianManifold,
629    x: ArrayView1<'_, f64>,
630    grad: ArrayView1<'_, f64>,
631    history: &[SecantPair],
632) -> GeometryResult<Array1<f64>> {
633    // Transport every stored pair into the current tangent space once.
634    let mut s_cur: Vec<Array1<f64>> = Vec::with_capacity(history.len());
635    let mut y_cur: Vec<Array1<f64>> = Vec::with_capacity(history.len());
636    for pair in history {
637        let path = transport_path(&pair.base, &x.to_owned());
638        s_cur.push(manifold.parallel_transport(path.view(), pair.s.view())?);
639        y_cur.push(manifold.parallel_transport(path.view(), pair.y.view())?);
640    }
641
642    let mut q = grad.to_owned();
643    let mut alpha = vec![0.0; history.len()];
644    let mut rho = vec![0.0; history.len()];
645    for i in (0..history.len()).rev() {
646        let sy = g_inner(manifold, x, s_cur[i].view(), y_cur[i].view())?;
647        rho[i] = 1.0 / sy;
648        alpha[i] = rho[i] * g_inner(manifold, x, s_cur[i].view(), q.view())?;
649        q = &q - &(&y_cur[i] * alpha[i]);
650    }
651    let mut r = q;
652    if let (Some(s), Some(y)) = (s_cur.last(), y_cur.last()) {
653        let yy = g_inner(manifold, x, y.view(), y.view())?;
654        if yy > 1.0e-14 {
655            let sy = g_inner(manifold, x, s.view(), y.view())?;
656            r = &r * (sy / yy);
657        }
658    }
659    for i in 0..history.len() {
660        let beta = rho[i] * g_inner(manifold, x, y_cur[i].view(), r.view())?;
661        r = &r + &(&s_cur[i] * (alpha[i] - beta));
662    }
663    Ok(r)
664}
665
666#[cfg(test)]
667mod tests {
668    use super::*;
669    use crate::EuclideanManifold;
670    use ndarray::{Array1, ArrayView1};
671
672    /// Scalar objective `f(x) = x²` on the 1-D Euclidean line. Gradient `2x`,
673    /// Hessian `2`, exposed as an HVP so the trust region runs Steihaug-CG.
674    struct Square;
675    impl RiemannianObjective for Square {
676        fn value_gradient(
677            &mut self,
678            point: ArrayView1<'_, f64>,
679        ) -> GeometryResult<(f64, Array1<f64>)> {
680            let x = point[0];
681            Ok((x * x, Array1::from_vec(vec![2.0 * x])))
682        }
683        fn hessian_vector_product(
684            &mut self,
685            point: ArrayView1<'_, f64>,
686            tangent: ArrayView1<'_, f64>,
687        ) -> GeometryResult<Option<Array1<f64>>> {
688            assert!(point.iter().all(|value| value.is_finite()));
689            check_len("hessian_vector_product tangent", tangent.len(), point.len())?;
690            Ok(Some(&tangent.to_owned() * 2.0))
691        }
692    }
693
694    /// Gradient-only variant of `f(x)=x²` (no HVP) to exercise the Cauchy-point
695    /// branch of the trust region.
696    struct SquareGradOnly;
697    impl RiemannianObjective for SquareGradOnly {
698        fn value_gradient(
699            &mut self,
700            point: ArrayView1<'_, f64>,
701        ) -> GeometryResult<(f64, Array1<f64>)> {
702            let x = point[0];
703            Ok((x * x, Array1::from_vec(vec![2.0 * x])))
704        }
705    }
706
707    /// General convex quadratic `f(x) = ½ xᵀ A x − bᵀ x` on Euclidean R^n with
708    /// SPD `A`; minimizer solves `A x = b`. Provides an exact HVP `A v`.
709    struct Quadratic {
710        a: ndarray::Array2<f64>,
711        b: Array1<f64>,
712    }
713    impl RiemannianObjective for Quadratic {
714        fn value_gradient(
715            &mut self,
716            point: ArrayView1<'_, f64>,
717        ) -> GeometryResult<(f64, Array1<f64>)> {
718            let ax = self.a.dot(&point.to_owned());
719            let val = 0.5 * point.dot(&ax) - self.b.dot(&point.to_owned());
720            let grad = &ax - &self.b;
721            Ok((val, grad))
722        }
723        fn hessian_vector_product(
724            &mut self,
725            point: ArrayView1<'_, f64>,
726            tangent: ArrayView1<'_, f64>,
727        ) -> GeometryResult<Option<Array1<f64>>> {
728            assert!(point.iter().all(|value| value.is_finite()));
729            check_len("hessian_vector_product tangent", tangent.len(), point.len())?;
730            Ok(Some(self.a.dot(&tangent.to_owned())))
731        }
732    }
733
734    /// (#615 counterexample) A correct trust region on `f(x)=x²` from `x₀=0.1`
735    /// with `Δ=1` must CONVERGE to 0 (not oscillate), monotonically driving `f`
736    /// down — never increasing it on an accepted iterate.
737    #[test]
738    fn trust_region_converges_on_square_steihaug() {
739        let manifold = EuclideanManifold::new(1);
740        let tr = RiemannianTrustRegion {
741            radius: 1.0,
742            max_radius: 1.0e6,
743            max_iter: 100,
744            grad_tol: 1.0e-12,
745        };
746        let mut obj = Square;
747        let x0 = Array1::from_vec(vec![0.1]);
748        let x = tr
749            .minimize(&manifold, &mut obj, x0.view())
750            .expect("TR runs");
751        assert!(
752            x[0].abs() < 1.0e-6,
753            "trust region must converge to 0, got {}",
754            x[0]
755        );
756    }
757
758    /// The trust region must never increase `f` across accepted iterates. We
759    /// check the monotone-descent invariant directly by stepping the public
760    /// `minimize` from a sequence of decreasing budgets and confirming the
761    /// returned value is below the start value, and that from `x₀=0.1` it does
762    /// not return a point with larger `|x|`.
763    #[test]
764    fn trust_region_never_increases_objective() {
765        let manifold = EuclideanManifold::new(1);
766        let tr = RiemannianTrustRegion {
767            radius: 1.0,
768            max_radius: 1.0e6,
769            max_iter: 1,
770            grad_tol: 1.0e-12,
771        };
772        let mut obj = Square;
773        // A single TR iteration from 0.1: with exact Hessian the Newton step
774        // lands at the minimum (inside Δ=1), ρ=1, so it must be accepted and f
775        // must strictly decrease.
776        let x0 = Array1::from_vec(vec![0.1]);
777        let f0 = obj.value_gradient(x0.view()).unwrap().0;
778        let x1 = tr
779            .minimize(&manifold, &mut obj, x0.view())
780            .expect("TR runs");
781        let f1 = obj.value_gradient(x1.view()).unwrap().0;
782        assert!(f1 <= f0, "objective increased: {f0} -> {f1}");
783        assert!(x1[0].abs() <= x0[0].abs() + 1e-15, "moved away from min");
784    }
785
786    /// Cauchy-point branch (no HVP) must still be a real trust-region method:
787    /// from `x₀=0.1`, `Δ=1` on `f(x)=x²` it converges toward 0 and never
788    /// oscillates upward in `f`.
789    #[test]
790    fn trust_region_cauchy_point_converges() {
791        let manifold = EuclideanManifold::new(1);
792        let tr = RiemannianTrustRegion {
793            radius: 1.0,
794            max_radius: 1.0e6,
795            max_iter: 500,
796            grad_tol: 1.0e-12,
797        };
798        let mut obj = SquareGradOnly;
799        let x0 = Array1::from_vec(vec![0.1]);
800        let x = tr
801            .minimize(&manifold, &mut obj, x0.view())
802            .expect("TR runs");
803        assert!(
804            x[0].abs() < 1.0e-6,
805            "Cauchy-point trust region must converge to 0, got {}",
806            x[0]
807        );
808    }
809
810    /// Steihaug-CG trust region on a 3-D SPD quadratic must reach the exact
811    /// minimizer `A⁻¹ b`.
812    #[test]
813    fn trust_region_solves_spd_quadratic() {
814        let manifold = EuclideanManifold::new(3);
815        let a = ndarray::array![[4.0, 1.0, 0.0], [1.0, 3.0, 1.0], [0.0, 1.0, 2.0],];
816        let b = Array1::from_vec(vec![1.0, 2.0, -1.0]);
817        // Reference solution A x = b.
818        let x_ref = crate::manifold::inverse(&a).unwrap().dot(&b);
819        let mut obj = Quadratic { a, b };
820        let tr = RiemannianTrustRegion {
821            radius: 1.0,
822            max_radius: 1.0e6,
823            max_iter: 200,
824            grad_tol: 1.0e-12,
825        };
826        let x0 = Array1::from_vec(vec![0.0, 0.0, 0.0]);
827        let x = tr
828            .minimize(&manifold, &mut obj, x0.view())
829            .expect("TR runs");
830        for i in 0..3 {
831            assert!(
832                (x[i] - x_ref[i]).abs() < 1.0e-6,
833                "component {i}: got {}, want {}",
834                x[i],
835                x_ref[i]
836            );
837        }
838    }
839
840    /// (#616 sanity) Riemannian L-BFGS on a Euclidean SPD quadratic must reduce
841    /// the objective and converge to the analytic minimizer `A⁻¹ b`.
842    #[test]
843    fn lbfgs_reduces_euclidean_quadratic() {
844        let manifold = EuclideanManifold::new(3);
845        let a = ndarray::array![[5.0, 1.0, 0.5], [1.0, 4.0, 1.0], [0.5, 1.0, 3.0],];
846        let b = Array1::from_vec(vec![2.0, -1.0, 0.5]);
847        let x_ref = crate::manifold::inverse(&a).unwrap().dot(&b);
848        let mut obj = Quadratic { a, b };
849        let lbfgs = RiemannianLBFGS {
850            history: 10,
851            step_size: 1.0,
852            max_iter: 200,
853            grad_tol: 1.0e-10,
854        };
855        let x0 = Array1::from_vec(vec![0.0, 0.0, 0.0]);
856        let f0 = obj.value_gradient(x0.view()).unwrap().0;
857        let x = lbfgs
858            .minimize(&manifold, &mut obj, x0.view())
859            .expect("L-BFGS runs");
860        let f1 = obj.value_gradient(x.view()).unwrap().0;
861        assert!(f1 < f0, "L-BFGS did not reduce the quadratic: {f0} -> {f1}");
862        for i in 0..3 {
863            assert!(
864                (x[i] - x_ref[i]).abs() < 1.0e-6,
865                "component {i}: got {}, want {}",
866                x[i],
867                x_ref[i]
868            );
869        }
870    }
871}