Skip to main content

gam_geometry/
manifold.rs

1use std::fmt;
2
3use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
4
5pub const GEOMETRY_EPS: f64 = 1.0e-12;
6
7#[derive(Debug, Clone, PartialEq)]
8pub enum GeometryError {
9    DimensionMismatch {
10        context: &'static str,
11        expected: usize,
12        got: usize,
13    },
14    InvalidPoint(&'static str),
15    Singular(&'static str),
16    /// A manifold primitive has no implementation for this manifold and must
17    /// not silently fall back to a wrong default (e.g. a curved-manifold VJP
18    /// for which no closed form is wired up yet).
19    Unsupported(&'static str),
20}
21
22impl fmt::Display for GeometryError {
23    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
24        match self {
25            Self::DimensionMismatch {
26                context,
27                expected,
28                got,
29            } => write!(f, "{context} expected length {expected}, got {got}"),
30            Self::InvalidPoint(message) => write!(f, "invalid manifold point: {message}"),
31            Self::Singular(message) => write!(f, "singular geometry operation: {message}"),
32            Self::Unsupported(message) => write!(f, "unsupported geometry operation: {message}"),
33        }
34    }
35}
36
37impl std::error::Error for GeometryError {}
38
39pub type GeometryResult<T> = Result<T, GeometryError>;
40
41pub trait RiemannianManifold: Send + Sync {
42    fn dim(&self) -> usize;
43
44    fn ambient_dim(&self) -> usize {
45        self.dim()
46    }
47
48    fn tangent_basis(&self, point: ArrayView1<'_, f64>) -> GeometryResult<Array2<f64>>;
49
50    fn exp_map(
51        &self,
52        point: ArrayView1<'_, f64>,
53        tangent_vec: ArrayView1<'_, f64>,
54    ) -> GeometryResult<Array1<f64>>;
55
56    fn log_map(
57        &self,
58        p_from: ArrayView1<'_, f64>,
59        p_to: ArrayView1<'_, f64>,
60    ) -> GeometryResult<Array1<f64>>;
61
62    fn parallel_transport(
63        &self,
64        point_along: ArrayView2<'_, f64>,
65        vec: ArrayView1<'_, f64>,
66    ) -> GeometryResult<Array1<f64>>;
67
68    fn metric_tensor(&self, point: ArrayView1<'_, f64>) -> GeometryResult<Array2<f64>>;
69
70    fn christoffel_symbols(&self, point: ArrayView1<'_, f64>) -> GeometryResult<Vec<Array2<f64>>> {
71        check_len("Christoffel point", point.len(), self.ambient_dim())?;
72        Err(GeometryError::Unsupported(
73            "Christoffel symbols require a manifold-specific local chart",
74        ))
75    }
76
77    fn sectional_curvature(
78        &self,
79        point: ArrayView1<'_, f64>,
80        tangent_pair: (ArrayView1<'_, f64>, ArrayView1<'_, f64>),
81    ) -> GeometryResult<f64>;
82
83    fn project_tangent(
84        &self,
85        point: ArrayView1<'_, f64>,
86        vec: ArrayView1<'_, f64>,
87    ) -> GeometryResult<Array1<f64>> {
88        // Default projection is the identity (Euclidean-flat tangent space).
89        // Validate that BOTH the base point and the tangent vector live in the
90        // ambient space so a caller passing a wrong-length vector fails fast
91        // here rather than producing a silently mis-shaped tangent vector. The
92        // tangent of `T_pM` is represented in the same ambient coordinates as
93        // the point, so its length must equal `ambient_dim()` too.
94        let expected = self.ambient_dim();
95        if point.len() != expected {
96            return Err(GeometryError::DimensionMismatch {
97                context: "project_tangent point",
98                expected,
99                got: point.len(),
100            });
101        }
102        if vec.len() != expected {
103            return Err(GeometryError::DimensionMismatch {
104                context: "project_tangent vector",
105                expected,
106                got: vec.len(),
107            });
108        }
109        Ok(vec.to_owned())
110    }
111
112    /// Riemannian gradient of a scalar `f` raised from its **ambient Euclidean
113    /// differential** `e` — the vector `∂f/∂x` in ambient coordinates that an
114    /// objective returns from its `value_gradient`.
115    ///
116    /// The Riemannian gradient is the Riesz representative of the differential
117    /// under the manifold metric `g`: the unique tangent vector `v` satisfying
118    ///
119    /// ```text
120    ///   g_x(v, ξ) = Df_x[ξ] = ⟨e, ξ⟩   for every tangent ξ.
121    /// ```
122    ///
123    /// Orthogonally projecting `e` onto the tangent space ([`project_tangent`])
124    /// produces `v` **only** for the embedded/identity metric. For a genuine
125    /// Riemannian metric (affine-invariant SPD, canonical Stiefel, …) the
126    /// differential must be *raised through the metric* — projecting alone gives
127    /// the wrong direction and the wrong slope, so any model linear term or
128    /// Armijo slope built from it is not even first-order accurate (issue #955).
129    ///
130    /// The default raises `e` in a tangent basis `B = tangent_basis(x)` against
131    /// the metric `G = metric_tensor(x)`:
132    ///
133    /// ```text
134    ///   v = B (Bᵀ G B)⁻¹ Bᵀ e.
135    /// ```
136    ///
137    /// This is the Riesz representative for ANY basis `B` of `T_xM` (proof: for
138    /// `ξ = B c`, `g_x(v, ξ) = eᵀ B (Bᵀ G B)⁻¹ (Bᵀ G B) c = eᵀ B c = ⟨e, ξ⟩`),
139    /// and it collapses to the orthogonal tangent projection `B Bᵀ e` exactly
140    /// when `B` is metric-orthonormal / the metric is the embedded one. It is the
141    /// mathematically correct fallback, so a future non-identity-metric manifold
142    /// is never silently first-order wrong.
143    ///
144    /// Manifolds whose tangent projection already coincides with this (every
145    /// *embedded* manifold carrying the induced metric — Euclidean, Sphere,
146    /// Circle, Torus, Grassmann) override with the O(m) `project_tangent`;
147    /// manifolds with a slick closed form (SPD: `P·sym(E)·P`; Stiefel:
148    /// `E − Y Eᵀ Y`) override with that, avoiding the dense `m×m` metric tensor.
149    fn riemannian_gradient(
150        &self,
151        point: ArrayView1<'_, f64>,
152        euclidean_grad: ArrayView1<'_, f64>,
153    ) -> GeometryResult<Array1<f64>> {
154        let m = self.ambient_dim();
155        check_len("riemannian_gradient point", point.len(), m)?;
156        check_len(
157            "riemannian_gradient euclidean_grad",
158            euclidean_grad.len(),
159            m,
160        )?;
161        let b = self.tangent_basis(point)?; // m × d
162        let g = self.metric_tensor(point)?; // m × m
163        // Bᵀ e  (length d) and the Gram matrix Bᵀ G B  (d × d).
164        let bt = b.t();
165        let bte = bt.dot(&euclidean_grad.to_owned());
166        let gb = g.dot(&b);
167        let btgb = bt.dot(&gb);
168        if btgb.nrows() == 0 {
169            // A zero-dimensional tangent space (no degrees of freedom): the only
170            // tangent vector is 0.
171            return Ok(Array1::<f64>::zeros(m));
172        }
173        // Solve (BᵀGB) c = Bᵀ e for the basis coordinates of v, then v = B c.
174        let c = inverse(&btgb)?.dot(&bte);
175        Ok(b.dot(&c))
176    }
177
178    fn retract(
179        &self,
180        point: ArrayView1<'_, f64>,
181        tangent_vec: ArrayView1<'_, f64>,
182    ) -> GeometryResult<Array1<f64>> {
183        self.exp_map(point, tangent_vec)
184    }
185
186    /// Whether [`retract`](Self::retract) is at least a SECOND-ORDER retraction,
187    /// i.e. `D²(f∘R_x)(0) = Hess f(x)` for all `f`, so the trust-region quadratic
188    /// model built from the Riemannian Hessian is a valid second-order model of
189    /// `f` along the retraction (issue #956).
190    ///
191    /// Manifolds whose `retract` is the exponential map or another second-order
192    /// retraction return `true` (the default — the default `retract` *is*
193    /// `exp_map`, which is second-order). A manifold exposing only a FIRST-ORDER
194    /// retraction (e.g. the Stiefel/Grassmann QR retraction `qf(Y + Δ)`, whose
195    /// acceleration at `0` is not normal to the manifold) must override this to
196    /// `false`: the linear model term `Df_x[η]` is retraction-independent and
197    /// stays correct, but the Riemannian-Hessian quadratic term is *not* the
198    /// second derivative of `f∘R_x` and would corrupt the predicted-vs-actual
199    /// reduction ratio `ρ` and hence the trust-region radius control. The trust
200    /// region falls back to the first-order-correct Cauchy model in that case.
201    fn retraction_is_second_order(&self) -> bool {
202        true
203    }
204
205    /// Vector–Jacobian product of the ambient map `exp_p(v)`.
206    ///
207    /// Given a cotangent `grad_output` w.r.t. the ambient output of
208    /// [`exp_map`](Self::exp_map), return `(grad_point, grad_tangent)`, the
209    /// pullbacks w.r.t. the base point `p` and the (raw, unprojected) tangent
210    /// input `v`. This is the analytic backward used by reverse-mode autodiff
211    /// wrappers (e.g. the Python `torch.autograd.Function` around
212    /// `manifold_exp_map`); it must never be the silent straight-through
213    /// identity for a curved manifold.
214    ///
215    /// The default is the exact VJP for *flat* manifolds, where
216    /// `exp_p(v) = p + v` in ambient coordinates and so both Jacobians are the
217    /// identity (Euclidean, Circle, Torus, and products thereof). Curved
218    /// manifolds **must** override this with their analytic Jacobi-field VJP;
219    /// a manifold without a closed form must override it to return an error
220    /// rather than inherit the wrong identity default.
221    fn exp_map_vjp(
222        &self,
223        point: ArrayView1<'_, f64>,
224        tangent_vec: ArrayView1<'_, f64>,
225        grad_output: ArrayView1<'_, f64>,
226    ) -> GeometryResult<(Array1<f64>, Array1<f64>)> {
227        let m = self.ambient_dim();
228        check_len("exp_map_vjp point", point.len(), m)?;
229        check_len("exp_map_vjp tangent", tangent_vec.len(), m)?;
230        check_len("exp_map_vjp grad_output", grad_output.len(), m)?;
231        Ok((grad_output.to_owned(), grad_output.to_owned()))
232    }
233}
234
235#[derive(Debug, Clone, PartialEq)]
236pub enum ManifoldSpec {
237    Euclidean(usize),
238    Circle,
239    Sphere { intrinsic_dim: usize },
240    Torus { dim: usize },
241    Grassmann { k: usize, n: usize },
242    Stiefel { k: usize, n: usize },
243    Spd { n: usize },
244    Product(Vec<ManifoldSpec>),
245}
246
247impl ManifoldSpec {
248    /// Instantiate the concrete [`RiemannianManifold`] for this descriptor.
249    ///
250    /// Fallible because the constrained-frame families have nonempty domains:
251    /// `Gr(k, n)` and `St(n, k)` exist only for `1 ≤ k ≤ n`. An out-of-domain
252    /// descriptor is rejected here (and recursively for [`Product`] parts)
253    /// before any dimension, projection, exponential, or curvature computation
254    /// can run on a nonexistent manifold.
255    ///
256    /// [`Product`]: Self::Product
257    pub fn build(&self) -> GeometryResult<Box<dyn RiemannianManifold>> {
258        match self {
259            Self::Euclidean(dim) => Ok(Box::new(crate::EuclideanManifold::new(*dim))),
260            Self::Circle => Ok(Box::new(crate::CircleManifold::new())),
261            Self::Sphere { intrinsic_dim } => {
262                Ok(Box::new(crate::SphereManifold::new(*intrinsic_dim)))
263            }
264            Self::Torus { dim } => Ok(Box::new(crate::TorusManifold::new(*dim))),
265            Self::Grassmann { k, n } => Ok(Box::new(crate::GrassmannManifold::new(*k, *n)?)),
266            Self::Stiefel { k, n } => Ok(Box::new(crate::StiefelManifold::new(*k, *n)?)),
267            Self::Spd { n } => Ok(Box::new(crate::SpdManifold::new(*n))),
268            Self::Product(parts) => {
269                let mut built = Vec::with_capacity(parts.len());
270                for part in parts {
271                    built.push(part.build()?);
272                }
273                Ok(Box::new(crate::ProductManifold::new(built)))
274            }
275        }
276    }
277}
278
279pub(crate) const fn check_len(
280    context: &'static str,
281    got: usize,
282    expected: usize,
283) -> GeometryResult<()> {
284    if got == expected {
285        Ok(())
286    } else {
287        Err(GeometryError::DimensionMismatch {
288            context,
289            expected,
290            got,
291        })
292    }
293}
294
295pub(crate) fn dot(a: ArrayView1<'_, f64>, b: ArrayView1<'_, f64>) -> f64 {
296    assert_eq!(a.len(), b.len());
297    let mut out = 0.0;
298    for i in 0..a.len() {
299        out += a[i] * b[i];
300    }
301    out
302}
303
304/// Multi-GPU row-tiled matrix product `A·B`, fanned across **all** usable
305/// devices.
306///
307/// `A` is `m×k` and `B` is `k×n`; the result is `m×n`. The single-device
308/// `fast_ab` shim already offloads this GEMM, but it pins the launch to the
309/// primary device. For a tall `A` (many independent output rows — the common
310/// case when a manifold operation is applied to a large batch of points/atoms),
311/// the rows split cleanly across the pool: we reshape `A` into a
312/// `tiles × rows_per_tile × k` batch and call the broadcast-`B` strided-batched
313/// GEMM, which [`crate::gpu::pool::scatter_batched`]es one cuBLAS call per device
314/// on its own bound context (`b` is shared across every tile). The output tiles
315/// are stitched back into the `m×n` result. Any leftover rows that don't fill a
316/// whole tile, and the entire batch when the pool has one device / the workload
317/// is below the multi-GPU floor / the runtime is unavailable, fall through to the
318/// auto-dispatch `fast_ab` (single-device GPU or faer). f64 throughout, so the
319/// result is identical regardless of which path produced it.
320///
321/// Choosing the tiling: we target as many equal tiles as there are output rows
322/// can support while keeping each tile a non-trivial GEMM, so the batch axis is
323/// long enough to cross `crate::gpu::linalg_dispatch`'s multi-GPU batch floor and spread
324/// across every device.
325pub(crate) fn fast_ab_rows_multi_gpu(
326    a: ArrayView2<'_, f64>,
327    b: ArrayView2<'_, f64>,
328) -> Array2<f64> {
329    use gam_linalg::faer_ndarray::fast_ab;
330    let (m, k) = a.dim();
331    let (kb, n) = b.dim();
332    assert_eq!(k, kb, "fast_ab_rows_multi_gpu inner dimension mismatch");
333
334    // Only worth the reshape/stitch overhead when the pool actually has more than
335    // one device and there are enough rows to tile across it; otherwise the plain
336    // single-device shim is strictly better.
337    let multi_gpu = gam_linalg::gpu_hook::gpu_dispatch().is_some_and(|d| d.device_count() > 1);
338    // The batch axis must clear the multi-GPU floor used inside the dispatch
339    // layer (64) for the split to engage, so we need at least that many tiles.
340    const MIN_TILES: usize = 64;
341    const MIN_TILE_ROWS: usize = 4;
342    if multi_gpu && m >= MIN_TILES * MIN_TILE_ROWS && n > 0 {
343        let rows_per_tile = (m / MIN_TILES).max(MIN_TILE_ROWS);
344        let tiles = m / rows_per_tile;
345        let covered = tiles * rows_per_tile;
346        // Reshape the first `covered` rows into a tiles×rows_per_tile×k batch
347        // (row-major reshape is exactly the row-block tiling we want).
348        let a3 = a
349            .slice(ndarray::s![0..covered, ..])
350            .to_owned()
351            .into_shape_with_order((tiles, rows_per_tile, k));
352        if let Ok(a3) = a3 {
353            if let Some(result3) = gam_linalg::gpu_hook::gpu_dispatch()
354                .and_then(|d| d.try_fast_ab_broadcast_b_batched(a3.view(), b.view()))
355            {
356                let mut out = Array2::<f64>::zeros((m, n));
357                for t in 0..tiles {
358                    let block = result3.index_axis(ndarray::Axis(0), t);
359                    out.slice_mut(ndarray::s![t * rows_per_tile..(t + 1) * rows_per_tile, ..])
360                        .assign(&block);
361                }
362                // Tail rows that didn't fill a whole tile finish on the
363                // single-device shim; the result is bit-identical f64.
364                if covered < m {
365                    let tail = fast_ab(&a.slice(ndarray::s![covered..m, ..]), &b);
366                    out.slice_mut(ndarray::s![covered..m, ..]).assign(&tail);
367                }
368                return out;
369            }
370        }
371    }
372    // Single device / small batch / no runtime: plain auto-dispatch GEMM.
373    fast_ab(&a, &b)
374}
375
376pub(crate) fn norm(a: ArrayView1<'_, f64>) -> f64 {
377    dot(a, a).sqrt()
378}
379
380/// Metric inner product `aᵀ G b` for a (symmetric) metric tensor `G`.
381///
382/// For a manifold whose `metric_tensor` is the ambient identity this reduces
383/// to the Euclidean `dot`; for one with a genuine Riemannian metric (e.g. the
384/// affine-invariant SPD metric) it evaluates the correct geometric inner
385/// product on the tangent space.
386pub(crate) fn quad_form(
387    g: ArrayView2<'_, f64>,
388    a: ArrayView1<'_, f64>,
389    b: ArrayView1<'_, f64>,
390) -> f64 {
391    let n = a.len();
392    assert_eq!(g.nrows(), n);
393    assert_eq!(g.ncols(), b.len());
394    // aᵀ G b: the inner matrix–vector product G·b is the O(n²) cost and is the
395    // hot kernel of every metric inner product (g_inner / g_norm) and of the
396    // metric Gram–Schmidt tangent basis. Route it through the GPU-dispatched
397    // fast_av shim so large-ambient metrics (SPD/Stiefel/Grassmann n²×n²) offload
398    // to the GPU; the trailing a·(Gb) is an O(n) dot.
399    let gb = gam_linalg::faer_ndarray::fast_av(&g, &b);
400    dot(a, gb.view())
401}
402
403pub(crate) fn identity(n: usize) -> Array2<f64> {
404    let mut out = Array2::<f64>::zeros((n, n));
405    for i in 0..n {
406        out[[i, i]] = 1.0;
407    }
408    out
409}
410
411pub(crate) fn zero_christoffel(dim: usize) -> Vec<Array2<f64>> {
412    (0..dim).map(|_| Array2::<f64>::zeros((dim, dim))).collect()
413}
414
415pub(crate) fn wrap_angle(theta: f64) -> f64 {
416    let two_pi = std::f64::consts::PI * 2.0;
417    (theta + std::f64::consts::PI).rem_euclid(two_pi) - std::f64::consts::PI
418}
419
420pub(crate) fn sym(a: &Array2<f64>) -> Array2<f64> {
421    let mut out = a.clone();
422    for i in 0..a.nrows() {
423        for j in 0..a.ncols() {
424            out[[i, j]] = 0.5 * (a[[i, j]] + a[[j, i]]);
425        }
426    }
427    out
428}
429
430pub(crate) fn from_flat(
431    v: ArrayView1<'_, f64>,
432    rows: usize,
433    cols: usize,
434) -> GeometryResult<Array2<f64>> {
435    check_len("flat matrix", v.len(), rows * cols)?;
436    let mut out = Array2::<f64>::zeros((rows, cols));
437    for i in 0..rows {
438        for j in 0..cols {
439            out[[i, j]] = v[i * cols + j];
440        }
441    }
442    Ok(out)
443}
444
445pub(crate) fn flatten(a: &Array2<f64>) -> Array1<f64> {
446    let mut out = Array1::<f64>::zeros(a.nrows() * a.ncols());
447    for i in 0..a.nrows() {
448        for j in 0..a.ncols() {
449            out[i * a.ncols() + j] = a[[i, j]];
450        }
451    }
452    out
453}
454
455/// Build a **Euclidean-orthonormal** basis of the tangent space at `point` by
456/// modified Gram–Schmidt over the projected ambient standard basis.
457///
458/// The returned columns satisfy `Qᵀ Q = I` under the *ambient Euclidean* inner
459/// product (the plain `dot`). This is the correct, intended basis for a
460/// manifold whose Riemannian metric *is* the embedded Euclidean metric on its
461/// horizontal tangent space — notably the **Grassmann** manifold, where the
462/// tangent inner product is `tr(Δ₁ᵀΔ₂)`.
463///
464/// It is **not** metric-orthonormal for a manifold with a non-Euclidean metric
465/// (Stiefel's canonical metric `⟨Δ₁,Δ₂⟩ = tr(Δ₁ᵀ(I−½YYᵀ)Δ₂)`, or SPD's
466/// affine-invariant metric): for those, use
467/// [`tangent_basis_metric_orthonormal`], which Gram–Schmidts under the
468/// manifold's own `metric_tensor`.
469///
470/// This is the shared engine behind [`tangent_basis`](RiemannianManifold::tangent_basis)
471/// for the matrix manifolds whose tangent space has no closed-form basis. It
472/// walks the `n × k` standard basis in column-major order (outer `col`, inner
473/// `row`), projects each `e_{row,col}` onto the tangent space via
474/// `m.project_tangent`, re-orthogonalizes against the columns accepted so far,
475/// and keeps it iff its residual norm exceeds the `1e-10` drop tolerance,
476/// stopping the moment `m.dim()` independent directions have been collected.
477/// Each caller keeps its own input validation and then delegates here, so the
478/// numerically delicate orthogonalization order, drop tolerance, and early-exit
479/// logic live in exactly one place.
480pub(crate) fn projected_standard_basis_tangent<M: RiemannianManifold + ?Sized>(
481    m: &M,
482    point: ArrayView1<'_, f64>,
483    n: usize,
484    k: usize,
485) -> GeometryResult<Array2<f64>> {
486    let mut columns: Vec<Array1<f64>> = Vec::with_capacity(m.dim());
487    for col in 0..k {
488        for row in 0..n {
489            let mut e = Array2::<f64>::zeros((n, k));
490            e[[row, col]] = 1.0;
491            let mut v = m.project_tangent(point, flatten(&e).view())?;
492            for q in &columns {
493                let proj = dot(q.view(), v.view());
494                v -= &(q * proj);
495            }
496            let nrm = dot(v.view(), v.view()).sqrt();
497            if nrm > 1.0e-10 {
498                columns.push(v / nrm);
499            }
500            if columns.len() == m.dim() {
501                let mut out = Array2::<f64>::zeros((m.ambient_dim(), m.dim()));
502                for j in 0..columns.len() {
503                    for i in 0..m.ambient_dim() {
504                        out[[i, j]] = columns[j][i];
505                    }
506                }
507                return Ok(out);
508            }
509        }
510    }
511    Ok(Array2::<f64>::zeros((m.ambient_dim(), columns.len())))
512}
513
514/// Build a **metric-orthonormal** basis of the tangent space at `point`, i.e. a
515/// set of columns `Q` satisfying `Qᵀ W Q = I` where `W = m.metric_tensor(point)`
516/// is the manifold's Riemannian metric in flattened ambient coordinates.
517///
518/// This is the correct tangent basis for a manifold whose metric is **not** the
519/// embedded Euclidean inner product — Stiefel's canonical metric
520/// `⟨Δ₁,Δ₂⟩ = tr(Δ₁ᵀ(I−½YYᵀ)Δ₂)` and SPD's affine-invariant metric. (For a
521/// Euclidean-metric manifold like Grassmann, `W = I` and this coincides with
522/// [`projected_standard_basis_tangent`].)
523///
524/// Same projected-standard-basis walk as the Euclidean routine, but every inner
525/// product is the metric inner product `⟨u,v⟩_W = uᵀ W v` (via
526/// [`quad_form`]): Gram–Schmidt projections subtract `⟨q,v⟩_W · q` and the
527/// retained columns are normalized by `‖v‖_W = sqrt(⟨v,v⟩_W)`, so the resulting
528/// `Q` is orthonormal *in the manifold's metric*.
529///
530/// Concretely on `St(3, 2)` at `Y = [e₁, e₂]`, the vertical tangent
531/// `Δ = Y·[[0,−1],[1,0]]` has Euclidean norm² 2 but canonical-metric norm² 1, so
532/// a metric-orthonormal basis must reflect that — the Euclidean routine would
533/// mis-scale it.
534pub(crate) fn tangent_basis_metric_orthonormal<M: RiemannianManifold + ?Sized>(
535    m: &M,
536    point: ArrayView1<'_, f64>,
537    n: usize,
538    k: usize,
539) -> GeometryResult<Array2<f64>> {
540    let w = m.metric_tensor(point)?;
541    let mut columns: Vec<Array1<f64>> = Vec::with_capacity(m.dim());
542    for col in 0..k {
543        for row in 0..n {
544            let mut e = Array2::<f64>::zeros((n, k));
545            e[[row, col]] = 1.0;
546            let mut v = m.project_tangent(point, flatten(&e).view())?;
547            for q in &columns {
548                let proj = quad_form(w.view(), q.view(), v.view());
549                v -= &(q * proj);
550            }
551            let nrm = quad_form(w.view(), v.view(), v.view()).max(0.0).sqrt();
552            if nrm > 1.0e-10 {
553                columns.push(v / nrm);
554            }
555            if columns.len() == m.dim() {
556                let mut out = Array2::<f64>::zeros((m.ambient_dim(), m.dim()));
557                for j in 0..columns.len() {
558                    for i in 0..m.ambient_dim() {
559                        out[[i, j]] = columns[j][i];
560                    }
561                }
562                return Ok(out);
563            }
564        }
565    }
566    Ok(Array2::<f64>::zeros((m.ambient_dim(), columns.len())))
567}
568
569/// Thin/compact Gram–Schmidt QR factorization `A = Q·R` for an `n×k` input
570/// (`n ≥ k`). The returned `Q` is `n×k` with **orthonormal columns**
571/// (`QᵀQ = I`) and `R` is `k×k` upper-triangular.
572///
573/// On a rank-deficient column (residual ≈ 0 after orthogonalizing against the
574/// previously accepted columns) the diagonal `R[j, j]` is set to 0 and a
575/// *fallback* unit column is synthesized so the column count stays `k` and `Q`
576/// remains a valid orthonormal frame. The fallback is a standard axis `e_a`
577/// Gram–Schmidted against ALL previously accepted columns and renormalized; if
578/// that residual also vanishes (the axis lies in the accepted span) the next
579/// axis is tried, until an axis with a nonzero orthogonal residual is found.
580/// Simply planting `e_j` (the old behavior) breaks orthonormality — e.g. two
581/// identical columns `(1,1)/√2` would yield a fallback `e₂` with
582/// `q₁·q₂ = 1/√2 ≠ 0`.
583pub(crate) fn qr_thin(a: &Array2<f64>) -> (Array2<f64>, Array2<f64>) {
584    let n = a.nrows();
585    let k = a.ncols();
586    let mut q = Array2::<f64>::zeros((n, k));
587    let mut r = Array2::<f64>::zeros((k, k));
588    for j in 0..k {
589        let mut v = a.column(j).to_owned();
590        for i in 0..j {
591            let qi = q.column(i);
592            let rij = dot(qi, v.view());
593            r[[i, j]] = rij;
594            for row in 0..n {
595                v[row] -= rij * q[[row, i]];
596            }
597        }
598        let nrm = norm(v.view());
599        if nrm > GEOMETRY_EPS {
600            r[[j, j]] = nrm;
601            for row in 0..n {
602                q[[row, j]] = v[row] / nrm;
603            }
604        } else {
605            // Rank-deficient column: `R[j, j] = 0`. Synthesize a fallback unit
606            // column orthogonal to ALL accepted columns 0..j by Gram–Schmidting
607            // a standard axis against them; try successive axes until one has a
608            // nonzero orthogonal residual (always succeeds for j < n since the
609            // accepted columns span a j-dimensional subspace of ℝⁿ, leaving an
610            // (n−j)-dimensional orthogonal complement that at least one axis
611            // touches).
612            r[[j, j]] = 0.0;
613            for axis in 0..n {
614                let mut f = Array1::<f64>::zeros(n);
615                f[axis] = 1.0;
616                for i in 0..j {
617                    let qi = q.column(i);
618                    let proj = dot(qi, f.view());
619                    for row in 0..n {
620                        f[row] -= proj * q[[row, i]];
621                    }
622                }
623                let fnrm = norm(f.view());
624                if fnrm > GEOMETRY_EPS {
625                    for row in 0..n {
626                        q[[row, j]] = f[row] / fnrm;
627                    }
628                    break;
629                }
630            }
631        }
632    }
633    (q, r)
634}
635
636pub(crate) fn inverse(a: &Array2<f64>) -> GeometryResult<Array2<f64>> {
637    let n = a.nrows();
638    if n != a.ncols() {
639        return Err(GeometryError::Singular("inverse requires a square matrix"));
640    }
641    let mut aug = Array2::<f64>::zeros((n, 2 * n));
642    for i in 0..n {
643        for j in 0..n {
644            aug[[i, j]] = a[[i, j]];
645        }
646        aug[[i, n + i]] = 1.0;
647    }
648    for col in 0..n {
649        let mut pivot = col;
650        let mut best = aug[[col, col]].abs();
651        for row in col + 1..n {
652            let val = aug[[row, col]].abs();
653            if val > best {
654                best = val;
655                pivot = row;
656            }
657        }
658        if best < GEOMETRY_EPS {
659            return Err(GeometryError::Singular("matrix inverse pivot underflow"));
660        }
661        if pivot != col {
662            for j in 0..2 * n {
663                let tmp = aug[[col, j]];
664                aug[[col, j]] = aug[[pivot, j]];
665                aug[[pivot, j]] = tmp;
666            }
667        }
668        let scale = aug[[col, col]];
669        for j in 0..2 * n {
670            aug[[col, j]] /= scale;
671        }
672        for row in 0..n {
673            if row == col {
674                continue;
675            }
676            let factor = aug[[row, col]];
677            for j in 0..2 * n {
678                aug[[row, j]] -= factor * aug[[col, j]];
679            }
680        }
681    }
682    let mut out = Array2::<f64>::zeros((n, n));
683    for i in 0..n {
684        for j in 0..n {
685            out[[i, j]] = aug[[i, n + j]];
686        }
687    }
688    Ok(out)
689}
690
691/// Sweep budget multiplier for the classical Jacobi eigensolver: the iteration
692/// cap is `JACOBI_SWEEP_BUDGET · n²`. Classical (largest-off-diagonal) Jacobi
693/// converges quadratically once the off-diagonals are small, needing only a
694/// handful of full `O(n²)` sweeps; this generous multiple lets even clustered
695/// spectra finish while still failing loudly on a genuinely stalled matrix.
696const JACOBI_SWEEP_BUDGET: usize = 64;
697
698/// Relative off-diagonal convergence threshold for [`jacobi_symmetric`]: the
699/// largest off-diagonal magnitude must fall below `JACOBI_REL_TOL · ‖A‖_F`. Near
700/// `f64` precision so the diagonalization is accurate to working precision.
701const JACOBI_REL_TOL: f64 = 1.0e-13;
702
703pub(crate) fn jacobi_symmetric(a: &Array2<f64>) -> GeometryResult<(Array1<f64>, Array2<f64>)> {
704    let n = a.nrows();
705    if n != a.ncols() {
706        return Err(GeometryError::InvalidPoint(
707            "Jacobi eigensolver requires square input",
708        ));
709    }
710    let mut d = sym(a);
711    let mut v = identity(n);
712    let max_iter = JACOBI_SWEEP_BUDGET * n.max(1) * n.max(1);
713    // Relative convergence threshold: the largest off-diagonal magnitude must
714    // fall to `1e-13 * ||A||_F`. A fixed absolute `1e-13` is meaningless for
715    // matrices whose scale is far from unity (a well-scaled large-norm matrix
716    // could never reach it; a tiny-norm matrix would "converge" trivially),
717    // and silently returning the partially-diagonalized state after exhausting
718    // `max_iter` hides genuine non-convergence (e.g. clustered/degenerate
719    // spectra that stall the classical sweep). The Frobenius norm is invariant
720    // under the orthogonal Jacobi rotations, so it is computed once from the
721    // symmetrized input.
722    let frob_norm = {
723        let mut acc = 0.0;
724        for i in 0..n {
725            for j in 0..n {
726                acc += d[[i, j]] * d[[i, j]];
727            }
728        }
729        acc.sqrt()
730    };
731    let threshold = JACOBI_REL_TOL * frob_norm;
732    let mut converged = false;
733    for _ in 0..max_iter {
734        let mut p = 0usize;
735        let mut q = 0usize;
736        let mut best = 0.0;
737        for i in 0..n {
738            for j in i + 1..n {
739                let val = d[[i, j]].abs();
740                if val > best {
741                    best = val;
742                    p = i;
743                    q = j;
744                }
745            }
746        }
747        // `best <= threshold` (rather than `<`) makes the exactly-diagonal and
748        // zero-norm cases (`best == threshold == 0`) converge immediately.
749        if best <= threshold {
750            converged = true;
751            break;
752        }
753        let tau = (d[[q, q]] - d[[p, p]]) / (2.0 * d[[p, q]]);
754        let t = tau.signum() / (tau.abs() + (1.0 + tau * tau).sqrt());
755        let c = 1.0 / (1.0 + t * t).sqrt();
756        let s = t * c;
757        for k in 0..n {
758            let dpk = d[[p, k]];
759            let dqk = d[[q, k]];
760            d[[p, k]] = c * dpk - s * dqk;
761            d[[q, k]] = s * dpk + c * dqk;
762        }
763        for k in 0..n {
764            let dkp = d[[k, p]];
765            let dkq = d[[k, q]];
766            d[[k, p]] = c * dkp - s * dkq;
767            d[[k, q]] = s * dkp + c * dkq;
768        }
769        for k in 0..n {
770            let vkp = v[[k, p]];
771            let vkq = v[[k, q]];
772            v[[k, p]] = c * vkp - s * vkq;
773            v[[k, q]] = s * vkp + c * vkq;
774        }
775    }
776    if !converged {
777        return Err(GeometryError::Singular(
778            "Jacobi eigensolver did not converge within max_iter (off-diagonal mass above 1e-13 * Frobenius norm)",
779        ));
780    }
781    let mut evals = Array1::<f64>::zeros(n);
782    for i in 0..n {
783        evals[i] = d[[i, i]];
784    }
785    Ok((evals, v))
786}
787
788pub(crate) fn spectral_map_spd(
789    a: &Array2<f64>,
790    f: impl Fn(f64) -> GeometryResult<f64>,
791) -> GeometryResult<Array2<f64>> {
792    let (evals, evecs) = jacobi_symmetric(a)?;
793    let n = a.nrows();
794    let mut diag = Array2::<f64>::zeros((n, n));
795    for i in 0..n {
796        if evals[i] <= 0.0 || !evals[i].is_finite() {
797            return Err(GeometryError::InvalidPoint(
798                "SPD eigenvalue is not positive",
799            ));
800        }
801        diag[[i, i]] = f(evals[i])?;
802    }
803    // Reconstruction V·f(Λ)·Vᵀ: two dense n×n products GPU-dispatched via
804    // fast_ab/fast_abt for large ambient dimension.
805    use gam_linalg::faer_ndarray::{fast_ab, fast_abt};
806    Ok(fast_abt(&fast_ab(&evecs, &diag), &evecs))
807}
808
809pub(crate) fn spectral_map_symmetric(
810    a: &Array2<f64>,
811    f: impl Fn(f64) -> GeometryResult<f64>,
812) -> GeometryResult<Array2<f64>> {
813    let (evals, evecs) = jacobi_symmetric(a)?;
814    let n = a.nrows();
815    let mut diag = Array2::<f64>::zeros((n, n));
816    for i in 0..n {
817        diag[[i, i]] = f(evals[i])?;
818    }
819    // Reconstruction V·f(Λ)·Vᵀ, GPU-dispatched via fast_ab/fast_abt.
820    use gam_linalg::faer_ndarray::{fast_ab, fast_abt};
821    Ok(fast_abt(&fast_ab(&evecs, &diag), &evecs))
822}
823
824/// Thin singular value decomposition of a tall matrix `Y` (`n × k`, `n ≥ k`)
825/// via the symmetric eigendecomposition of the small `k × k` Gram matrix
826/// `YᵀY = V Σ² Vᵀ`: returns `(U, σ, V)` with `Y = U diag(σ) Vᵀ`, where `U` is
827/// `n × k` with orthonormal columns spanning `range(Y)`, `σ` holds the singular
828/// values, and `V` is `k × k` orthogonal. Forming the Gram keeps the
829/// eigenproblem at the small dimension `k`; the two products that carry the
830/// large ambient dimension `n` (`YᵀY` and `U = Y V Σ⁻¹`) are GPU-dispatched.
831///
832/// A numerically-zero singular value (`σ ≤ GEOMETRY_EPS`) leaves the
833/// corresponding `U` column zero rather than dividing through, which is what the
834/// Grassmann/Stiefel geodesic needs (a zero singular value is a vanishing
835/// principal angle); a caller requiring full rank inspects `σ` itself.
836pub(crate) fn thin_svd_gram(
837    y: &Array2<f64>,
838) -> GeometryResult<(Array2<f64>, Array1<f64>, Array2<f64>)> {
839    use gam_linalg::faer_ndarray::{fast_ab, fast_atb};
840    let (n, k) = y.dim();
841    let gram = fast_atb(y, y);
842    let (evals, v) = jacobi_symmetric(&gram)?;
843    let yv = fast_ab(y, &v);
844    let mut sigma = Array1::<f64>::zeros(k);
845    let mut u = Array2::<f64>::zeros((n, k));
846    for j in 0..k {
847        sigma[j] = evals[j].max(0.0).sqrt();
848        if sigma[j] > GEOMETRY_EPS {
849            let inv_sigma = 1.0 / sigma[j];
850            for i in 0..n {
851                u[[i, j]] = yv[[i, j]] * inv_sigma;
852            }
853        }
854    }
855    Ok((u, sigma, v))
856}
857
858/// Dense matrix exponential `exp(A)` via scaling-and-squaring with a truncated
859/// Taylor series. The Frobenius norm of `A` is driven below 1/4 by repeated
860/// halving (`A → A / 2^s`), where Taylor converges rapidly and stably; the
861/// result is then squared `s` times. With the scaled norm `θ < 1/4`, the
862/// degree-12 Taylor tail is bounded by `θ^{13} / 13! · 1/(1 - θ)`; since `13! ≈
863/// 6.23e9`, this is below `4·0.25^{13}/6.23e9 ≈ 3.8e-18`, i.e. under one f64 ulp,
864/// so the fixed degree truly reaches full f64 precision (the `< 1/2` threshold
865/// previously used left a ~2e-14 tail, two orders above an ulp). This is the
866/// standard, exact algorithm; no eigendecomposition is assumed (the inputs here
867/// are the non-normal canonical-metric block matrices on Stiefel, which are
868/// skew-like but not symmetric, so `spectral_map_*` does not apply).
869pub(crate) fn matrix_exp(a: &Array2<f64>) -> GeometryResult<Array2<f64>> {
870    let n = a.nrows();
871    if n != a.ncols() {
872        return Err(GeometryError::InvalidPoint(
873            "matrix exponential requires square input",
874        ));
875    }
876    if !a.iter().all(|v| v.is_finite()) {
877        return Err(GeometryError::InvalidPoint(
878            "matrix exponential requires finite entries",
879        ));
880    }
881    // Frobenius norm; choose the squaring count so the scaled matrix has norm
882    // below 1/4, which keeps the degree-12 Taylor truncation under one f64 ulp.
883    let mut frob = 0.0;
884    for v in a.iter() {
885        frob += v * v;
886    }
887    let frob = frob.sqrt();
888    let squarings = if frob > 0.25 {
889        (frob / 0.25).log2().ceil() as i32
890    } else {
891        0
892    };
893    let scale = 2.0_f64.powi(squarings);
894    let a_scaled = a / scale;
895
896    // exp(A_scaled) = sum_{k>=0} A_scaled^k / k! by term recurrence:
897    //   term_k = term_{k-1} · A_scaled / k.
898    // Both the Taylor term recurrence and the scaling-and-squaring use dense
899    // n×n products; GPU-dispatch them via fast_ab for large blocks.
900    use gam_linalg::faer_ndarray::fast_ab;
901    let mut result = identity(n);
902    let mut term = identity(n);
903    for k in 1..=12 {
904        term = fast_ab(&term, &a_scaled) / (k as f64);
905        result = result + &term;
906    }
907    // exp(A) = exp(A_scaled)^{2^squarings}.
908    for _ in 0..squarings {
909        result = fast_ab(&result, &result);
910    }
911    Ok(result)
912}
913
914/// Principal real logarithm of a real **orthogonal** matrix `V`, returned as
915/// the skew-symmetric `S` with `exp(S) = V`.
916///
917/// Rather than reach for a general (Schur-based) matrix logarithm — which the
918/// linear-algebra backend does not expose — this exploits the structure of an
919/// orthogonal matrix. Split `V = M + K` into its symmetric and skew parts
920///
921/// ```text
922///   M = ½(V + Vᵀ)   (symmetric, eigenvalues cos θⱼ ∈ [−1, 1])
923///   K = ½(V − Vᵀ)   (skew)
924/// ```
925///
926/// For an orthogonal (hence normal) `V`, `M` and `K` are both polynomials in
927/// `V`, so they **commute** and are simultaneously block-diagonalizable. In an
928/// eigenbasis `Q` of the symmetric `M` (which the self-adjoint eigensolver
929/// returns), `K̃ = QᵀKQ` is block-diagonal across distinct eigenvalues of `M`.
930/// On each 2-D rotation plane `M` has the degenerate eigenvalue `cos θ` and `K`
931/// acts as a skew `[[0,−sin θ],[sin θ,0]]`, whose principal logarithm is the
932/// same skew matrix scaled by `θ / sin θ`. Because `cos θ ↦ θ = arccos(cos θ)`
933/// is single-valued on `(0, π)`, the scale `c(λ) = arccos(λ)/√(1−λ²)` is a
934/// well-defined function of the eigenvalue `λ` of `M`, independent of the
935/// arbitrary in-plane basis the eigensolver picks. The whole logarithm is then
936///
937/// ```text
938///   S = Q · (c(λ̄ᵢⱼ) ⊙ K̃) · Qᵀ ,    λ̄ᵢⱼ = ½(λᵢ + λⱼ),
939/// ```
940///
941/// the element-wise scaling being exact on-block (where `λᵢ = λⱼ`) and
942/// multiplying a numerically-zero entry off-block (where `K̃ᵢⱼ ≈ 0` because the
943/// blocks are decoupled). The scaling is symmetric in `(i, j)`, so `S` stays
944/// skew.
945///
946/// An eigenvalue `λ → −1` is a rotation by `π`: the geodesic to that point is
947/// not unique (it is the cut locus / beyond the injectivity radius), so the
948/// principal logarithm does not exist. We refuse rather than return a value
949/// that silently picks one of the two equal-length geodesics.
950pub(crate) fn skew_log_orthogonal(v: &Array2<f64>) -> GeometryResult<Array2<f64>> {
951    use faer::Side;
952    use gam_linalg::faer_ndarray::{FaerEigh, fast_ab, fast_abt, fast_atb};
953
954    let n = v.nrows();
955    if v.ncols() != n {
956        return Err(GeometryError::InvalidPoint(
957            "matrix logarithm requires a square matrix",
958        ));
959    }
960    if !v.iter().all(|x| x.is_finite()) {
961        return Err(GeometryError::InvalidPoint(
962            "matrix logarithm requires finite entries",
963        ));
964    }
965    let mut m = Array2::<f64>::zeros((n, n));
966    let mut k = Array2::<f64>::zeros((n, n));
967    for i in 0..n {
968        for j in 0..n {
969            m[[i, j]] = 0.5 * (v[[i, j]] + v[[j, i]]);
970            k[[i, j]] = 0.5 * (v[[i, j]] - v[[j, i]]);
971        }
972    }
973    let (evals, q) = m.eigh(Side::Lower).map_err(|_| {
974        GeometryError::Singular("matrix logarithm: symmetric eigendecomposition failed")
975    })?;
976    // A rotation by π (eigenvalue −1 of V) is the cut locus: the logarithm is
977    // not single-valued there. Detect it from M's spectrum directly — on such a
978    // plane sin θ = 0 so K carries no signal and an element-wise scaling would
979    // silently drop the π rotation.
980    const CUT_LOCUS_EPS: f64 = 1.0e-7;
981    if evals.iter().any(|&lam| lam <= -1.0 + CUT_LOCUS_EPS) {
982        return Err(GeometryError::Unsupported(
983            "matrix logarithm undefined: rotation angle at π (beyond the injectivity radius)",
984        ));
985    }
986    let kt = fast_ab(&fast_atb(&q, &k), &q); // K̃ = Qᵀ K Q
987    let mut st = Array2::<f64>::zeros((n, n));
988    for i in 0..n {
989        for j in 0..n {
990            let lam = (0.5 * (evals[i] + evals[j])).clamp(-1.0, 1.0);
991            let sin_theta = (1.0 - lam * lam).max(0.0).sqrt();
992            // c(λ) = θ / sin θ, with the removable singularity at θ = 0
993            // (λ = 1) taken in the limit c → 1.
994            let scale = if sin_theta <= 1.0e-9 {
995                1.0
996            } else {
997                lam.acos() / sin_theta
998            };
999            st[[i, j]] = scale * kt[[i, j]];
1000        }
1001    }
1002    let s = fast_abt(&fast_ab(&q, &st), &q); // Q S̃ Qᵀ
1003    // Project out the rounding-level symmetric part so the result is exactly
1004    // skew, as the logarithm of an orthogonal matrix must be.
1005    let mut out = Array2::<f64>::zeros((n, n));
1006    for i in 0..n {
1007        for j in 0..n {
1008            out[[i, j]] = 0.5 * (s[[i, j]] - s[[j, i]]);
1009        }
1010    }
1011    Ok(out)
1012}
1013
1014/// Complete the `m × p` matrix `cols` (assumed to have orthonormal columns) to
1015/// a full `m × m` orthogonal matrix `[cols | C]`, returning the completion in
1016/// place: the first `p` columns are `cols`, the remaining `m − p` are an
1017/// orthonormal basis of the orthogonal complement of `cols`'s column space.
1018///
1019/// The complement is built by Gram–Schmidt-ing the standard axes `e₀ … e_{m−1}`
1020/// (in order) against the accumulated columns, with one reorthogonalization
1021/// pass for numerical safety. Taking the axes in order means that when `cols`
1022/// is `[Iₚ; 0]` the completion is exactly `[0; I_{m−p}]`, so the assembled
1023/// matrix is the identity — the property the Stiefel logarithm relies on to
1024/// start its iteration near `I₂ₚ` for nearby frames. The result is forced into
1025/// `SO(m)` (determinant `+1`) by flipping the sign of the last completion
1026/// column when needed, so its principal logarithm is skew-symmetric.
1027pub(crate) fn orthonormal_completion(cols: &Array2<f64>) -> Array2<f64> {
1028    let m = cols.nrows();
1029    let p = cols.ncols();
1030    let mut basis = Array2::<f64>::zeros((m, m));
1031    for j in 0..p {
1032        for i in 0..m {
1033            basis[[i, j]] = cols[[i, j]];
1034        }
1035    }
1036    let mut filled = p;
1037    let mut axis = 0usize;
1038    while filled < m && axis < m {
1039        let mut f = Array1::<f64>::zeros(m);
1040        f[axis] = 1.0;
1041        // Two Gram–Schmidt passes against the columns accepted so far.
1042        for _ in 0..2 {
1043            for c in 0..filled {
1044                let col = basis.column(c);
1045                let proj = dot(col, f.view());
1046                for i in 0..m {
1047                    f[i] -= proj * basis[[i, c]];
1048                }
1049            }
1050        }
1051        let nrm = norm(f.view());
1052        if nrm > GEOMETRY_EPS {
1053            for i in 0..m {
1054                basis[[i, filled]] = f[i] / nrm;
1055            }
1056            filled += 1;
1057        }
1058        axis += 1;
1059    }
1060    // Force det = +1 so the completion lies in SO(m) and its principal log is
1061    // skew. det of an orthogonal matrix is ±1; flip the last *appended* column
1062    // if −1. When nothing was appended (`p == m`, e.g. a square input) the
1063    // input columns are returned untouched — flipping one would corrupt the
1064    // caller's frame, and a square input's orientation is the caller's to own.
1065    if filled == m && m > p && matrix_det(&basis) < 0.0 {
1066        for i in 0..m {
1067            basis[[i, m - 1]] = -basis[[i, m - 1]];
1068        }
1069    }
1070    basis
1071}
1072
1073/// Determinant via Gaussian elimination with partial pivoting. Used only for
1074/// small orientation checks (e.g. forcing a completion into `SO(n)`); not a
1075/// hot path.
1076pub(crate) fn matrix_det(a: &Array2<f64>) -> f64 {
1077    let n = a.nrows();
1078    if n == 0 || a.ncols() != n {
1079        return 1.0;
1080    }
1081    let mut lu = a.clone();
1082    let mut det = 1.0_f64;
1083    for col in 0..n {
1084        // Partial pivot.
1085        let mut pivot = col;
1086        let mut best = lu[[col, col]].abs();
1087        for r in (col + 1)..n {
1088            let v = lu[[r, col]].abs();
1089            if v > best {
1090                best = v;
1091                pivot = r;
1092            }
1093        }
1094        if best == 0.0 {
1095            return 0.0;
1096        }
1097        if pivot != col {
1098            for c in 0..n {
1099                lu.swap([col, c], [pivot, c]);
1100            }
1101            det = -det;
1102        }
1103        det *= lu[[col, col]];
1104        for r in (col + 1)..n {
1105            let factor = lu[[r, col]] / lu[[col, col]];
1106            for c in col..n {
1107                lu[[r, c]] -= factor * lu[[col, c]];
1108            }
1109        }
1110    }
1111    det
1112}
1113
1114/// Cholesky factor `L` of a symmetric positive-definite matrix (`A = L Lᵀ`).
1115///
1116/// This is a *positive-definiteness* test, not a conditioning test: a genuine
1117/// SPD matrix with tiny eigenvalues (e.g. `[[1e-16]]`) must factor
1118/// successfully. A pivot is rejected only when it is non-finite or fails to be
1119/// strictly positive *relative to the matrix scale*. The floor
1120/// `GEOMETRY_EPS · max(1, trace(A)/n)` is the ambient scale of the matrix
1121/// multiplied by the relative machine-noise tolerance, so a positive pivot that
1122/// is merely small in absolute terms (but large relative to nothing — the whole
1123/// matrix is small) passes, while a zero, negative, or numerically-noise pivot
1124/// (indefinite / singular directions) is rejected.
1125///
1126/// Callers needing a *conditioning* margin (a lower bound on the smallest
1127/// eigenvalue) must check that separately; overloading this PD test with an
1128/// absolute `GEOMETRY_EPS` floor wrongly rejected well-formed small-scale SPD
1129/// points. No current caller (only `SpdManifold::matrix`, which validates SPD
1130/// membership) depends on a conditioning margin here.
1131pub(crate) fn cholesky_spd(a: &Array2<f64>) -> GeometryResult<Array2<f64>> {
1132    let n = a.nrows();
1133    if n != a.ncols() {
1134        return Err(GeometryError::InvalidPoint(
1135            "Cholesky requires square input",
1136        ));
1137    }
1138    // Scale-relative positive-definiteness floor. `trace(A)/n` is the mean
1139    // diagonal, which equals `mean(eigenvalues)` and is therefore the natural
1140    // scale of an SPD matrix's spectrum. The acceptance floor scales WITH the
1141    // matrix (it shrinks for tiny matrices), so a uniformly small but genuine
1142    // SPD matrix like `[[1e-16]]` — scale 1e-16, floor GEOMETRY_EPS·1e-16 =
1143    // 1e-28 — passes, while a pivot that has collapsed to numerical noise
1144    // relative to the matrix's own scale (the indefinite/singular directions)
1145    // is rejected. An absolute `GEOMETRY_EPS` floor would have wrongly rejected
1146    // such tiny SPD matrices; clamping the floor up to a constant would do the
1147    // same, so we deliberately let it shrink with the spectrum.
1148    let mut trace = 0.0_f64;
1149    for i in 0..n {
1150        trace += a[[i, i]];
1151    }
1152    if !trace.is_finite() {
1153        return Err(GeometryError::InvalidPoint(
1154            "matrix is not positive definite",
1155        ));
1156    }
1157    // Reference scale of the matrix's spectrum. The acceptance floor is this
1158    // scale times the relative tolerance, so a uniformly-tiny SPD matrix (small
1159    // scale) has a correspondingly tiny floor and still factors, while a pivot
1160    // that has collapsed to noise *relative to the matrix's own scale* (the
1161    // indefinite/singular case) is rejected.
1162    let scale = (trace / n as f64).abs().max(f64::MIN_POSITIVE);
1163    let scale_eps = GEOMETRY_EPS * scale;
1164    let mut l = Array2::<f64>::zeros((n, n));
1165    for i in 0..n {
1166        for j in 0..=i {
1167            let mut sum = a[[i, j]];
1168            for k in 0..j {
1169                sum -= l[[i, k]] * l[[j, k]];
1170            }
1171            if i == j {
1172                if !sum.is_finite() || sum <= scale_eps {
1173                    return Err(GeometryError::InvalidPoint(
1174                        "matrix is not positive definite",
1175                    ));
1176                }
1177                l[[i, j]] = sum.sqrt();
1178            } else {
1179                l[[i, j]] = sum / l[[j, j]];
1180            }
1181        }
1182    }
1183    Ok(l)
1184}
1185
1186#[cfg(test)]
1187mod cholesky_tests {
1188    use super::{GeometryError, cholesky_spd};
1189    use ndarray::Array2;
1190
1191    /// A genuine SPD matrix with a uniformly tiny spectrum (`[[1e-16]]`) must
1192    /// factor: the issue is positive-definiteness, not absolute scale. The old
1193    /// absolute `GEOMETRY_EPS` floor wrongly rejected it.
1194    #[test]
1195    fn cholesky_accepts_tiny_spd() {
1196        let mut a = Array2::<f64>::zeros((1, 1));
1197        a[[0, 0]] = 1.0e-16;
1198        let l = cholesky_spd(&a).expect("tiny positive 1x1 must be SPD");
1199        assert!((l[[0, 0]] - 1.0e-8).abs() <= 1.0e-16);
1200    }
1201
1202    /// A well-scaled SPD matrix factors and reproduces `L Lᵀ = A`.
1203    #[test]
1204    fn cholesky_accepts_well_scaled_spd() {
1205        // [[4, 2], [2, 3]] is SPD (eigenvalues ≈ 5.56, 1.44).
1206        let mut a = Array2::<f64>::zeros((2, 2));
1207        a[[0, 0]] = 4.0;
1208        a[[0, 1]] = 2.0;
1209        a[[1, 0]] = 2.0;
1210        a[[1, 1]] = 3.0;
1211        let l = cholesky_spd(&a).expect("well-scaled SPD must factor");
1212        let recon = l.dot(&l.t());
1213        for i in 0..2 {
1214            for j in 0..2 {
1215                assert!(
1216                    (recon[[i, j]] - a[[i, j]]).abs() <= 1.0e-12,
1217                    "L Lᵀ != A at ({i},{j})"
1218                );
1219            }
1220        }
1221    }
1222
1223    /// A zero pivot (singular) and an indefinite matrix must be rejected as not
1224    /// positive definite — the scale-relative floor still catches the genuine
1225    /// non-PD case.
1226    #[test]
1227    fn cholesky_rejects_zero_and_indefinite() {
1228        let zero = Array2::<f64>::zeros((1, 1));
1229        match cholesky_spd(&zero) {
1230            Err(GeometryError::InvalidPoint(_)) => {}
1231            other => panic!("expected non-PD rejection of zero pivot, got {other:?}"),
1232        }
1233        // [[1, 2], [2, 1]] has eigenvalues 3 and −1 (indefinite): the Schur
1234        // complement pivot 1 − 4 = −3 is negative.
1235        let mut indef = Array2::<f64>::zeros((2, 2));
1236        indef[[0, 0]] = 1.0;
1237        indef[[0, 1]] = 2.0;
1238        indef[[1, 0]] = 2.0;
1239        indef[[1, 1]] = 1.0;
1240        match cholesky_spd(&indef) {
1241            Err(GeometryError::InvalidPoint(_)) => {}
1242            other => panic!("expected non-PD rejection of indefinite matrix, got {other:?}"),
1243        }
1244    }
1245}
1246
1247#[cfg(test)]
1248mod qr_thin_tests {
1249    use super::qr_thin;
1250    use ndarray::Array2;
1251
1252    /// Two identical columns make the second residual vanish; the fallback axis
1253    /// must be Gram–Schmidted against the first accepted column so `QᵀQ = I`.
1254    /// The old behavior planted `e₂` directly, giving `q₁·q₂ = 1/√2`.
1255    #[test]
1256    fn qr_thin_duplicated_columns_orthonormal() {
1257        let mut a = Array2::<f64>::zeros((2, 2));
1258        // Both columns = (1, 1).
1259        a[[0, 0]] = 1.0;
1260        a[[1, 0]] = 1.0;
1261        a[[0, 1]] = 1.0;
1262        a[[1, 1]] = 1.0;
1263        let (q, r) = qr_thin(&a);
1264        // Deficient second column ⇒ R[1,1] = 0.
1265        assert!(
1266            r[[1, 1]].abs() <= 1.0e-14,
1267            "deficient column must set R[1,1]=0"
1268        );
1269        let gram = q.t().dot(&q);
1270        for i in 0..2 {
1271            for j in 0..2 {
1272                let want = if i == j { 1.0 } else { 0.0 };
1273                assert!(
1274                    (gram[[i, j]] - want).abs() <= 1.0e-12,
1275                    "QᵀQ != I at ({i},{j}): got {}",
1276                    gram[[i, j]]
1277                );
1278            }
1279        }
1280    }
1281
1282    /// A full-rank input still gives `QᵀQ = I` and reconstructs `A = QR`.
1283    #[test]
1284    fn qr_thin_full_rank_reconstructs() {
1285        let mut a = Array2::<f64>::zeros((3, 2));
1286        a[[0, 0]] = 1.0;
1287        a[[1, 0]] = 1.0;
1288        a[[2, 0]] = 0.0;
1289        a[[0, 1]] = 1.0;
1290        a[[1, 1]] = 0.0;
1291        a[[2, 1]] = 1.0;
1292        let (q, r) = qr_thin(&a);
1293        let gram = q.t().dot(&q);
1294        for i in 0..2 {
1295            for j in 0..2 {
1296                let want = if i == j { 1.0 } else { 0.0 };
1297                assert!(
1298                    (gram[[i, j]] - want).abs() <= 1.0e-12,
1299                    "QᵀQ != I at ({i},{j})"
1300                );
1301            }
1302        }
1303        let recon = q.dot(&r);
1304        for i in 0..3 {
1305            for j in 0..2 {
1306                assert!(
1307                    (recon[[i, j]] - a[[i, j]]).abs() <= 1.0e-12,
1308                    "QR != A at ({i},{j})"
1309                );
1310            }
1311        }
1312    }
1313}
1314
1315#[cfg(test)]
1316mod matrix_log_tests {
1317    use super::{matrix_exp, orthonormal_completion, skew_log_orthogonal};
1318    use ndarray::Array2;
1319
1320    /// `exp(skew_log_orthogonal(V)) = V` for a block-diagonal rotation built
1321    /// from two planes — including one with angle θ > π/2, which an
1322    /// `arcsin`-only scheme (no `cos θ` disambiguation) would get wrong.
1323    #[test]
1324    fn log_then_exp_recovers_rotation() {
1325        // 5×5 orthogonal: rotation by 2.3 rad in (0,1), by 0.4 rad in (2,3),
1326        // identity on axis 4.
1327        let mut v = Array2::<f64>::zeros((5, 5));
1328        let (c0, s0) = (2.3_f64.cos(), 2.3_f64.sin());
1329        let (c1, s1) = (0.4_f64.cos(), 0.4_f64.sin());
1330        v[[0, 0]] = c0;
1331        v[[0, 1]] = -s0;
1332        v[[1, 0]] = s0;
1333        v[[1, 1]] = c0;
1334        v[[2, 2]] = c1;
1335        v[[2, 3]] = -s1;
1336        v[[3, 2]] = s1;
1337        v[[3, 3]] = c1;
1338        v[[4, 4]] = 1.0;
1339        let s = skew_log_orthogonal(&v).expect("log of rotation");
1340        // S must be skew.
1341        for i in 0..5 {
1342            for j in 0..5 {
1343                assert!(
1344                    (s[[i, j]] + s[[j, i]]).abs() < 1e-12,
1345                    "log not skew at ({i},{j})"
1346                );
1347            }
1348        }
1349        let back = matrix_exp(&s).expect("exp of skew");
1350        let mut worst = 0.0_f64;
1351        for i in 0..5 {
1352            for j in 0..5 {
1353                worst = worst.max((back[[i, j]] - v[[i, j]]).abs());
1354            }
1355        }
1356        assert!(worst < 1e-10, "exp∘log != id for rotation: {worst:.3e}");
1357    }
1358
1359    /// A rotation by exactly π (eigenvalue −1) is the cut locus: the logarithm
1360    /// is not single-valued and must be refused, not silently dropped.
1361    #[test]
1362    fn log_refuses_pi_rotation() {
1363        // Rotation by π in the (0,1) plane: diag block [[-1,0],[0,-1]].
1364        let mut v = Array2::<f64>::zeros((3, 3));
1365        v[[0, 0]] = -1.0;
1366        v[[1, 1]] = -1.0;
1367        v[[2, 2]] = 1.0;
1368        assert!(
1369            skew_log_orthogonal(&v).is_err(),
1370            "π rotation must be refused as the cut locus"
1371        );
1372    }
1373
1374    /// Completing an `m×p` orthonormal block must yield an `SO(m)` matrix whose
1375    /// first `p` columns are the input, and a square (`p==m`) input must be
1376    /// returned untouched (never sign-flipped).
1377    #[test]
1378    fn completion_is_orthogonal_and_preserves_input() {
1379        // 4×2 orthonormal block.
1380        let mut cols = Array2::<f64>::zeros((4, 2));
1381        cols[[0, 0]] = 1.0;
1382        cols[[1, 1]] = 1.0;
1383        let full = orthonormal_completion(&cols);
1384        let gram = full.t().dot(&full);
1385        for i in 0..4 {
1386            for j in 0..4 {
1387                let want = if i == j { 1.0 } else { 0.0 };
1388                assert!((gram[[i, j]] - want).abs() < 1e-12, "not orthogonal");
1389            }
1390        }
1391        for j in 0..2 {
1392            for i in 0..4 {
1393                assert!(
1394                    (full[[i, j]] - cols[[i, j]]).abs() < 1e-14,
1395                    "input column changed"
1396                );
1397            }
1398        }
1399        // Square input with det −1 must be returned verbatim (no flip).
1400        let mut sq = Array2::<f64>::zeros((2, 2));
1401        sq[[0, 0]] = 1.0;
1402        sq[[1, 1]] = -1.0; // det = −1
1403        let out = orthonormal_completion(&sq);
1404        assert!(
1405            (out[[1, 1]] + 1.0).abs() < 1e-14,
1406            "square input was modified"
1407        );
1408    }
1409}
1410
1411#[cfg(test)]
1412mod jacobi_tests {
1413    use super::{GeometryError, jacobi_symmetric};
1414    use ndarray::Array2;
1415
1416    /// A large-norm SPD matrix has off-diagonal residuals after
1417    /// diagonalization that scale with `||A||_F`, so they sit far above the
1418    /// old *absolute* `1e-13` cutoff even when the decomposition is, in fact,
1419    /// fully converged. The relative threshold (`1e-13 * ||A||_F`) recognizes
1420    /// convergence here and returns the correct spectrum instead of grinding
1421    /// through `max_iter` sweeps and silently returning a partial diagonal.
1422    #[test]
1423    fn jacobi_converges_on_large_norm_spd() {
1424        // Q diag(1e8, 2e8, 3e8) Qᵀ for an orthogonal Q built from a planar
1425        // rotation in the (0,1) plane; eigenvalues are huge so the matrix
1426        // norm is ~1e8 and any absolute 1e-13 off-diagonal test is hopeless.
1427        let theta = 0.7_f64;
1428        let (c, s) = (theta.cos(), theta.sin());
1429        let mut q = Array2::<f64>::eye(3);
1430        q[[0, 0]] = c;
1431        q[[0, 1]] = -s;
1432        q[[1, 0]] = s;
1433        q[[1, 1]] = c;
1434        let lambda = [1.0e8_f64, 2.0e8, 3.0e8];
1435        let mut diag = Array2::<f64>::zeros((3, 3));
1436        for i in 0..3 {
1437            diag[[i, i]] = lambda[i];
1438        }
1439        let a = q.dot(&diag).dot(&q.t());
1440
1441        let (evals, evecs) = jacobi_symmetric(&a).expect("large-norm SPD must converge");
1442        let mut sorted: Vec<f64> = evals.to_vec();
1443        sorted.sort_by(|x, y| x.partial_cmp(y).unwrap());
1444        for (got, want) in sorted.iter().zip(lambda.iter()) {
1445            assert!(
1446                (got - want).abs() <= 1.0e-6 * want,
1447                "eigenvalue mismatch: got {got}, want {want}"
1448            );
1449        }
1450        // V diag(evals) Vᵀ must reconstruct A (relative to its scale).
1451        let mut diag_e = Array2::<f64>::zeros((3, 3));
1452        for i in 0..3 {
1453            diag_e[[i, i]] = evals[i];
1454        }
1455        let recon = evecs.dot(&diag_e).dot(&evecs.t());
1456        for i in 0..3 {
1457            for j in 0..3 {
1458                assert!(
1459                    (recon[[i, j]] - a[[i, j]]).abs() <= 1.0e-6 * 3.0e8,
1460                    "reconstruction mismatch at ({i},{j})"
1461                );
1462            }
1463        }
1464    }
1465
1466    /// A clustered/degenerate spectrum (two coincident eigenvalues) must still
1467    /// converge and reproduce the multiplicity. This guards against the
1468    /// relative threshold being so tight that ordinary near-degenerate SPD
1469    /// inputs trip the new non-convergence error.
1470    #[test]
1471    fn jacobi_handles_clustered_spectrum() {
1472        // diag(5, 5, 1) rotated in the (0,2) plane; the degenerate pair stays
1473        // degenerate under rotation.
1474        let theta = 0.4_f64;
1475        let (c, s) = (theta.cos(), theta.sin());
1476        let mut q = Array2::<f64>::eye(3);
1477        q[[0, 0]] = c;
1478        q[[0, 2]] = -s;
1479        q[[2, 0]] = s;
1480        q[[2, 2]] = c;
1481        let lambda = [5.0_f64, 5.0, 1.0];
1482        let mut diag = Array2::<f64>::zeros((3, 3));
1483        for i in 0..3 {
1484            diag[[i, i]] = lambda[i];
1485        }
1486        let a = q.dot(&diag).dot(&q.t());
1487
1488        let (evals, evecs) = jacobi_symmetric(&a).expect("clustered SPD must converge");
1489        let mut sorted: Vec<f64> = evals.to_vec();
1490        sorted.sort_by(|x, y| x.partial_cmp(y).unwrap());
1491        assert!((sorted[0] - 1.0).abs() <= 1.0e-12);
1492        assert!((sorted[1] - 5.0).abs() <= 1.0e-12);
1493        assert!((sorted[2] - 5.0).abs() <= 1.0e-12);
1494        // Eigenvectors must remain orthonormal even across the degenerate pair.
1495        let gram = evecs.t().dot(&evecs);
1496        for i in 0..3 {
1497            for j in 0..3 {
1498                let want = if i == j { 1.0 } else { 0.0 };
1499                assert!(
1500                    (gram[[i, j]] - want).abs() <= 1.0e-12,
1501                    "eigenvectors not orthonormal at ({i},{j})"
1502                );
1503            }
1504        }
1505    }
1506
1507    /// Non-convergence must now surface as `GeometryError::Singular` instead
1508    /// of a silently-returned partial diagonal. A symmetric input carrying a
1509    /// non-finite off-diagonal can never drive the largest off-diagonal
1510    /// magnitude below `1e-13 * ||A||_F` (the norm itself is non-finite), so
1511    /// the sweep exhausts `max_iter` and the solver must error rather than
1512    /// hand back the un-diagonalized matrix's diagonal.
1513    #[test]
1514    fn jacobi_errors_on_non_convergence() {
1515        let mut a = Array2::<f64>::eye(3);
1516        a[[0, 1]] = f64::NAN;
1517        a[[1, 0]] = f64::NAN;
1518        match jacobi_symmetric(&a) {
1519            Err(GeometryError::Singular(_)) => {}
1520            other => panic!("expected Singular non-convergence error, got {other:?}"),
1521        }
1522    }
1523}