gam_solve/arrow_schur/
prelude.rs

1//! Shared prelude for the arrow-Schur solver: external imports re-exported
2//! crate-wide, the module-level tuning constants, and the matvec function-
3//! pointer type aliases. Every sibling concern module pulls these in through
4//! `use super::*;`, preserving the single-namespace resolution the previous
5//! `include!`-based layout relied on.
6
7pub(crate) use super::reduced_solve::ArrowSchurError;
8pub(crate) use super::system::ArrowRowBlock;
9pub(crate) use gam_linalg::faer_ndarray::{FaerArrayView, FaerEigh, FaerLlt};
10pub(crate) use gam_linalg::triangular::{
11    cholesky_solve_matrix, cholesky_solve_vector, forward_substitution_lower_matrix,
12};
13pub(crate) use gam_terms::analytic_penalties::{
14    AnalyticPenaltyKind, AnalyticPenaltyRegistry, PenaltyTier,
15};
16pub(crate) use gam_terms::latent::{LatentCoordValues, LatentManifold};
17pub(crate) use faer::Side;
18pub(crate) use gam_runtime::warm_start::Fingerprinter;
19pub(crate) use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
20pub(crate) use std::ops::Range;
21pub(crate) use std::sync::Arc;
22
23pub(crate) const DIRECT_SOLVE_MAX_K: usize = 2_000;
24
25pub(crate) const DEFAULT_PCG_MAX_ITERATIONS: usize = 200;
26
27pub(crate) const DEFAULT_PCG_RELATIVE_TOLERANCE: f64 = 1e-4;
28
29/// Absolute floor on the Steihaug-CG residual stopping threshold.
30///
31/// The native PCG criterion is purely relative: `tol = rel_tol · ‖rhs‖`. When
32/// `‖rhs‖` is tiny (degenerate / near-stationary reduced systems) this product
33/// can fall below the roundoff resolution of `metric_norm` (~1e-15 for f64),
34/// so the loop would "converge" on floating-point noise rather than a genuinely
35/// accurate solution. Floor the threshold at 1e-14: above machine epsilon
36/// (~2.2e-16) yet below any practical single-iteration residual reduction, so
37/// well-scaled problems are unaffected while degenerate ones stop cleanly.
38pub(crate) const PCG_ABSOLUTE_TOLERANCE_FLOOR: f64 = 1e-14;
39
40pub(crate) const DEFAULT_TRUST_REGION_RADIUS: f64 = f64::INFINITY;
41
42pub const DEFAULT_PROXIMAL_INITIAL_RIDGE: f64 = 1e-8;
43
44pub(crate) const F32_UNIT_ROUNDOFF: f64 = (f32::EPSILON as f64) * 0.5;
45
46pub(crate) const DEFAULT_MIXED_PRECISION_MAX_REFINEMENTS: usize = 6;
47
48pub(crate) const DEFAULT_MIXED_PRECISION_CERTIFICATE_TOLERANCE: f64 = 1e-11;
49
50pub(crate) const DEFAULT_MIXED_PRECISION_KAPPA_MARGIN: f64 = 0.5;
51
52/// Backward-error certificate floor, expressed as a small multiple of f64 epsilon.
53pub(crate) const MIXED_PRECISION_CERTIFICATE_EPSILON_MULTIPLIER: f64 = 64.0;
54
55/// User-supplied kappa margins above this are no stricter than the unit gate.
56pub(crate) const MIXED_PRECISION_KAPPA_MARGIN_CEILING: f64 = 1.0;
57pub const DEFAULT_PROXIMAL_RIDGE_GROWTH: f64 = 10.0;
58
59/// Number of geometric proximal-ridge escalations the adaptive correction
60/// attempts before giving up. Raised from 16 to 22 so the ridge can climb from
61/// `1e-8` to `~1e14` (`1e-8 · 10^21`): when the penalised Hessian curvature
62/// along the gradient exceeds `~1e9`, the damped Newton step at ridge `1e9`
63/// still overshoots, and the extra decades let the step length collapse far
64/// enough to either find descent or reach the near-stationary resolution floor
65/// that triggers the convergence exit. The cost of the extra attempts is paid
66/// only on configs that would otherwise have failed.
67pub const DEFAULT_PROXIMAL_MAX_ATTEMPTS: usize = 22;
68
69pub(crate) const DEFAULT_ARMIJO_C1: f64 = 1e-4;
70
71pub(crate) const DEFAULT_GRADIENT_TOLERANCE: f64 = 1e-10;
72
73/// Relative objective resolution for the proximal-correction convergence exit.
74///
75/// When the best achievable change in the penalised objective across all ridge
76/// attempts is within `rel_tol · (|f| + 1)` of the incumbent value, the damped
77/// Newton model has reached the floating-point resolution of the objective and
78/// no further productive decrease exists. `8e-12` sits a few decades above the
79/// `~2.2e-16` f64 epsilon (so genuine reductions of a well-scaled objective are
80/// never swallowed) yet comfortably above the accumulated rounding of the
81/// `O(N·M·p)` reductions that form the objective, so a truly stationary state
82/// is recognised rather than chased into a spurious failure.
83pub(crate) const DEFAULT_PROXIMAL_CONVERGENCE_REL_TOL: f64 = 8e-12;
84
85pub(crate) const EUCLIDEAN_MANIFOLD_MODE_FINGERPRINT: u64 = 0;
86
87pub(crate) const ARROW_FACTOR_CACHE_HTBETA_BUDGET_BYTES: usize = 256 * 1024 * 1024;
88
89/// Matrix-free shared-block multiply for large BA/SAE Schur PCG.
90///
91/// The closure writes `out = H_ββ x` without the LM ridge. This is the hook
92/// that lets SAE-manifold scale callers avoid materializing a dense `K × K`
93/// shared block before Agarwal-style inexact Schur PCG.
94pub type SharedBetaMatvec =
95    Arc<dyn for<'a> Fn(ArrayView1<'a, f64>, &mut Array1<f64>) + Send + Sync>;
96
97pub type RowHtbetaMatvec =
98    Arc<dyn for<'a> Fn(usize, ArrayView1<'a, f64>, &mut Array1<f64>) + Send + Sync>;
99
100/// Row-local matrix-free transpose multiply `out += H_βt^(i) · v` (length `K`).
101///
102/// This is the adjoint of [`RowHtbetaMatvec`]: it scatters a per-row latent
103/// vector `v` (length `d_i`) back into the shared β gradient, **adding** its
104/// contribution to `out`. For the SAE Kronecker form this is the sparse
105/// `scatter_jbeta_t` over the row's active atoms — `O(m_i · p)` per row, the
106/// per-row sparse apply that replaces the `O(K)` column-probe in the GPU and
107/// streaming Schur matvec.
108pub type RowHtbetaTransposeMatvec =
109    Arc<dyn for<'a> Fn(usize, ArrayView1<'a, f64>, &mut Array1<f64>) + Send + Sync>;
110
111pub type StreamingArrowRowBuilder =
112    Arc<dyn Fn(usize) -> Result<ArrowRowBlock, ArrowSchurError> + Send + Sync>;
113
114/// GPU-backed Schur matvec for CPU-driven PCG at K ≥ 5000.
115///
116/// The closure writes `out = S·x` where `S = H_ββ + ρ·I − Σ_i Y_i^T Y_i`
117/// is the reduced shared system, with `Y_i = L_i^{-1} H_tβ^(i)` pre-computed
118/// on device from the same forward kernel that Layer D uses for the dense Schur
119/// build. The CPU-driven Steihaug-CG outer loop uploads `x` (K doubles),
120/// receives `out` (K doubles), and handles the H_ββ contribution on the CPU side.
121///
122/// Constructed by `crate::gpu_kernels::arrow_schur::gpu_schur_matvec_backend` when
123/// `cuda_selected()` and K ≥ 5000. The closure is `Send + Sync` so PCG callers
124/// can hold it in an `Arc`.
125pub type GpuSchurMatvec = Arc<dyn Fn(&Array1<f64>, &mut Array1<f64>) + Send + Sync>;
126
127pub(crate) type MetricWeights = [f64];
gam_solve/arrow_schur/prelude.rs

gam_solve/arrow_schur/
prelude.rs