1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
//! Shared prelude for the arrow-Schur solver: external imports re-exported
//! crate-wide, the module-level tuning constants, and the matvec function-
//! pointer type aliases. Every sibling concern module pulls these in through
//! `use super::*;`, preserving the single-namespace resolution the previous
//! `include!`-based layout relied on.
pub use ArrowSchurError;
pub use ArrowRowBlock;
pub use crate;
pub use crate;
pub use crate;
pub use crate;
pub use crateFingerprinter;
pub use Side;
pub use ;
pub use Range;
pub use Arc;
/// Size threshold on the shared dimension `K` associated with the direct solve.
///
/// NOTE(review): presumably the largest `K` for which the dense direct solve
/// is chosen over PCG — confirm against the dispatch site.
pub const DIRECT_SOLVE_MAX_K: usize = 2000;
/// Default iteration cap for the PCG solve.
pub const DEFAULT_PCG_MAX_ITERATIONS: usize = 200;
/// Default relative residual tolerance for the PCG solve.
///
/// NOTE(review): presumably the `rel_tol` factor in `tol = rel_tol · ‖rhs‖`
/// — confirm against the PCG loop.
pub const DEFAULT_PCG_RELATIVE_TOLERANCE: f64 = 1.0e-4;
/// Lower bound applied to the Steihaug-CG residual stopping threshold.
///
/// On its own the PCG stopping rule is relative, `tol = rel_tol · ‖rhs‖`. For
/// degenerate or near-stationary reduced systems `‖rhs‖` is tiny, and the
/// product can drop under the roundoff resolution of `metric_norm` (~1e-15 in
/// f64). The iteration would then declare convergence on floating-point noise
/// instead of on a genuinely accurate solution.
///
/// The floor of `1e-14` is above machine epsilon (~2.2e-16) and below any
/// practical single-iteration residual reduction: well-scaled problems never
/// hit it, while degenerate ones terminate cleanly.
pub const PCG_ABSOLUTE_TOLERANCE_FLOOR: f64 = 1.0e-14;
/// Default trust-region radius: positive infinity, i.e. no finite bound on the
/// step unless a caller supplies one.
pub const DEFAULT_TRUST_REGION_RADIUS: f64 = f64::INFINITY;
/// Default starting value of the proximal ridge, before any escalation.
///
/// NOTE(review): presumably multiplied by `DEFAULT_PROXIMAL_RIDGE_GROWTH` on
/// each failed attempt — confirm against the escalation loop.
pub const DEFAULT_PROXIMAL_INITIAL_RIDGE: f64 = 1.0e-8;
/// Unit roundoff of IEEE-754 single precision, held as an `f64`.
///
/// Half of the `f32` machine epsilon: `f32::EPSILON` is `2^-23`, so this is
/// exactly `2^-24` (≈ 5.96e-8). The widening cast and the halving are both
/// exact in `f64`.
pub const F32_UNIT_ROUNDOFF: f64 = (f32::EPSILON as f64) * 0.5;
/// Default cap on refinement passes in the mixed-precision path.
///
/// NOTE(review): presumably iterative-refinement steps — confirm at the use site.
pub const DEFAULT_MIXED_PRECISION_MAX_REFINEMENTS: usize = 6;
/// Default tolerance for the mixed-precision certificate.
///
/// NOTE(review): presumably compared against a backward-error estimate —
/// confirm at the use site.
pub const DEFAULT_MIXED_PRECISION_CERTIFICATE_TOLERANCE: f64 = 1.0e-11;
/// Default kappa margin for the mixed-precision gate.
///
/// NOTE(review): the gate itself is not visible in this file — confirm how
/// the margin enters the conditioning test.
pub const DEFAULT_MIXED_PRECISION_KAPPA_MARGIN: f64 = 0.5;
/// Multiplier on f64 epsilon that sets the floor of the backward-error
/// certificate; the floor is expressed as this small multiple of epsilon.
pub const MIXED_PRECISION_CERTIFICATE_EPSILON_MULTIPLIER: f64 = 64.0;
/// Ceiling on the kappa margin: a user-supplied margin above this value is no
/// stricter than the unit gate.
pub const MIXED_PRECISION_KAPPA_MARGIN_CEILING: f64 = 1.0;
/// Default geometric growth factor of the proximal ridge: a tenfold increase
/// per escalation.
pub const DEFAULT_PROXIMAL_RIDGE_GROWTH: f64 = 10.0;
/// How many geometric proximal-ridge escalations the adaptive correction tries
/// before it gives up.
///
/// The value was raised from 16 to 22. Starting at `1e-8` with tenfold growth,
/// the ridge can then reach `1e-8 · 10^21 = 1e13`.
/// NOTE(review): this comment previously quoted `~1e14` for that same product;
/// `1e14` would need 22 multiplications (`1e-8 · 10^22`). Confirm against the
/// escalation loop which count applies.
///
/// Why the extra decades matter: once the penalised Hessian curvature along
/// the gradient exceeds `~1e9`, the damped Newton step at ridge `1e9` still
/// overshoots. Allowing the ridge to keep growing lets the step length
/// collapse far enough to either find descent or hit the near-stationary
/// resolution floor that triggers the convergence exit. Only configurations
/// that would otherwise have failed pay for the additional attempts.
pub const DEFAULT_PROXIMAL_MAX_ATTEMPTS: usize = 22;
/// Default Armijo `c1` constant.
///
/// NOTE(review): presumably the sufficient-decrease fraction of a backtracking
/// line search — confirm at the use site.
pub const DEFAULT_ARMIJO_C1: f64 = 1.0e-4;
/// Default gradient tolerance.
///
/// NOTE(review): the norm it is compared against is not visible in this file
/// — confirm at the use site.
pub const DEFAULT_GRADIENT_TOLERANCE: f64 = 1.0e-10;
/// Relative resolution of the objective used by the proximal-correction
/// convergence exit.
///
/// If, across every ridge attempt, the best achievable change in the penalised
/// objective lies within `rel_tol · (|f| + 1)` of the incumbent value, the
/// damped Newton model has hit the floating-point resolution of the objective
/// and no further productive decrease exists.
///
/// `8e-12` is several decades above the `~2.2e-16` f64 epsilon, small enough
/// that genuine reductions of a well-scaled objective are not swallowed. It is
/// also comfortably above the rounding accumulated by the `O(N·M·p)`
/// reductions that form the objective, so a truly stationary state is
/// recognised instead of being chased into a spurious failure.
pub const DEFAULT_PROXIMAL_CONVERGENCE_REL_TOL: f64 = 8.0e-12;
/// Fingerprint value used for the Euclidean manifold mode.
///
/// NOTE(review): presumably mixed into cache keys so other manifold modes
/// never collide with the Euclidean one — confirm at the use site.
pub const EUCLIDEAN_MANIFOLD_MODE_FINGERPRINT: u64 = 0;
/// Byte budget of 256 MiB (268_435_456 bytes).
///
/// NOTE(review): presumably bounds the cached `H_tβ` data held by the arrow
/// factor cache — confirm at the use site.
pub const ARROW_FACTOR_CACHE_HTBETA_BUDGET_BYTES: usize = 256 << 20;
/// Matrix-free shared-block multiply for large BA/SAE Schur PCG.
///
/// The closure writes `out = H_ββ x` without the LM ridge. This is the hook
/// that lets SAE-manifold scale callers avoid materializing a dense `K × K`
/// shared block before Agarwal-style inexact Schur PCG.
// NOTE(review): the aliased type after `=` is missing in this copy of the
// file, so the alias does not parse. Restore the original right-hand side
// from version control; the closure signature cannot be inferred from here.
pub type SharedBetaMatvec =
;
/// Row-local matrix-free multiply.
///
/// NOTE(review): presumably the forward counterpart of the transpose multiply
/// `out += H_βt^(i) · v`, mapping a length-`K` shared vector to a per-row
/// latent vector — confirm against the alias's original definition.
// NOTE(review): the aliased type after `=` is missing in this copy of the
// file, so the alias does not parse. Restore the original right-hand side
// from version control.
pub type RowHtbetaMatvec =
;
/// Row-local matrix-free transpose multiply `out += H_βt^(i) · v` (length `K`).
///
/// This is the adjoint of [`RowHtbetaMatvec`]: it scatters a per-row latent
/// vector `v` (length `d_i`) back into the shared β gradient, **adding** its
/// contribution to `out`. For the SAE Kronecker form this is the sparse
/// `scatter_jbeta_t` over the row's active atoms — `O(m_i · p)` per row, the
/// per-row sparse apply that replaces the `O(K)` column-probe in the GPU and
/// streaming Schur matvec.
// NOTE(review): the aliased type after `=` is missing in this copy of the
// file, so the alias does not parse. Restore the original right-hand side
// from version control.
pub type RowHtbetaTransposeMatvec =
;
/// Builder hook for streaming arrow rows.
///
/// NOTE(review): presumably a callback that produces per-row blocks on demand
/// so the full row set need not be materialized — confirm against the alias's
/// original definition and its callers.
// NOTE(review): the aliased type after `=` is missing in this copy of the
// file, so the alias does not parse. Restore the original right-hand side
// from version control.
pub type StreamingArrowRowBuilder =
;
/// GPU-backed Schur matvec for CPU-driven PCG at K ≥ 5000.
///
/// The closure writes `out = S·x` where `S = H_ββ + ρ·I − Σ_i Y_i^T Y_i`
/// is the reduced shared system, with `Y_i = L_i^{-1} H_tβ^(i)` pre-computed
/// on device from the same forward kernel that Layer D uses for the dense Schur
/// build. The CPU-driven Steihaug-CG outer loop uploads `x` (K doubles),
/// receives `out` (K doubles), and handles the H_ββ contribution on the CPU side.
///
/// Constructed by `crate::gpu::kernels::arrow_schur::gpu_schur_matvec_backend` when
/// `cuda_selected()` and K ≥ 5000. The closure is `Send + Sync` so PCG callers
/// can hold it in an `Arc`.
// NOTE(review): the text above says both that the closure writes the full
// `S·x` (which includes `H_ββ`) and that the CPU side handles the `H_ββ`
// contribution. Confirm which terms the closure actually returns.
// NOTE(review): the aliased type after `=` is missing in this copy of the
// file, so the alias does not parse. Restore the original right-hand side
// from version control.
pub type GpuSchurMatvec = ;
/// Weights for the metric.
///
/// NOTE(review): presumably the per-coordinate weights behind `metric_norm`
/// — confirm against the alias's original definition.
// NOTE(review): the aliased type after `=` is missing in this copy of the
// file, so the alias does not parse. Restore the original right-hand side
// from version control.
pub type MetricWeights = ;