gam_gpu/policy.rs
1use serde::{Deserialize, Serialize};
2
3#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
4pub enum GpuMixedPrecisionPolicy {
5 /// Always use fp64 factorization; no refinement attempted.
6 Off,
7 /// Attempt fp32 Cholesky factorization followed by up to
8 /// `REFINEMENT_MAX_STEPS` fp64-residual refinement steps. Policy admits
9 /// the attempt only when `p ≥ REFINEMENT_MIN_P` (so that the fp64 GEMV
10 /// overhead is amortized) and the measured residual drops monotonically.
11 /// Falls back to fp64 factorization automatically when the residual does
12 /// not decrease (κ(A)·u ≥ 1 regime) or when the fp32 POTRF itself fails.
13 Refinement,
14 /// Always use fp64 factorization; equivalent to `Off` but signals that
15 /// an explicit policy decision was taken.
16 Never,
17}
18
19#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
20pub struct GpuDispatchPolicy {
21 pub xtwx_n_min: usize,
22 pub xtwx_flops_min: usize,
23 pub xtwx_use_fused_below_p: usize,
24 pub gemm_min_flops: usize,
25 pub potrf_min_p: usize,
26 pub small_dense_batched_potrf_max_p: usize,
27 pub small_dense_batched_potrf_min_batch: usize,
28 pub syevd_min_p: usize,
29 pub sparse_min_nnz: usize,
30 pub fused_kernel_min_n: usize,
31 pub keep_design_resident_min_bytes: usize,
32 pub prefer_gpu_factorization_min_p: usize,
33 pub row_kernel_min_n: usize,
34 pub mixed_precision: GpuMixedPrecisionPolicy,
35}
36
37impl Default for GpuDispatchPolicy {
38 /// Conservative seed thresholds used before device calibration and when
39 /// calibration cannot run on the current host.
40 ///
41 /// The production runtime replaces these with
42 /// [`crate::calibration::calibrated_policy_for_device`] after the CUDA
43 /// probe selects a concrete device. Keep these values conservative: they
44 /// are the typed baseline for CPU-only builds, failed calibration, and unit
45 /// tests that exercise policy predicates without initializing CUDA.
46 fn default() -> Self {
47 Self {
48 xtwx_n_min: 50_000,
49 xtwx_flops_min: 100_000_000,
50 xtwx_use_fused_below_p: 256,
51 gemm_min_flops: 100_000_000,
52 potrf_min_p: 512,
53 small_dense_batched_potrf_max_p: 32,
54 small_dense_batched_potrf_min_batch: 8,
55 syevd_min_p: 256,
56 sparse_min_nnz: 1_000_000,
57 fused_kernel_min_n: 100_000,
58 keep_design_resident_min_bytes: 32 * 1024 * 1024,
59 prefer_gpu_factorization_min_p: 512,
60 row_kernel_min_n: 50_000,
61 mixed_precision: GpuMixedPrecisionPolicy::Refinement,
62 }
63 }
64}
65
66impl GpuDispatchPolicy {
67 /// Minimum problem dimension for the fp32+refinement path.
68 ///
69 /// Below this threshold the fp64 GEMV needed for the residual check costs
70 /// more than the savings from fp32 factorization. The threshold is set so
71 /// that a single `p × p` DGEMV (2p² flops) is at least 10× cheaper than
72 /// the `p³/3` POTRF (i.e. p ≥ 64) while still leaving margin for the
73 /// POTRF/POTRS launches. In practice `p ≥ 64` matches the existing
74 /// `potrf_min_p = 512` floor for GPU dispatch, so the refinement path only
75 /// activates when the GPU factorization path is already chosen.
76 pub const REFINEMENT_MIN_P: usize = 64;
77
78 /// Maximum number of fp32-correction steps per solve.
79 ///
80 /// Two steps suffice for κ(A) ≤ 10⁵ at fp32 (u ≈ 6 × 10⁻⁸): after step
81 /// 1 the error is O(κ u)² ≈ 10⁻⁶, after step 2 it is O(κ u)⁴ ≈ 10⁻¹²,
82 /// which is well within the fp64 unit roundoff of 10⁻¹⁶ × κ. A cap of 3
83 /// is used defensively.
84 pub const REFINEMENT_MAX_STEPS: usize = 3;
85
86 /// Relative residual tolerance for declaring convergence.
87 ///
88 /// `‖r‖ / ‖b‖ ≤ tol` is considered a converged solve. 10⁻¹² is two
89 /// orders of magnitude above the fp64 machine epsilon times a moderate
90 /// condition number, leaving the policy conservative.
91 pub const REFINEMENT_TOL: f64 = 1e-12;
92
93 /// Return `true` when the policy and problem size together suggest that
94 /// attempting fp32 factorization + iterative refinement will be profitable.
95 ///
96 /// The predicate is conservative:
97 /// * `GpuMixedPrecisionPolicy::Off` or `Never` → always `false`.
98 /// * `Refinement` with `p < REFINEMENT_MIN_P` → `false` (GEMV overhead
99 /// not amortised by fp32 POTRF savings below this threshold).
100 /// * Otherwise `true`; the caller still falls back to fp64 factorization
101 /// when the runtime fp32 POTRF fails or when the measured residual is
102 /// non-monotone.
103 #[inline]
104 pub const fn iterative_refinement_should_attempt(&self, p: usize) -> bool {
105 match self.mixed_precision {
106 GpuMixedPrecisionPolicy::Off | GpuMixedPrecisionPolicy::Never => false,
107 GpuMixedPrecisionPolicy::Refinement => p >= Self::REFINEMENT_MIN_P,
108 }
109 }
110
111 pub const fn dense_gemv_target_is_gpu(&self, n: usize, p: usize, resident: bool) -> bool {
112 resident || n.saturating_mul(p).saturating_mul(2) >= self.gemm_min_flops
113 }
114
115 pub const fn xtwx_target_is_gpu(&self, n: usize, p: usize, materialized: bool) -> bool {
116 materialized && n > 0 && p > 0 && self.xtwx_flops(n, p) >= self.dense_reduction_flops_min()
117 }
118
119 pub const fn xtwy_target_is_gpu(
120 &self,
121 n: usize,
122 px: usize,
123 q: usize,
124 materialized: bool,
125 ) -> bool {
126 materialized
127 && n > 0
128 && px > 0
129 && q > 0
130 && self.xtwy_flops(n, px, q) >= self.dense_reduction_flops_min()
131 }
132
133 pub const fn potrf_target_is_gpu(&self, p: usize, h_resident: bool) -> bool {
134 h_resident && p >= self.potrf_min_p
135 }
136
137 pub const fn dense_hessian_work_target_is_gpu(&self, n: usize, p: usize) -> bool {
138 n > 0
139 && p >= Self::DEVICE_LOOP_MIN_P
140 && self.xtwx_flops(n, p) >= self.dense_reduction_flops_min()
141 }
142
143 const fn dense_reduction_flops_min(&self) -> u128 {
144 if self.xtwx_flops_min < self.gemm_min_flops {
145 self.xtwx_flops_min as u128
146 } else {
147 self.gemm_min_flops as u128
148 }
149 }
150
151 const fn xtwx_flops(&self, n: usize, p: usize) -> u128 {
152 2u128 * (n as u128) * (p as u128) * (p as u128)
153 }
154
155 const fn xtwy_flops(&self, n: usize, px: usize, q: usize) -> u128 {
156 2u128 * (n as u128) * (px as u128) * (q as u128)
157 }
158
159 /// Minimum total CG-amortised matvec flops below which the host↔device
160 /// transfer of the row frames + CG vectors is not repaid by the device
161 /// matvec, so the reduced-Schur PCG hot loop stays on the CPU.
162 ///
163 /// The dense-Direct path keys on `dense_reduction_flops_min` (a single big
164 /// factorization). The matrix-free SAE matvec is different: no single apply
165 /// trips that floor (each is a stack of `n` tiny `d×d` solves + sparse
166 /// `m·k` gather/scatter), but the *whole CG solve* runs the apply
167 /// `O(cg_iters)` times over the same resident frames. The device wins when
168 /// the **summed** matvec work over the solve exceeds the one-time staging
169 /// cost — so the gate keys on `cg_iters · per_apply_flops`, not one apply.
170 ///
171 /// Set one order of magnitude below the dense floor: the matvec frames stay
172 /// resident across CG iterations (uploaded once), so the per-flop transfer
173 /// amortization is `1/cg_iters` of a cold dense launch, and the breakeven
174 /// drops accordingly.
175 pub const MATVEC_OFFLOAD_FLOPS_MIN: u128 = 10_000_000;
176
177 /// Conservative seed for the reduced-Schur PCG iteration count when the
178 /// caller cannot supply a measured budget. InexactPCG on an SAE β-block of
179 /// width `k` converges in `O(√κ)` iterations; this floor keeps the work
180 /// estimate honest (≥ this many applies) without over-claiming a tight
181 /// solve. Used only to amortise the staging cost in the work estimate.
182 pub const MATVEC_OFFLOAD_MIN_CG_ITERS: usize = 8;
183
184 /// Per-apply flop estimate for one reduced-Schur matvec `S·x` of a
185 /// matrix-free SAE Kronecker system, as a pure function of the system shape.
186 ///
187 /// Per row block `i` the apply does: a forward cross-block GEMV
188 /// `v_i = H_tβ^(i)·x` (`≈ 2·d·k` multiply-adds, with the per-row latent
189 /// depth `d` as the M-frame width and `k` the border), a `d×d` triangular
190 /// solve through the cached Cholesky factor (`≈ d²`), and a transpose
191 /// cross-block GEMV `H_βt^(i)·w_i` (`≈ 2·d·k`). The two `2·d·k` GEMVs would
192 /// sum to `4·d·k`; this estimate deliberately undercounts to a single
193 /// `2·d·k` cross term as a conservative (lower-bound) admission floor, so
194 /// the apply is modelled as `≈ n·(2·d·k + d²)`. This is a deliberate
195 /// lower bound on the true `≈ n·(4·d·k + d²)` arithmetic — admitting a
196 /// shape under the smaller figure can only be more conservative, never
197 /// over-eager. It is keyed on the *frame depth* `d` (M) and border width
198 /// `k` (p), not row count alone, so LLM shapes (few rows, wide `k`, modest
199 /// `d`) register arithmetic the row-count gate misses.
200 ///
201 /// USE FOR DISPATCH GATING ONLY. This is **not** a flop count: it omits the
202 /// transpose cross-block GEMV (`2·d·k`), so it is a strict lower bound on the
203 /// true per-apply work `n·(4·d·k + d²)`. The gate can therefore only
204 /// under-admit, never over-admit. Do not reuse it for benchmark / speedup
205 /// accounting.
206 const fn admission_work_lower_bound(n: usize, k: usize, d: usize) -> u128 {
207 let n = n as u128;
208 let k = k as u128;
209 let d = d as u128;
210 // 2·d·k cross-block apply (forward only) + d² per-row solve — the
211 // transpose GEMV is intentionally dropped so this stays a lower bound.
212 n.saturating_mul(
213 2u128
214 .saturating_mul(d)
215 .saturating_mul(k)
216 .saturating_add(d * d),
217 )
218 }
219
220 /// Work-based admission for offloading the **reduced-Schur PCG matvec**
221 /// (the InexactPCG hot loop for matrix-free SAE β-blocks) to the device.
222 ///
223 /// This is the Phase-1 (#1017) re-keying: the dense gates key on row count
224 /// (`xtwx_n_min`, `row_kernel_min_n` at 50k) or a single big-factorization
225 /// flop floor, neither of which the SAE LLM shape trips — `(n≈2000) ×
226 /// (k≈2048) × (d≈8)` is *thousands of small dense ops*, no single op large,
227 /// so the row-count gate keeps the whole fit on one CPU core. Here the gate
228 /// is the **total batched work over the CG solve**:
229 ///
230 /// ```text
231 /// estimated_device_flops = cg_iters · per_apply_flops(n, k, d)
232 /// should_offload = estimated_device_flops ≥ T_breakeven
233 /// ```
234 ///
235 /// where `T_breakeven = MATVEC_OFFLOAD_FLOPS_MIN` accounts for the
236 /// host↔device staging of the row frames + CG vectors amortised over the
237 /// `cg_iters` applies that reuse the resident frames (so the per-flop
238 /// transfer cost is `1/cg_iters` of a cold launch, an order of magnitude
239 /// below the dense-Direct floor).
240 ///
241 /// Pure function of the shape: no device needed to evaluate, so it is unit-
242 /// testable. The caller still falls back to the bit-identical CPU matvec
243 /// whenever the backend build declines, so admitting a shape never changes
244 /// the numerics — only where the `Σ_i Y_iᵀ(Y_i x)` flops execute.
245 ///
246 /// * `n` — number of row blocks (SAE observations / latent rows).
247 /// * `k` — border β width (the SAE decoder atom count `K`).
248 /// * `d` — per-row latent / active-frame depth (the M dimension).
249 /// * `cg_iters` — expected PCG iteration budget; the per-apply work is
250 /// multiplied by this because the frames stay resident across iterations.
251 /// Pass [`Self::MATVEC_OFFLOAD_MIN_CG_ITERS`] when no measured budget is
252 /// available; a tighter (smaller) value only makes the gate stricter.
253 ///
254 /// ## Live arrow-Schur call site
255 ///
256 /// `crate::solver::arrow_schur::maybe_inject_gpu_schur_matvec` gates the
257 /// InexactPCG reduced-Schur matvec injection on this predicate:
258 /// `reduced_schur_matvec_should_offload(sys.rows.len(), sys.k, sys.d,
259 /// options.pcg.max_iterations.min(options.trust_region.max_iterations))`,
260 /// where `sys.d` is the system's max per-row latent depth and the iteration
261 /// budget is the same `max_iterations` the PCG loop launches with.
262 /// `try_device_arrow_direct` (the **dense** Direct point solve) correctly
263 /// keeps `dense_hessian_work_target_is_gpu`: that path is a single large
264 /// factorization, not the amortised matvec.
265 pub const fn reduced_schur_matvec_should_offload(
266 &self,
267 n: usize,
268 k: usize,
269 d: usize,
270 cg_iters: usize,
271 ) -> bool {
272 if n == 0 || k == 0 || d == 0 || cg_iters == 0 {
273 return false;
274 }
275 // The border width must clear the device-loop floor: below it the per-
276 // apply launch latency (one kernel sequence per matvec) dominates any
277 // arithmetic regardless of how many CG iterations run.
278 if k < Self::DEVICE_LOOP_MIN_P {
279 return false;
280 }
281 let per_apply = Self::admission_work_lower_bound(n, k, d);
282 let total = per_apply.saturating_mul(cg_iters as u128);
283 total >= Self::MATVEC_OFFLOAD_FLOPS_MIN
284 }
285}
286
287/// Factorization strategy for the arrow-Schur border (shared `β`) solve, chosen
288/// from the *shape* of the joint system rather than a single fixed border-width
289/// cut (`ArrowSolverMode::automatic`'s `DIRECT_SOLVE_MAX_K = 2000`).
290///
291/// The border width alone is a blunt selector: it cannot see that the data-fit
292/// contribution to the `k × k` border is only rank `Σ_i d_i ≈ n·d`. For the
293/// #1017 color arm (`n = 180`, per-row depth `d = 2`, border `k = 15360`) the
294/// data information is rank `360` yet a dense Direct solve pays a full `k³/3 ≈
295/// 1.2e12`-flop Cholesky — the measured 26-min-class fit. This maps cleanly onto
296/// the two `ArrowSolverMode` variants the solver already implements.
297#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
298pub enum ArrowBorderStrategy {
299 /// Eliminate the per-row blocks, form the dense `k × k` reduced Schur, and
300 /// Cholesky-factor it (`ArrowSolverMode::Direct`). Appropriate for modest,
301 /// near-square borders where the `k³/3` factorization is cheap and the
302 /// data-fit rank is comparable to `k`.
303 DenseDirect,
304 /// Solve the reduced Schur iteratively by matrix-free PCG
305 /// (`ArrowSolverMode::InexactPCG`), never materialising the `k × k` factor.
306 /// Appropriate when the dense `k³` factorization dominates and/or the
307 /// data-fit contribution to the border is rank-deficient (`n·d < k`).
308 ReducedIterative,
309}
310
311/// Cost model + recommendation for the arrow-Schur border solve, a pure function
312/// of the joint-system shape (unit-testable, no device required).
313///
314/// This operationalises the measured #1017 finding that the full arrow-Schur
315/// Newton solve is dominated by the dense `k × k` border Cholesky (the on-device
316/// dense Direct solve was measured at ~0.94× — a slowdown — because the `k³/3`
317/// factorization, not the GPU-favourable batched per-row work, is the bottleneck
318/// at LLM/SAE border widths). The lever the issue calls for is to *shrink or
319/// factor the dense border* so the batched `n`-row work dominates; the plan
320/// makes that decision inspectable and honest.
321///
322/// ## Flop model (deliberate, documented approximations)
323///
324/// * **Dense Direct** ≈ `2·n·d·k²` (assemble the reduced Schur: per row a
325/// rank-`d` symmetric update `H_βt (H_tt)⁻¹ H_tβ` to the `k × k` border,
326/// `≈ 2·d·k²` flops) `+ k³/3` (Cholesky of the dense `k × k` Schur).
327/// * **Reduced iterative** ≈ `cg_iters · n·(4·d·k + d²)` (matrix-free PCG:
328/// per matvec a forward + transpose cross-block GEMV `4·d·k` plus the per-row
329/// `d × d` solve `d²`, summed over `n` row blocks, over `cg_iters` applies).
330///
331/// Both are dispatch-grade estimates, not exact operation counts; they omit
332/// preconditioner setup and lower-order terms symmetrically, so their ratio (the
333/// only thing the recommendation consumes) is meaningful while neither figure
334/// should be reused for speedup accounting.
335///
336/// ## Status
337///
338/// Advisory / diagnostic. It is **not** wired into the live
339/// `ArrowSolverMode::automatic` selector: replacing the fixed `DIRECT_SOLVE_MAX_K`
340/// cut with this shape-driven crossover changes which production fits take the
341/// Direct vs PCG path and must be validated on GPU hardware (#1017 Phase 2–4)
342/// before it can change numerics. Today it is consumed by the honest
343/// `examples/full_color_fit_1017.rs` measurement harness (modeled-vs-measured)
344/// and by the unit tests below.
345#[derive(Clone, Copy, Debug, Eq, PartialEq)]
346pub struct ArrowBorderSolvePlan {
347 /// Number of per-row blocks (SAE observations / latent rows).
348 pub n: usize,
349 /// Border `β` width (the SAE decoder atom count `K` × basis width).
350 pub k: usize,
351 /// Per-row latent / active-frame depth (the `M` dimension).
352 pub d: usize,
353 /// CG iteration budget assumed for the iterative estimate.
354 pub cg_iters: usize,
355 /// Effective rank of the data-fit contribution to the `k × k` border,
356 /// bounded by `Σ_i d_i ≈ n·d` and never more than `k`.
357 pub data_fit_rank: usize,
358 /// True when `n·d < k`: the dense `k × k` Cholesky spends `O(k³)` factorising
359 /// a border whose data information is only rank `n·d` — the pathological
360 /// wide-sparse-border regime (color arm: `n·d = 360 ≪ k = 15360`).
361 pub dense_border_rank_deficient: bool,
362 /// `≈ 2·n·d·k² + k³/3` — reduced-Schur assembly plus dense border Cholesky.
363 pub dense_direct_flops: u128,
364 /// `≈ cg_iters · n·(4·d·k + d²)` — matrix-free PCG matvecs.
365 pub reduced_iterative_flops: u128,
366 /// The recommended strategy: `ReducedIterative` iff the dense factorization
367 /// path costs strictly more arithmetic than the iterative path at
368 /// `cg_iters`.
369 pub recommended: ArrowBorderStrategy,
370 /// Whether running the *recommended* strategy on the device is expected to
371 /// pay off. For `ReducedIterative` this is `reduced_schur_matvec_should_offload`;
372 /// for `DenseDirect` the device wins only when the batched per-row assembly
373 /// work (`2·n·d·k²`, GPU-favourable batched GEMM/POTRF) at least matches the
374 /// border Cholesky (`k³/3`) *and* clears the dense flop floor — the honest
375 /// encoding of the measured 0.94× dense-Direct-on-device slowdown.
376 pub device_favorable: bool,
377}
378
379impl GpuDispatchPolicy {
380 /// Assembly flops for the dense reduced Schur: per row a rank-`d` update to
381 /// the `k × k` border (`≈ 2·d·k²`), summed over `n` rows.
382 const fn dense_schur_assembly_flops(n: usize, k: usize, d: usize) -> u128 {
383 2u128
384 .saturating_mul(n as u128)
385 .saturating_mul(d as u128)
386 .saturating_mul((k as u128).saturating_mul(k as u128))
387 }
388
389 /// Cholesky flops for the dense `k × k` reduced Schur: `≈ k³/3`.
390 const fn dense_border_cholesky_flops(k: usize) -> u128 {
391 let k = k as u128;
392 k.saturating_mul(k).saturating_mul(k) / 3
393 }
394
395 /// Total matrix-free PCG flops: `cg_iters · n·(4·d·k + d²)`.
396 const fn reduced_iterative_flops(n: usize, k: usize, d: usize, cg_iters: usize) -> u128 {
397 let n = n as u128;
398 let k = k as u128;
399 let d = d as u128;
400 let per_apply = n.saturating_mul(
401 4u128
402 .saturating_mul(d)
403 .saturating_mul(k)
404 .saturating_add(d.saturating_mul(d)),
405 );
406 per_apply.saturating_mul(cg_iters as u128)
407 }
408
409 /// Build the shape-driven [`ArrowBorderSolvePlan`] for a joint arrow-Schur
410 /// system with `n` row blocks, border width `k`, per-row depth `d`, and an
411 /// assumed CG budget `cg_iters` (pass
412 /// [`Self::MATVEC_OFFLOAD_MIN_CG_ITERS`] when none is measured; a smaller
413 /// value only biases the recommendation toward `DenseDirect`, never the
414 /// reverse).
415 ///
416 /// Degenerate shapes (`n`, `k`, or `d` zero) return an all-zero plan
417 /// recommending `DenseDirect` (the trivial/empty solve stays on the simple
418 /// path) with `device_favorable = false`.
419 pub fn arrow_border_solve_plan(
420 &self,
421 n: usize,
422 k: usize,
423 d: usize,
424 cg_iters: usize,
425 ) -> ArrowBorderSolvePlan {
426 if n == 0 || k == 0 || d == 0 {
427 return ArrowBorderSolvePlan {
428 n,
429 k,
430 d,
431 cg_iters,
432 data_fit_rank: 0,
433 dense_border_rank_deficient: false,
434 dense_direct_flops: 0,
435 reduced_iterative_flops: 0,
436 recommended: ArrowBorderStrategy::DenseDirect,
437 device_favorable: false,
438 };
439 }
440
441 let assembly = Self::dense_schur_assembly_flops(n, k, d);
442 let border_chol = Self::dense_border_cholesky_flops(k);
443 let dense_direct_flops = assembly.saturating_add(border_chol);
444 let iters = if cg_iters == 0 { 1 } else { cg_iters };
445 let reduced_iterative_flops = Self::reduced_iterative_flops(n, k, d, iters);
446
447 let data_fit_rank = (n.saturating_mul(d)).min(k);
448 let dense_border_rank_deficient = n.saturating_mul(d) < k;
449
450 let recommended = if dense_direct_flops > reduced_iterative_flops {
451 ArrowBorderStrategy::ReducedIterative
452 } else {
453 ArrowBorderStrategy::DenseDirect
454 };
455
456 let device_favorable = match recommended {
457 ArrowBorderStrategy::ReducedIterative => {
458 self.reduced_schur_matvec_should_offload(n, k, d, iters)
459 }
460 ArrowBorderStrategy::DenseDirect => {
461 // Dense Direct wins on device only when the batched per-row
462 // assembly work dominates the (poorly GPU-scaling, and here
463 // rank-deficient) border Cholesky, and the total clears the
464 // dense reduction floor. This is the honest encoding of the
465 // measured 0.94× on-device dense-Direct slowdown: when the k³
466 // Cholesky dominates, stay on the CPU.
467 assembly >= border_chol
468 && dense_direct_flops >= self.dense_reduction_flops_min()
469 }
470 };
471
472 ArrowBorderSolvePlan {
473 n,
474 k,
475 d,
476 cg_iters: iters,
477 data_fit_rank,
478 dense_border_rank_deficient,
479 dense_direct_flops,
480 reduced_iterative_flops,
481 recommended,
482 device_favorable,
483 }
484 }
485}
486
/// Aspirational single-GPU design-row throughput that the #1412 decision gate
/// is meant to establish for the LLM-shape batched-Cholesky + tile-GEMM fit
/// pipeline: 100 000 design rows per wall-clock second per device.
///
/// The original gate *claimed* this number without ever measuring it. The
/// contract is now the reverse: a benchmark (`examples/throughput_1412.rs`)
/// measures the true rows/sec on a real device, and
/// [`GpuThroughputVerdict::from_measurement`] reports whether the measured
/// value meets the target. The verdict is a *function of the measurement*,
/// not a hardcoded assertion. See `tests/owed_1412.rs`.
pub const GPU_THROUGHPUT_TARGET_ROWS_PER_SEC: f64 = 100_000.0;
498
499/// Outcome of comparing a *measured* GPU throughput against the target. The
500/// only way to construct one is [`Self::from_measurement`], so a verdict can
501/// never assert a target that was not actually established by a measurement.
502#[derive(Clone, Copy, Debug, PartialEq)]
503pub struct GpuThroughputVerdict {
504 /// The measured design-rows-per-second on the device under test.
505 pub measured_rows_per_sec: f64,
506 /// The target the measurement is compared against.
507 pub target_rows_per_sec: f64,
508 /// `measured / target`. ≥ 1.0 means the target was established.
509 pub fraction_of_target: f64,
510 /// True iff `measured_rows_per_sec >= target_rows_per_sec`.
511 pub meets_target: bool,
512}
513
514impl GpuThroughputVerdict {
515 /// Build a verdict from a measured throughput against
516 /// [`GPU_THROUGHPUT_TARGET_ROWS_PER_SEC`]. A non-finite or non-positive
517 /// measurement can never meet the target (it is not a usable measurement).
518 #[inline]
519 pub fn from_measurement(measured_rows_per_sec: f64) -> Self {
520 Self::from_measurement_against(measured_rows_per_sec, GPU_THROUGHPUT_TARGET_ROWS_PER_SEC)
521 }
522
523 /// Build a verdict against an explicit target (used by tests that probe the
524 /// comparison logic without depending on the global target constant).
525 #[inline]
526 pub fn from_measurement_against(measured_rows_per_sec: f64, target_rows_per_sec: f64) -> Self {
527 let usable = measured_rows_per_sec.is_finite() && measured_rows_per_sec > 0.0;
528 let fraction_of_target = if usable && target_rows_per_sec > 0.0 {
529 measured_rows_per_sec / target_rows_per_sec
530 } else {
531 0.0
532 };
533 Self {
534 measured_rows_per_sec,
535 target_rows_per_sec,
536 fraction_of_target,
537 meets_target: usable && measured_rows_per_sec >= target_rows_per_sec,
538 }
539 }
540}
541
/// Reason a Stage-3 encode deployment decision could not be taken from a real
/// device measurement (#988, #1412).
///
/// Every variant describes a state in which the `100_000` rows/sec/GPU target
/// was neither established NOR refuted on a device: the decision is blocked
/// on hardware rather than green-washed from a CPU proxy.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodeDecisionBlocked {
    /// This host has no CUDA device, so the exact encode could not be
    /// measured on a device at all (a CPU rate cannot substitute — that was
    /// the #1412 defect).
    NoDevice,
    /// A device exists but no device-resident *exact-encode* kernel does, so
    /// the FULL per-row encode cannot be measured on the device. (The
    /// resident normal-equations solve in [`crate::encode_throughput`] is
    /// only ONE component of the encode, not the encode itself; a component
    /// measurement cannot decide the encode surrogate question — #988.)
    NoDeviceEncodeKernel,
    /// A device exists and a measurement was attempted, but the device path
    /// did not engage (false routing); the result is refused rather than
    /// reported as a pass.
    DeviceNotEngaged,
}
562
563/// Tri-state Stage-3 encode deployment / amortized-surrogate decision
564/// (#988, #1412).
565///
566/// The decision the throughput gate exists to make is empirical: does the EXACT
567/// per-row encode clear the `100_000` rows/sec/GPU deployment target on a real
568/// device? Only a real device measurement can answer it:
569/// * [`Self::Met`] — a device measurement CLEARED the target: ship the exact
570/// encode; the certified amortized surrogate is NOT needed.
571/// * [`Self::Unmet`] — a device measurement MISSED the target: the certified
572/// amortized surrogate becomes justified.
573/// * [`Self::Undetermined`] — no device measurement is available. The decision
574/// is BLOCKED on hardware; it is neither "surrogate unneeded" nor "surrogate
575/// justified".
576///
577/// The critical anti-green-wash property (#1412): there is NO constructor that
578/// takes a CPU rate. A CPU measurement, however fast, can never move the decision
579/// out of [`Self::Undetermined`]. Projecting a CPU rate through an assumed
580/// CPU→GPU factor to declare the target met was the exact #1412 defect and is
581/// structurally impossible here — [`Self::Met`] / [`Self::Unmet`] come only from
582/// [`Self::from_device_measurement`] with `engaged == true`.
583#[derive(Clone, Copy, Debug, PartialEq)]
584pub enum EncodeDeploymentDecision {
585 /// A device measurement established the deployment target.
586 Met {
587 /// The measured device rows/sec that cleared the target.
588 measured_rows_per_sec: f64,
589 /// The target it was compared against.
590 target_rows_per_sec: f64,
591 },
592 /// A device measurement fell short of the deployment target.
593 Unmet {
594 /// The measured device rows/sec that missed the target.
595 measured_rows_per_sec: f64,
596 /// The target it was compared against.
597 target_rows_per_sec: f64,
598 },
599 /// No device measurement is available; the decision is blocked on hardware.
600 Undetermined {
601 /// Why no device measurement could be made.
602 reason: EncodeDecisionBlocked,
603 },
604}
605
606impl EncodeDeploymentDecision {
607 /// The ONLY path to a `Met`/`Unmet` decision: a device measurement that
608 /// actually engaged the device and produced a usable rate. `engaged == false`
609 /// (false routing / CPU decline) or a non-finite / non-positive rate yields
610 /// [`Self::Undetermined`] — never a fabricated pass or fail.
611 #[must_use]
612 pub fn from_device_measurement(engaged: bool, measured_rows_per_sec: f64) -> Self {
613 Self::from_device_measurement_against(
614 engaged,
615 measured_rows_per_sec,
616 GPU_THROUGHPUT_TARGET_ROWS_PER_SEC,
617 )
618 }
619
620 /// [`Self::from_device_measurement`] against an explicit target (for tests
621 /// that probe the decision logic without the global target constant).
622 #[must_use]
623 pub fn from_device_measurement_against(
624 engaged: bool,
625 measured_rows_per_sec: f64,
626 target_rows_per_sec: f64,
627 ) -> Self {
628 let usable = measured_rows_per_sec.is_finite() && measured_rows_per_sec > 0.0;
629 if !engaged || !usable {
630 return Self::Undetermined {
631 reason: EncodeDecisionBlocked::DeviceNotEngaged,
632 };
633 }
634 if measured_rows_per_sec >= target_rows_per_sec {
635 Self::Met {
636 measured_rows_per_sec,
637 target_rows_per_sec,
638 }
639 } else {
640 Self::Unmet {
641 measured_rows_per_sec,
642 target_rows_per_sec,
643 }
644 }
645 }
646
647 /// Construct the blocked decision for a host that cannot measure the exact
648 /// encode on a device. This is the honest CPU-only / no-device-kernel outcome
649 /// — the deployment target is left undetermined rather than projected.
650 #[must_use]
651 pub fn blocked(reason: EncodeDecisionBlocked) -> Self {
652 Self::Undetermined { reason }
653 }
654
655 /// True ONLY when a device measurement cleared the target: the exact encode
656 /// ships and no surrogate is built. Never true from a CPU proxy.
657 #[must_use]
658 pub fn surrogate_unneeded(&self) -> bool {
659 matches!(self, Self::Met { .. })
660 }
661
662 /// True ONLY when a device measurement missed the target: the certified
663 /// amortized surrogate becomes justified. Never true without a measurement.
664 #[must_use]
665 pub fn surrogate_justified(&self) -> bool {
666 matches!(self, Self::Unmet { .. })
667 }
668
669 /// True when no device measurement is available and the decision is blocked
670 /// on hardware (neither [`Self::surrogate_unneeded`] nor
671 /// [`Self::surrogate_justified`]).
672 #[must_use]
673 pub fn is_undetermined(&self) -> bool {
674 matches!(self, Self::Undetermined { .. })
675 }
676}
677
/// The `(response, link)` families that the Stage 3.3 device-resident PIRLS
/// loop can evaluate without the Level-B raw-body NVRTC path.
///
/// This mirrors `PirlsRowFamily::ALL` at the policy layer, which keeps the
/// predicate linkable from the CPU PIRLS entry without pulling a Linux-only
/// enum into every host compilation unit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PirlsLoopFamilyKind {
    /// Bernoulli response, logit link.
    BernoulliLogit,
    /// Bernoulli response, probit link.
    BernoulliProbit,
    /// Bernoulli response, complementary log-log link.
    BernoulliCLogLog,
    /// Poisson response, log link.
    PoissonLog,
    /// Gaussian response, identity link.
    GaussianIdentity,
    /// Gamma response, log link.
    GammaLog,
}
693
/// Curvature surface used by the inner PIRLS loop.
///
/// NOTE(review): the variant meanings below are inferred from their names
/// (expected Fisher information vs. observed Hessian) — confirm against the
/// row kernels.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PirlsLoopCurvatureKind {
    /// Presumably Fisher (expected-information) curvature.
    Fisher,
    /// Presumably observed-information curvature.
    Observed,
}
699
700/// Inputs to [`should_run_reml_outer_on_device`]. The admission predicate
701/// for routing the *outer* REML BFGS-over-ρ loop onto a fully device-resident
702/// driver (rather than the host orchestrator that hops out per step).
703///
704/// Fields are intentionally lifted from data the CPU REML entry has on hand
705/// before it touches the seed generator or the inner P-IRLS loop, so the
706/// admission check is allocation-free and can short-circuit before any
707/// device call.
708#[derive(Clone, Copy, Debug)]
709pub struct RemlOuterAdmission {
710 /// Active design rows (post-transform).
711 pub n: usize,
712 /// Active design columns / penalised-Hessian dimension.
713 pub p: usize,
714 /// Number of smoothing parameters ρ the outer BFGS optimises over.
715 pub num_rho: usize,
716 /// Inner family / link pair the device-resident PIRLS loop can evaluate.
717 /// `None` means the family does not map onto the six JIT-cached row
718 /// kernels — the outer loop must stay on the host orchestrator because
719 /// the inner step would already hop out anyway.
720 pub family: Option<PirlsLoopFamilyKind>,
721 /// Curvature surface the inner loop will use; tied to `family` via
722 /// `pirls_loop_curvature_for`.
723 pub curvature: PirlsLoopCurvatureKind,
724 /// True when the CUDA runtime is initialised on this host.
725 pub gpu_available: bool,
726}
727
728/// Inputs to [`should_use_gpu_pirls_loop`]. Each field comes from data the
729/// CPU PIRLS entry has on hand before it touches the eigendecomposition
730/// engine, so the admission check itself is allocation-free and can short-
731/// circuit before any heavy work happens.
732#[derive(Clone, Copy, Debug)]
733pub struct PirlsLoopAdmission {
734 /// Number of rows in the active (post-transform) design matrix.
735 pub n: usize,
736 /// Number of columns in the active design (i.e. `p` of `Xᵀ X`).
737 pub p: usize,
738 /// `Some(_)` when the inner family maps onto one of the six JIT-cached
739 /// `PirlsRowFamily` variants; `None` for custom families that still
740 /// require Stage 6 Level B and have not yet been admitted here.
741 pub family: Option<PirlsLoopFamilyKind>,
742 /// Curvature surface the inner loop will use; the GPU loop has Fisher +
743 /// Observed kernels, anything else (e.g. expected-projection surrogates)
744 /// is not admitted.
745 pub curvature: PirlsLoopCurvatureKind,
746 /// True when the CUDA runtime is initialised on this host (i.e.
747 /// `GpuRuntime::global().is_some()`).
748 pub gpu_available: bool,
749}
750
751impl GpuDispatchPolicy {
752 /// Minimum design column count for the device-resident inner/outer loops.
753 ///
754 /// Below this width the per-iteration `XᵀWX + Cholesky` is dominated by
755 /// launch latency and PCIe staging rather than arithmetic, so the host LM
756 /// loop (which populates the full `PirlsResult` surface as a free
757 /// side-effect) is strictly cheaper. Shared by both the inner PIRLS and
758 /// outer REML admission predicates so they cannot drift apart.
759 pub const DEVICE_LOOP_MIN_P: usize = 32;
760
761 /// Conservative admission predicate for routing
762 /// `fit_model_for_fixed_rho_with_adaptive_kkt` through the Stage 3.3
763 /// device-resident PIRLS loop instead of the CPU LM loop.
764 ///
765 /// The threshold is the dense `XᵀWX` work estimate, not row count alone:
766 /// LLM/SAE fits can have only a few thousand rows but thousands of columns,
767 /// so `2*n*p^2` already dwarfs launch/staging overhead. Smaller fits stay on
768 /// the CPU LM loop where the full `PirlsResult` surface (firth, EDF,
769 /// per-row weights, …) is already populated as a free side-effect of the
770 /// iteration.
771 pub const fn should_use_gpu_pirls_loop(&self, adm: PirlsLoopAdmission) -> bool {
772 if !adm.gpu_available {
773 return false;
774 }
775 if !self.dense_hessian_work_target_is_gpu(adm.n, adm.p) {
776 return false;
777 }
778 match adm.family {
779 Some(_) => true,
780 None => false,
781 }
782 }
783
784 /// Admission predicate for routing the outer REML BFGS-over-ρ loop onto
785 /// a device-resident driver that keeps the BFGS state (ρ, gradient,
786 /// Hessian approx) on-device and only downloads the per-step scalar
787 /// metrics (objective value, gradient norm, convergence flag).
788 ///
789 /// The dense-work threshold piggybacks on the existing inner-PIRLS admission
790 /// predicate because the device-resident outer loop calls
791 /// `pirls_loop_on_stream` per step and must not pay the host hop for small
792 /// fits the inner loop would have rejected anyway. The
793 /// `num_rho ≥ 2` floor rules out the trivial single-smoother case where
794 /// host orchestration is already negligible and the device BFGS state
795 /// (one length-`num_rho` gradient + a `num_rho × num_rho` Hessian
796 /// approx) collapses to a couple of scalars not worth keeping on device.
797 pub const fn should_run_reml_outer_on_device(&self, adm: RemlOuterAdmission) -> bool {
798 if !adm.gpu_available {
799 return false;
800 }
801 if !self.dense_hessian_work_target_is_gpu(adm.n, adm.p) {
802 return false;
803 }
804 if adm.num_rho < 2 {
805 return false;
806 }
807 match adm.family {
808 Some(_) => true,
809 None => false,
810 }
811 }
812}
813
#[cfg(test)]
mod refinement_policy_tests {
    use super::*;

    /// Builds a policy that differs from the default only in its
    /// mixed-precision setting, so each test pins the policy it exercises.
    fn policy_with(mixed_precision: GpuMixedPrecisionPolicy) -> GpuDispatchPolicy {
        GpuDispatchPolicy {
            mixed_precision,
            ..Default::default()
        }
    }

    /// The default policy is expected to be `Refinement`, and under it any
    /// `p` at or above `REFINEMENT_MIN_P` is admitted.
    #[test]
    fn refinement_policy_admits_large_p() {
        let pol = GpuDispatchPolicy::default();
        // Check the default explicitly rather than relying on a comment.
        assert_eq!(pol.mixed_precision, GpuMixedPrecisionPolicy::Refinement);
        assert!(pol.iterative_refinement_should_attempt(512));
        assert!(pol.iterative_refinement_should_attempt(GpuDispatchPolicy::REFINEMENT_MIN_P));
    }

    /// Under `Refinement`, any `p` below `REFINEMENT_MIN_P` is rejected.
    #[test]
    fn refinement_policy_rejects_small_p() {
        // Pin `Refinement` explicitly: with `Off`/`Never` these assertions
        // would pass vacuously and stop exercising the `p` threshold.
        let pol = policy_with(GpuMixedPrecisionPolicy::Refinement);
        assert!(!pol.iterative_refinement_should_attempt(GpuDispatchPolicy::REFINEMENT_MIN_P - 1));
        assert!(!pol.iterative_refinement_should_attempt(0));
    }

    /// `Off` never attempts refinement, even for a large `p`.
    #[test]
    fn off_policy_never_attempts_refinement() {
        let pol = policy_with(GpuMixedPrecisionPolicy::Off);
        assert!(!pol.iterative_refinement_should_attempt(1024));
    }

    /// `Never` never attempts refinement, even for a large `p`.
    #[test]
    fn never_policy_never_attempts_refinement() {
        let pol = policy_with(GpuMixedPrecisionPolicy::Never);
        assert!(!pol.iterative_refinement_should_attempt(1024));
    }
}
851
#[cfg(test)]
mod reduced_schur_matvec_offload_tests {
    use super::*;

    /// Target shape of the #1017 Phase-1 re-keying (LLM/SAE): a few thousand
    /// row blocks, a *wide* border (decoder atom count in the thousands), a
    /// modest per-row frame depth, and a realistic CG budget. Neither the
    /// row-count gate (50k) nor the dense-Direct flop floor catches this
    /// "thousands of tiny dense ops" shape, so the work-amortised matvec gate
    /// has to fire on it.
    #[test]
    fn admits_llm_sae_matvec_shape() {
        let policy = GpuDispatchPolicy::default();
        // n≈2000 rows, k≈2048 atoms, M≈8 frame depth — n sits far below the
        // 50k row gate, yet the summed CG matvec work is large.
        let min_iters = GpuDispatchPolicy::MATVEC_OFFLOAD_MIN_CG_ITERS;
        let matvec_gate_fires =
            policy.reduced_schur_matvec_should_offload(2_000, 2_048, 8, min_iters);
        assert!(matvec_gate_fires);
        // The row-count-style dense gate rejects the same shape, which
        // confirms that the re-keying is what admits it.
        let dense_gate_fires = policy.dense_hessian_work_target_is_gpu(2_000, 8);
        assert!(!dense_gate_fires);
    }

    /// A single conservative CG iteration is already enough for the wide LLM
    /// border to clear the breakeven: the per-apply work alone is
    /// `2_000·(2·8·2_048 + 8²) ≈ 6.6e7` flops > 1e7 under the conservative
    /// `n·(2·d·k + d²)` model (the true `n·(4·d·k + d²)` arithmetic is
    /// ≈1.3e8). The gate therefore does not depend on an inflated iteration
    /// count.
    #[test]
    fn admits_llm_shape_with_one_cg_iter() {
        let policy = GpuDispatchPolicy::default();
        let single_iter_admitted = policy.reduced_schur_matvec_should_offload(2_000, 2_048, 8, 1);
        assert!(single_iter_admitted);
    }

    /// Tiny shapes dominated by host↔device transfer must stay on the CPU: a
    /// handful of rows, a narrow border, shallow frames. Their summed matvec
    /// work is orders of magnitude below the staging breakeven.
    #[test]
    fn rejects_tiny_shape_where_transfer_dominates() {
        let policy = GpuDispatchPolicy::default();
        let min_iters = GpuDispatchPolicy::MATVEC_OFFLOAD_MIN_CG_ITERS;
        let tiny_offloads = policy.reduced_schur_matvec_should_offload(30, 8, 2, min_iters);
        assert!(!tiny_offloads);
        // The 300×8 shape used by the production seam tests as the "stay CPU"
        // canary is rejected here as well.
        let canary_offloads = policy.reduced_schur_matvec_should_offload(300, 8, 4, 16);
        assert!(!canary_offloads);
    }

    /// A border narrower than the device-loop floor is rejected no matter how
    /// much row/iteration work is piled on, because per-apply launch latency
    /// dominates a sub-`DEVICE_LOOP_MIN_P` border.
    #[test]
    fn rejects_narrow_border_even_with_huge_row_count() {
        let policy = GpuDispatchPolicy::default();
        let below_floor = GpuDispatchPolicy::DEVICE_LOOP_MIN_P - 1;
        let offloads = policy.reduced_schur_matvec_should_offload(1_000_000, below_floor, 64, 64);
        assert!(!offloads);
    }

    /// Degenerate dimensions are never offloaded (no work, or no solve).
    #[test]
    fn rejects_degenerate_dimensions() {
        let policy = GpuDispatchPolicy::default();
        // One dimension is zeroed per case, in argument order.
        let degenerate_cases = [
            (0usize, 2_048usize, 8usize, 8),
            (2_000, 0, 8, 8),
            (2_000, 2_048, 0, 8),
            (2_000, 2_048, 8, 0),
        ];
        for (n, k, d, cg_iters) in degenerate_cases {
            assert!(!policy.reduced_schur_matvec_should_offload(n, k, d, cg_iters));
        }
    }

    /// The gate is monotone in the CG budget. A shape admitted at some
    /// iteration count stays admitted at any larger count (more applies over
    /// the same resident frames only improves amortization), and a borderline
    /// shape crosses the breakeven as iterations grow.
    #[test]
    fn monotone_in_cg_iters() {
        let policy = GpuDispatchPolicy::default();
        // Border at the floor, shallow frames, few rows: per-apply work is
        // ~ n·(2·d·k + d²). The shape is chosen to sit below breakeven at one
        // iteration and above it once enough iterations accumulate.
        let (n, k, d) = (200usize, GpuDispatchPolicy::DEVICE_LOOP_MIN_P, 4usize);
        let offloads_at = |cg_iters| policy.reduced_schur_matvec_should_offload(n, k, d, cg_iters);
        // per_apply ≈ 200·(2·4·32 + 16) = 200·272 = 54_400 flops.
        assert!(!offloads_at(1));
        // The gate fires once the summed work clears 1e7; ~184 iters here.
        assert!(offloads_at(1_000));
        // Monotonicity: admitted at 1_000 ⇒ admitted at every larger budget.
        assert!(offloads_at(5_000));
    }

    /// For any non-degenerate cross-block shape the admission lower bound has
    /// to stay strictly below the true per-apply work `n·(4·d·k + d²)`, since
    /// it drops the transpose GEMV. Reading the lower bound as a flop count
    /// would over-report device speedups, so this test asserts that the gap
    /// is real.
    #[test]
    fn admission_lower_bound_undercounts_actual_work() {
        let shapes = [
            (2_000usize, 2_048usize, 8usize),
            (200, GpuDispatchPolicy::DEVICE_LOOP_MIN_P, 4),
            (1, 1, 1),
        ];
        for (n, k, d) in shapes {
            let lower = GpuDispatchPolicy::admission_work_lower_bound(n, k, d);
            // True per-apply work: the full forward+transpose GEMV pair plus
            // the d×d solve, i.e. n·(4·d·k + d²), evaluated in u128.
            let (rows, border, depth) = (n as u128, k as u128, d as u128);
            let actual = rows * (4 * depth * border + depth * depth);
            assert!(
                lower < actual,
                "admission lower bound {lower} must undercount actual work {actual} for ({n},{k},{d})"
            );
        }
    }
}
967
#[cfg(test)]
mod arrow_border_solve_plan_tests {
    use super::*;

    /// The #1017 color arm: few rows, shallow per-row depth, and a very wide
    /// border (`k = 15360 = 3 × 5120`). The dense `k³/3` Cholesky
    /// (`≈ 1.2e12` flops) dwarfs a matrix-free PCG solve at any realistic CG
    /// budget, and the border is grossly rank-deficient (`n·d = 360 ≪ k`).
    /// The plan has to recommend `ReducedIterative` and flag the rank
    /// deficiency.
    #[test]
    fn color_arm_recommends_reduced_iterative_and_flags_rank_deficiency() {
        let policy = GpuDispatchPolicy::default();
        let (n, k, d, cg_iters) = (180, 15_360, 2, 30);
        let plan = policy.arrow_border_solve_plan(n, k, d, cg_iters);
        assert_eq!(plan.recommended, ArrowBorderStrategy::ReducedIterative);
        assert!(plan.dense_border_rank_deficient);
        assert_eq!(plan.data_fit_rank, 360);
        // Dense is orders of magnitude more expensive at this shape.
        let hundredfold_iterative = plan.reduced_iterative_flops * 100;
        assert!(plan.dense_direct_flops > hundredfold_iterative);
        // The recommended (iterative) path is device-favorable here: the wide
        // border × summed CG work clears the matvec offload floor.
        assert!(plan.device_favorable);
    }

    /// A modest, near-square border whose data-fit rank is comparable to `k`
    /// and whose `k³/3` Cholesky is cheap: dense Direct is the right call.
    #[test]
    fn small_square_border_recommends_dense_direct() {
        let policy = GpuDispatchPolicy::default();
        // n·d = 400 > k = 64, so not rank-deficient; a 64³/3 Cholesky is
        // trivial.
        let (n, k, d, cg_iters) = (200, 64, 2, 8);
        let plan = policy.arrow_border_solve_plan(n, k, d, cg_iters);
        assert_eq!(plan.recommended, ArrowBorderStrategy::DenseDirect);
        assert!(!plan.dense_border_rank_deficient);
        assert_eq!(plan.data_fit_rank, 64);
    }

    /// The rank-deficiency flag is exactly `n·d < k`, and `data_fit_rank` is
    /// clamped at `k` (the border can carry no more than `k` data directions).
    #[test]
    fn rank_flag_and_clamp_track_n_d_versus_k() {
        let policy = GpuDispatchPolicy::default();
        let plan_for_rows = |n| policy.arrow_border_solve_plan(n, 100, 2, 8);
        // n·d == k exactly: a full-rank border, not deficient.
        let exact = plan_for_rows(50);
        assert!(!exact.dense_border_rank_deficient);
        assert_eq!(exact.data_fit_rank, 100);
        // n·d one step below k: deficient.
        let deficient = plan_for_rows(49);
        assert!(deficient.dense_border_rank_deficient);
        assert_eq!(deficient.data_fit_rank, 98);
    }

    /// At fixed row work the recommendation is monotone toward
    /// `ReducedIterative` as the border widens: once the dense `k³` term
    /// overtakes the linear-in-`k` iterative cost, growing `k` keeps it
    /// recommending iterative.
    #[test]
    fn wider_border_only_moves_toward_iterative() {
        let policy = GpuDispatchPolicy::default();
        let plan_for_border = |k| policy.arrow_border_solve_plan(200, k, 4, 16);
        let narrow = plan_for_border(128);
        let wide = plan_for_border(8_192);
        // The wide border has to recommend iterative.
        assert_eq!(wide.recommended, ArrowBorderStrategy::ReducedIterative);
        // Whether the narrow border already recommends iterative (monotone)
        // or not (strict switch), the wide border's dense/iterative flop
        // ratio must exceed the narrow one's.
        let narrow_dense = narrow.dense_direct_flops as f64;
        let narrow_iterative = narrow.reduced_iterative_flops as f64;
        let wide_dense = wide.dense_direct_flops as f64;
        let wide_iterative = wide.reduced_iterative_flops as f64;
        let narrow_ratio = narrow_dense / narrow_iterative;
        let wide_ratio = wide_dense / wide_iterative;
        assert!(wide_ratio > narrow_ratio);
    }

    /// Raising the CG budget makes the iterative path more expensive, so the
    /// crossover can only move toward `DenseDirect`, never away from it. A
    /// shape that is `DenseDirect` at a small budget stays `DenseDirect` at a
    /// larger one.
    #[test]
    fn larger_cg_budget_never_switches_away_from_dense() {
        let policy = GpuDispatchPolicy::default();
        let (n, k, d) = (200usize, 96usize, 3usize);
        let small = policy.arrow_border_solve_plan(n, k, d, 4);
        let large = policy.arrow_border_solve_plan(n, k, d, 400);
        if small.recommended == ArrowBorderStrategy::DenseDirect {
            assert_eq!(large.recommended, ArrowBorderStrategy::DenseDirect);
        }
        assert!(large.reduced_iterative_flops >= small.reduced_iterative_flops);
    }

    /// Degenerate shapes produce an all-zero plan on the trivial `DenseDirect`
    /// path and are never device-favorable.
    #[test]
    fn degenerate_shapes_are_trivial_dense_and_not_device_favorable() {
        let policy = GpuDispatchPolicy::default();
        let degenerate_shapes = [(0usize, 100usize, 2usize), (100, 0, 2), (100, 100, 0)];
        for (n, k, d) in degenerate_shapes {
            let plan = policy.arrow_border_solve_plan(n, k, d, 8);
            assert_eq!(plan.recommended, ArrowBorderStrategy::DenseDirect);
            assert!(!plan.device_favorable);
            assert_eq!(plan.dense_direct_flops, 0);
            assert_eq!(plan.reduced_iterative_flops, 0);
        }
    }

    /// A zero CG budget counts as one apply (a plan must still be
    /// comparable), never a divide-by-zero or an all-free iterative path.
    #[test]
    fn zero_cg_budget_is_treated_as_one_apply() {
        let policy = GpuDispatchPolicy::default();
        let zero_budget_plan = policy.arrow_border_solve_plan(180, 15_360, 2, 0);
        assert_eq!(zero_budget_plan.cg_iters, 1);
        assert!(zero_budget_plan.reduced_iterative_flops > 0);
    }
}
1076
1077#[cfg(test)]
1078mod encode_deployment_decision_tests {
1079 use super::*;
1080
1081 /// #1412 anti-green-wash core: a CPU rate can NEVER produce a `Met`/`Unmet`
1082 /// decision. The only Met/Unmet constructor requires `engaged == true`; a
1083 /// CPU-only host has no device measurement, so it can only ever be
1084 /// `Undetermined`, no matter how fast the CPU is.
1085 #[test]
1086 fn cpu_rate_can_never_meet_or_refute_the_target() {
1087 // Even a CPU rate a thousand times the target cannot certify the gate:
1088 // there is simply no `from_cpu_measurement` — the type has no such door.
1089 // The blocked constructor is the only CPU-side option.
1090 let cpu_only = EncodeDeploymentDecision::blocked(EncodeDecisionBlocked::NoDevice);
1091 assert!(cpu_only.is_undetermined());
1092 assert!(!cpu_only.surrogate_unneeded());
1093 assert!(!cpu_only.surrogate_justified());
1094
1095 // A "device" measurement that did not engage (false routing) is refused —
1096 // it becomes Undetermined even with a huge rate.
1097 let false_routed = EncodeDeploymentDecision::from_device_measurement(false, 1.0e9);
1098 assert!(false_routed.is_undetermined());
1099 assert!(!false_routed.surrogate_unneeded());
1100 }
1101
1102 #[test]
1103 fn engaged_measurement_decides_by_the_number() {
1104 let target = GPU_THROUGHPUT_TARGET_ROWS_PER_SEC;
1105 // Clears the target => Met => surrogate unneeded.
1106 let met = EncodeDeploymentDecision::from_device_measurement(true, target * 2.0);
1107 assert!(matches!(met, EncodeDeploymentDecision::Met { .. }));
1108 assert!(met.surrogate_unneeded());
1109 assert!(!met.surrogate_justified());
1110 assert!(!met.is_undetermined());
1111
1112 // Misses the target => Unmet => surrogate justified.
1113 let unmet = EncodeDeploymentDecision::from_device_measurement(true, target * 0.25);
1114 assert!(matches!(unmet, EncodeDeploymentDecision::Unmet { .. }));
1115 assert!(unmet.surrogate_justified());
1116 assert!(!unmet.surrogate_unneeded());
1117
1118 // Exact boundary meets the target.
1119 let boundary = EncodeDeploymentDecision::from_device_measurement(true, target);
1120 assert!(boundary.surrogate_unneeded());
1121 }
1122
1123 #[test]
1124 fn engaged_but_non_usable_rate_is_undetermined_not_a_pass() {
1125 for bad in [0.0, -1.0, f64::NAN, f64::INFINITY] {
1126 let d = EncodeDeploymentDecision::from_device_measurement(true, bad);
1127 assert!(
1128 d.is_undetermined(),
1129 "an engaged-but-unusable rate {bad} must be Undetermined, not a decision"
1130 );
1131 assert!(!d.surrogate_unneeded());
1132 assert!(!d.surrogate_justified());
1133 }
1134 }
1135
1136 #[test]
1137 fn blocked_reasons_are_all_undetermined() {
1138 for reason in [
1139 EncodeDecisionBlocked::NoDevice,
1140 EncodeDecisionBlocked::NoDeviceEncodeKernel,
1141 EncodeDecisionBlocked::DeviceNotEngaged,
1142 ] {
1143 let d = EncodeDeploymentDecision::blocked(reason);
1144 assert!(d.is_undetermined());
1145 assert!(!d.surrogate_unneeded());
1146 assert!(!d.surrogate_justified());
1147 }
1148 }
1149}