Skip to main content

gam_gpu/
policy.rs

1use serde::{Deserialize, Serialize};
2
3#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
4pub enum GpuMixedPrecisionPolicy {
5    /// Always use fp64 factorization; no refinement attempted.
6    Off,
7    /// Attempt fp32 Cholesky factorization followed by up to
8    /// `REFINEMENT_MAX_STEPS` fp64-residual refinement steps. Policy admits
9    /// the attempt only when `p ≥ REFINEMENT_MIN_P` (so that the fp64 GEMV
10    /// overhead is amortized) and the measured residual drops monotonically.
11    /// Falls back to fp64 factorization automatically when the residual does
12    /// not decrease (κ(A)·u ≥ 1 regime) or when the fp32 POTRF itself fails.
13    Refinement,
14    /// Always use fp64 factorization; equivalent to `Off` but signals that
15    /// an explicit policy decision was taken.
16    Never,
17}
18
19#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
20pub struct GpuDispatchPolicy {
21    pub xtwx_n_min: usize,
22    pub xtwx_flops_min: usize,
23    pub xtwx_use_fused_below_p: usize,
24    pub gemm_min_flops: usize,
25    pub potrf_min_p: usize,
26    pub small_dense_batched_potrf_max_p: usize,
27    pub small_dense_batched_potrf_min_batch: usize,
28    pub syevd_min_p: usize,
29    pub sparse_min_nnz: usize,
30    pub fused_kernel_min_n: usize,
31    pub keep_design_resident_min_bytes: usize,
32    pub prefer_gpu_factorization_min_p: usize,
33    pub row_kernel_min_n: usize,
34    pub mixed_precision: GpuMixedPrecisionPolicy,
35}
36
37impl Default for GpuDispatchPolicy {
38    /// Conservative seed thresholds used before device calibration and when
39    /// calibration cannot run on the current host.
40    ///
41    /// The production runtime replaces these with
42    /// [`crate::calibration::calibrated_policy_for_device`] after the CUDA
43    /// probe selects a concrete device. Keep these values conservative: they
44    /// are the typed baseline for CPU-only builds, failed calibration, and unit
45    /// tests that exercise policy predicates without initializing CUDA.
46    fn default() -> Self {
47        Self {
48            xtwx_n_min: 50_000,
49            xtwx_flops_min: 100_000_000,
50            xtwx_use_fused_below_p: 256,
51            gemm_min_flops: 100_000_000,
52            potrf_min_p: 512,
53            small_dense_batched_potrf_max_p: 32,
54            small_dense_batched_potrf_min_batch: 8,
55            syevd_min_p: 256,
56            sparse_min_nnz: 1_000_000,
57            fused_kernel_min_n: 100_000,
58            keep_design_resident_min_bytes: 32 * 1024 * 1024,
59            prefer_gpu_factorization_min_p: 512,
60            row_kernel_min_n: 50_000,
61            mixed_precision: GpuMixedPrecisionPolicy::Refinement,
62        }
63    }
64}
65
66impl GpuDispatchPolicy {
67    /// Minimum problem dimension for the fp32+refinement path.
68    ///
69    /// Below this threshold the fp64 GEMV needed for the residual check costs
70    /// more than the savings from fp32 factorization. The threshold is set so
71    /// that a single `p × p` DGEMV (2p² flops) is at least 10× cheaper than
72    /// the `p³/3` POTRF (i.e. p ≥ 64) while still leaving margin for the
73    /// POTRF/POTRS launches. In practice `p ≥ 64` matches the existing
74    /// `potrf_min_p = 512` floor for GPU dispatch, so the refinement path only
75    /// activates when the GPU factorization path is already chosen.
76    pub const REFINEMENT_MIN_P: usize = 64;
77
78    /// Maximum number of fp32-correction steps per solve.
79    ///
80    /// Two steps suffice for κ(A) ≤ 10⁵ at fp32 (u ≈ 6 × 10⁻⁸): after step
81    /// 1 the error is O(κ u)² ≈ 10⁻⁶, after step 2 it is O(κ u)⁴ ≈ 10⁻¹²,
82    /// which is well within the fp64 unit roundoff of 10⁻¹⁶ × κ. A cap of 3
83    /// is used defensively.
84    pub const REFINEMENT_MAX_STEPS: usize = 3;
85
86    /// Relative residual tolerance for declaring convergence.
87    ///
88    /// `‖r‖ / ‖b‖ ≤ tol` is considered a converged solve. 10⁻¹² is two
89    /// orders of magnitude above the fp64 machine epsilon times a moderate
90    /// condition number, leaving the policy conservative.
91    pub const REFINEMENT_TOL: f64 = 1e-12;
92
93    /// Return `true` when the policy and problem size together suggest that
94    /// attempting fp32 factorization + iterative refinement will be profitable.
95    ///
96    /// The predicate is conservative:
97    ///   * `GpuMixedPrecisionPolicy::Off` or `Never` → always `false`.
98    ///   * `Refinement` with `p < REFINEMENT_MIN_P` → `false` (GEMV overhead
99    ///     not amortised by fp32 POTRF savings below this threshold).
100    ///   * Otherwise `true`; the caller still falls back to fp64 factorization
101    ///     when the runtime fp32 POTRF fails or when the measured residual is
102    ///     non-monotone.
103    #[inline]
104    pub const fn iterative_refinement_should_attempt(&self, p: usize) -> bool {
105        match self.mixed_precision {
106            GpuMixedPrecisionPolicy::Off | GpuMixedPrecisionPolicy::Never => false,
107            GpuMixedPrecisionPolicy::Refinement => p >= Self::REFINEMENT_MIN_P,
108        }
109    }
110
111    pub const fn dense_gemv_target_is_gpu(&self, n: usize, p: usize, resident: bool) -> bool {
112        resident || n.saturating_mul(p).saturating_mul(2) >= self.gemm_min_flops
113    }
114
115    pub const fn xtwx_target_is_gpu(&self, n: usize, p: usize, materialized: bool) -> bool {
116        materialized && n > 0 && p > 0 && self.xtwx_flops(n, p) >= self.dense_reduction_flops_min()
117    }
118
119    pub const fn xtwy_target_is_gpu(
120        &self,
121        n: usize,
122        px: usize,
123        q: usize,
124        materialized: bool,
125    ) -> bool {
126        materialized
127            && n > 0
128            && px > 0
129            && q > 0
130            && self.xtwy_flops(n, px, q) >= self.dense_reduction_flops_min()
131    }
132
133    pub const fn potrf_target_is_gpu(&self, p: usize, h_resident: bool) -> bool {
134        h_resident && p >= self.potrf_min_p
135    }
136
137    pub const fn dense_hessian_work_target_is_gpu(&self, n: usize, p: usize) -> bool {
138        n > 0
139            && p >= Self::DEVICE_LOOP_MIN_P
140            && self.xtwx_flops(n, p) >= self.dense_reduction_flops_min()
141    }
142
143    const fn dense_reduction_flops_min(&self) -> u128 {
144        if self.xtwx_flops_min < self.gemm_min_flops {
145            self.xtwx_flops_min as u128
146        } else {
147            self.gemm_min_flops as u128
148        }
149    }
150
151    const fn xtwx_flops(&self, n: usize, p: usize) -> u128 {
152        2u128 * (n as u128) * (p as u128) * (p as u128)
153    }
154
155    const fn xtwy_flops(&self, n: usize, px: usize, q: usize) -> u128 {
156        2u128 * (n as u128) * (px as u128) * (q as u128)
157    }
158
159    /// Minimum total CG-amortised matvec flops below which the host↔device
160    /// transfer of the row frames + CG vectors is not repaid by the device
161    /// matvec, so the reduced-Schur PCG hot loop stays on the CPU.
162    ///
163    /// The dense-Direct path keys on `dense_reduction_flops_min` (a single big
164    /// factorization). The matrix-free SAE matvec is different: no single apply
165    /// trips that floor (each is a stack of `n` tiny `d×d` solves + sparse
166    /// `m·k` gather/scatter), but the *whole CG solve* runs the apply
167    /// `O(cg_iters)` times over the same resident frames. The device wins when
168    /// the **summed** matvec work over the solve exceeds the one-time staging
169    /// cost — so the gate keys on `cg_iters · per_apply_flops`, not one apply.
170    ///
171    /// Set one order of magnitude below the dense floor: the matvec frames stay
172    /// resident across CG iterations (uploaded once), so the per-flop transfer
173    /// amortization is `1/cg_iters` of a cold dense launch, and the breakeven
174    /// drops accordingly.
175    pub const MATVEC_OFFLOAD_FLOPS_MIN: u128 = 10_000_000;
176
177    /// Thin-curve (`d_atom = 1`) SAE dictionaries are the common manifold-SAE
178    /// production shape: each per-row frame is a scalar, so the staged device
179    /// payload is much smaller than the general `d > 1` row-frame bundle, while
180    /// the work is still a large batched gather/scatter over `K` atoms and `n`
181    /// rows.  Use a lower admission floor for this scalar-frame regime so a
182    /// realistic token block with a moderately wide curve dictionary is not kept
183    /// on the CPU solely because the conservative general-frame lower-bound
184    /// undercounts the transpose cross term.
185    pub const THIN_CURVE_MATVEC_OFFLOAD_FLOPS_MIN: u128 = 1_000_000;
186
187    /// Conservative seed for the reduced-Schur PCG iteration count when the
188    /// caller cannot supply a measured budget. InexactPCG on an SAE β-block of
189    /// width `k` converges in `O(√κ)` iterations; this floor keeps the work
190    /// estimate honest (≥ this many applies) without over-claiming a tight
191    /// solve. Used only to amortise the staging cost in the work estimate.
192    pub const MATVEC_OFFLOAD_MIN_CG_ITERS: usize = 8;
193
194    /// Per-apply flop estimate for one reduced-Schur matvec `S·x` of a
195    /// matrix-free SAE Kronecker system, as a pure function of the system shape.
196    ///
197    /// Per row block `i` the apply does: a forward cross-block GEMV
198    /// `v_i = H_tβ^(i)·x` (`≈ 2·d·k` multiply-adds, with the per-row latent
199    /// depth `d` as the M-frame width and `k` the border), a `d×d` triangular
200    /// solve through the cached Cholesky factor (`≈ d²`), and a transpose
201    /// cross-block GEMV `H_βt^(i)·w_i` (`≈ 2·d·k`). The two `2·d·k` GEMVs would
202    /// sum to `4·d·k`; this estimate deliberately undercounts to a single
203    /// `2·d·k` cross term as a conservative (lower-bound) admission floor, so
204    /// the apply is modelled as `≈ n·(2·d·k + d²)`. This is a deliberate
205    /// lower bound on the true `≈ n·(4·d·k + d²)` arithmetic — admitting a
206    /// shape under the smaller figure can only be more conservative, never
207    /// over-eager. It is keyed on the *frame depth* `d` (M) and border width
208    /// `k` (p), not row count alone, so LLM shapes (few rows, wide `k`, modest
209    /// `d`) register arithmetic the row-count gate misses.
210    ///
211    /// USE FOR DISPATCH GATING ONLY. This is **not** a flop count: it omits the
212    /// transpose cross-block GEMV (`2·d·k`), so it is a strict lower bound on the
213    /// true per-apply work `n·(4·d·k + d²)`. The gate can therefore only
214    /// under-admit, never over-admit. Do not reuse it for benchmark / speedup
215    /// accounting.
216    const fn admission_work_lower_bound(n: usize, k: usize, d: usize) -> u128 {
217        let n = n as u128;
218        let k = k as u128;
219        let d = d as u128;
220        // 2·d·k cross-block apply (forward only) + d² per-row solve — the
221        // transpose GEMV is intentionally dropped so this stays a lower bound.
222        n.saturating_mul(
223            2u128
224                .saturating_mul(d)
225                .saturating_mul(k)
226                .saturating_add(d * d),
227        )
228    }
229
230    /// Work-based admission for offloading the **reduced-Schur PCG matvec**
231    /// (the InexactPCG hot loop for matrix-free SAE β-blocks) to the device.
232    ///
233    /// This is the Phase-1 (#1017) re-keying: the dense gates key on row count
234    /// (`xtwx_n_min`, `row_kernel_min_n` at 50k) or a single big-factorization
235    /// flop floor, neither of which the SAE LLM shape trips — `(n≈2000) ×
236    /// (k≈2048) × (d≈8)` is *thousands of small dense ops*, no single op large,
237    /// so the row-count gate keeps the whole fit on one CPU core. Here the gate
238    /// is the **total batched work over the CG solve**:
239    ///
240    /// ```text
241    /// estimated_device_flops = cg_iters · per_apply_flops(n, k, d)
242    /// should_offload = estimated_device_flops ≥ T_breakeven
243    /// ```
244    ///
245    /// where `T_breakeven = MATVEC_OFFLOAD_FLOPS_MIN` accounts for the
246    /// host↔device staging of the row frames + CG vectors amortised over the
247    /// `cg_iters` applies that reuse the resident frames (so the per-flop
248    /// transfer cost is `1/cg_iters` of a cold launch, an order of magnitude
249    /// below the dense-Direct floor).
250    ///
251    /// Pure function of the shape: no device needed to evaluate, so it is unit-
252    /// testable. The caller still falls back to the bit-identical CPU matvec
253    /// whenever the backend build declines, so admitting a shape never changes
254    /// the numerics — only where the `Σ_i Y_iᵀ(Y_i x)` flops execute.
255    ///
256    /// * `n`        — number of row blocks (SAE observations / latent rows).
257    /// * `k`        — border β width (the SAE decoder atom count `K`).
258    /// * `d`        — per-row latent / active-frame depth (the M dimension).
259    /// * `cg_iters` — expected PCG iteration budget; the per-apply work is
260    ///   multiplied by this because the frames stay resident across iterations.
261    ///   Pass [`Self::MATVEC_OFFLOAD_MIN_CG_ITERS`] when no measured budget is
262    ///   available; a tighter (smaller) value only makes the gate stricter.
263    ///
264    /// ## Live arrow-Schur call site
265    ///
266    /// `crate::solver::arrow_schur::maybe_inject_gpu_schur_matvec` gates the
267    /// InexactPCG reduced-Schur matvec injection on this predicate:
268    /// `reduced_schur_matvec_should_offload(sys.rows.len(), sys.k, sys.d,
269    /// options.pcg.max_iterations.min(options.trust_region.max_iterations))`,
270    /// where `sys.d` is the system's max per-row latent depth and the iteration
271    /// budget is the same `max_iterations` the PCG loop launches with.
272    /// `try_device_arrow_direct` (the **dense** Direct point solve) correctly
273    /// keeps `dense_hessian_work_target_is_gpu`: that path is a single large
274    /// factorization, not the amortised matvec.
275    pub const fn reduced_schur_matvec_should_offload(
276        &self,
277        n: usize,
278        k: usize,
279        d: usize,
280        cg_iters: usize,
281    ) -> bool {
282        if n == 0 || k == 0 || d == 0 || cg_iters == 0 {
283            return false;
284        }
285        // The border width must clear the device-loop floor: below it the per-
286        // apply launch latency (one kernel sequence per matvec) dominates any
287        // arithmetic regardless of how many CG iterations run.
288        if k < Self::DEVICE_LOOP_MIN_P {
289            return false;
290        }
291        let per_apply = Self::admission_work_lower_bound(n, k, d);
292        let total = per_apply.saturating_mul(cg_iters as u128);
293        let floor = if d == 1 {
294            Self::THIN_CURVE_MATVEC_OFFLOAD_FLOPS_MIN
295        } else {
296            Self::MATVEC_OFFLOAD_FLOPS_MIN
297        };
298        total >= floor
299    }
300}
301
302/// Factorization strategy for the arrow-Schur border (shared `β`) solve, chosen
303/// from the *shape* of the joint system rather than a single fixed border-width
304/// cut (`ArrowSolverMode::automatic`'s `DIRECT_SOLVE_MAX_K = 2000`).
305///
306/// The border width alone is a blunt selector: it cannot see that the data-fit
307/// contribution to the `k × k` border is only rank `Σ_i d_i ≈ n·d`. For the
308/// #1017 color arm (`n = 180`, per-row depth `d = 2`, border `k = 15360`) the
309/// data information is rank `360` yet a dense Direct solve pays a full `k³/3 ≈
310/// 1.2e12`-flop Cholesky — the measured 26-min-class fit. This maps cleanly onto
311/// the two `ArrowSolverMode` variants the solver already implements.
312#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
313pub enum ArrowBorderStrategy {
314    /// Eliminate the per-row blocks, form the dense `k × k` reduced Schur, and
315    /// Cholesky-factor it (`ArrowSolverMode::Direct`). Appropriate for modest,
316    /// near-square borders where the `k³/3` factorization is cheap and the
317    /// data-fit rank is comparable to `k`.
318    DenseDirect,
319    /// Solve the reduced Schur iteratively by matrix-free PCG
320    /// (`ArrowSolverMode::InexactPCG`), never materialising the `k × k` factor.
321    /// Appropriate when the dense `k³` factorization dominates and/or the
322    /// data-fit contribution to the border is rank-deficient (`n·d < k`).
323    ReducedIterative,
324}
325
326/// Cost model + recommendation for the arrow-Schur border solve, a pure function
327/// of the joint-system shape (unit-testable, no device required).
328///
329/// This operationalises the measured #1017 finding that the full arrow-Schur
330/// Newton solve is dominated by the dense `k × k` border Cholesky (the on-device
331/// dense Direct solve was measured at ~0.94× — a slowdown — because the `k³/3`
332/// factorization, not the GPU-favourable batched per-row work, is the bottleneck
333/// at LLM/SAE border widths). The lever the issue calls for is to *shrink or
334/// factor the dense border* so the batched `n`-row work dominates; the plan
335/// makes that decision inspectable and honest.
336///
337/// ## Flop model (deliberate, documented approximations)
338///
339/// * **Dense Direct** ≈ `2·n·d·k²` (assemble the reduced Schur: per row a
340///   rank-`d` symmetric update `H_βt (H_tt)⁻¹ H_tβ` to the `k × k` border,
341///   `≈ 2·d·k²` flops) `+ k³/3` (Cholesky of the dense `k × k` Schur).
342/// * **Reduced iterative** ≈ `cg_iters · n·(4·d·k + d²)` (matrix-free PCG:
343///   per matvec a forward + transpose cross-block GEMV `4·d·k` plus the per-row
344///   `d × d` solve `d²`, summed over `n` row blocks, over `cg_iters` applies).
345///
346/// Both are dispatch-grade estimates, not exact operation counts; they omit
347/// preconditioner setup and lower-order terms symmetrically, so their ratio (the
348/// only thing the recommendation consumes) is meaningful while neither figure
349/// should be reused for speedup accounting.
350///
351/// ## Status
352///
353/// Advisory / diagnostic. It is **not** wired into the live
354/// `ArrowSolverMode::automatic` selector: replacing the fixed `DIRECT_SOLVE_MAX_K`
355/// cut with this shape-driven crossover changes which production fits take the
356/// Direct vs PCG path and must be validated on GPU hardware (#1017 Phase 2–4)
357/// before it can change numerics. Today it is consumed by the honest
358/// `examples/full_color_fit_1017.rs` measurement harness (modeled-vs-measured)
359/// and by the unit tests below.
360#[derive(Clone, Copy, Debug, Eq, PartialEq)]
361pub struct ArrowBorderSolvePlan {
362    /// Number of per-row blocks (SAE observations / latent rows).
363    pub n: usize,
364    /// Border `β` width (the SAE decoder atom count `K` × basis width).
365    pub k: usize,
366    /// Per-row latent / active-frame depth (the `M` dimension).
367    pub d: usize,
368    /// CG iteration budget assumed for the iterative estimate.
369    pub cg_iters: usize,
370    /// Effective rank of the data-fit contribution to the `k × k` border,
371    /// bounded by `Σ_i d_i ≈ n·d` and never more than `k`.
372    pub data_fit_rank: usize,
373    /// True when `n·d < k`: the dense `k × k` Cholesky spends `O(k³)` factorising
374    /// a border whose data information is only rank `n·d` — the pathological
375    /// wide-sparse-border regime (color arm: `n·d = 360 ≪ k = 15360`).
376    pub dense_border_rank_deficient: bool,
377    /// `≈ 2·n·d·k² + k³/3` — reduced-Schur assembly plus dense border Cholesky.
378    pub dense_direct_flops: u128,
379    /// `≈ cg_iters · n·(4·d·k + d²)` — matrix-free PCG matvecs.
380    pub reduced_iterative_flops: u128,
381    /// The recommended strategy: `ReducedIterative` iff the dense factorization
382    /// path costs strictly more arithmetic than the iterative path at
383    /// `cg_iters`.
384    pub recommended: ArrowBorderStrategy,
385    /// Whether running the *recommended* strategy on the device is expected to
386    /// pay off. For `ReducedIterative` this is `reduced_schur_matvec_should_offload`;
387    /// for `DenseDirect` the device wins only when the batched per-row assembly
388    /// work (`2·n·d·k²`, GPU-favourable batched GEMM/POTRF) at least matches the
389    /// border Cholesky (`k³/3`) *and* clears the dense flop floor — the honest
390    /// encoding of the measured 0.94× dense-Direct-on-device slowdown.
391    pub device_favorable: bool,
392}
393
394impl GpuDispatchPolicy {
395    /// Assembly flops for the dense reduced Schur: per row a rank-`d` update to
396    /// the `k × k` border (`≈ 2·d·k²`), summed over `n` rows.
397    const fn dense_schur_assembly_flops(n: usize, k: usize, d: usize) -> u128 {
398        2u128
399            .saturating_mul(n as u128)
400            .saturating_mul(d as u128)
401            .saturating_mul((k as u128).saturating_mul(k as u128))
402    }
403
404    /// Cholesky flops for the dense `k × k` reduced Schur: `≈ k³/3`.
405    const fn dense_border_cholesky_flops(k: usize) -> u128 {
406        let k = k as u128;
407        k.saturating_mul(k).saturating_mul(k) / 3
408    }
409
410    /// Total matrix-free PCG flops: `cg_iters · n·(4·d·k + d²)`.
411    const fn reduced_iterative_flops(n: usize, k: usize, d: usize, cg_iters: usize) -> u128 {
412        let n = n as u128;
413        let k = k as u128;
414        let d = d as u128;
415        let per_apply = n.saturating_mul(
416            4u128
417                .saturating_mul(d)
418                .saturating_mul(k)
419                .saturating_add(d.saturating_mul(d)),
420        );
421        per_apply.saturating_mul(cg_iters as u128)
422    }
423
424    /// Build the shape-driven [`ArrowBorderSolvePlan`] for a joint arrow-Schur
425    /// system with `n` row blocks, border width `k`, per-row depth `d`, and an
426    /// assumed CG budget `cg_iters` (pass
427    /// [`Self::MATVEC_OFFLOAD_MIN_CG_ITERS`] when none is measured; a smaller
428    /// value only biases the recommendation toward `DenseDirect`, never the
429    /// reverse).
430    ///
431    /// Degenerate shapes (`n`, `k`, or `d` zero) return an all-zero plan
432    /// recommending `DenseDirect` (the trivial/empty solve stays on the simple
433    /// path) with `device_favorable = false`.
434    pub fn arrow_border_solve_plan(
435        &self,
436        n: usize,
437        k: usize,
438        d: usize,
439        cg_iters: usize,
440    ) -> ArrowBorderSolvePlan {
441        if n == 0 || k == 0 || d == 0 {
442            return ArrowBorderSolvePlan {
443                n,
444                k,
445                d,
446                cg_iters,
447                data_fit_rank: 0,
448                dense_border_rank_deficient: false,
449                dense_direct_flops: 0,
450                reduced_iterative_flops: 0,
451                recommended: ArrowBorderStrategy::DenseDirect,
452                device_favorable: false,
453            };
454        }
455
456        let assembly = Self::dense_schur_assembly_flops(n, k, d);
457        let border_chol = Self::dense_border_cholesky_flops(k);
458        let dense_direct_flops = assembly.saturating_add(border_chol);
459        let iters = if cg_iters == 0 { 1 } else { cg_iters };
460        let reduced_iterative_flops = Self::reduced_iterative_flops(n, k, d, iters);
461
462        let data_fit_rank = (n.saturating_mul(d)).min(k);
463        let dense_border_rank_deficient = n.saturating_mul(d) < k;
464
465        let recommended = if dense_direct_flops > reduced_iterative_flops {
466            ArrowBorderStrategy::ReducedIterative
467        } else {
468            ArrowBorderStrategy::DenseDirect
469        };
470
471        let device_favorable = match recommended {
472            ArrowBorderStrategy::ReducedIterative => {
473                self.reduced_schur_matvec_should_offload(n, k, d, iters)
474            }
475            ArrowBorderStrategy::DenseDirect => {
476                // Dense Direct wins on device only when the batched per-row
477                // assembly work dominates the (poorly GPU-scaling, and here
478                // rank-deficient) border Cholesky, and the total clears the
479                // dense reduction floor. This is the honest encoding of the
480                // measured 0.94× on-device dense-Direct slowdown: when the k³
481                // Cholesky dominates, stay on the CPU.
482                assembly >= border_chol
483                    && dense_direct_flops >= self.dense_reduction_flops_min()
484            }
485        };
486
487        ArrowBorderSolvePlan {
488            n,
489            k,
490            d,
491            cg_iters: iters,
492            data_fit_rank,
493            dense_border_rank_deficient,
494            dense_direct_flops,
495            reduced_iterative_flops,
496            recommended,
497            device_favorable,
498        }
499    }
500}
501
502/// The aspirational single-GPU design-row throughput the #1412 decision gate is
503/// supposed to establish for the LLM-shape batched-Cholesky + tile-GEMM fit
504/// pipeline: 100 000 design rows processed per wall-clock second per device.
505///
506/// The original gate *claimed* this number without ever measuring it. The
507/// honest contract is the other way around: a benchmark
508/// (`examples/throughput_1412.rs`) measures the true rows/sec on a real device,
509/// and [`GpuThroughputVerdict::from_measurement`] reports whether the measured
510/// value meets the target — the verdict is a *function of the measurement*, not
511/// a hardcoded assertion. See `tests/owed_1412.rs`.
512pub const GPU_THROUGHPUT_TARGET_ROWS_PER_SEC: f64 = 100_000.0;
513
514/// Outcome of comparing a *measured* GPU throughput against the target. The
515/// only way to construct one is [`Self::from_measurement`], so a verdict can
516/// never assert a target that was not actually established by a measurement.
517#[derive(Clone, Copy, Debug, PartialEq)]
518pub struct GpuThroughputVerdict {
519    /// The measured design-rows-per-second on the device under test.
520    pub measured_rows_per_sec: f64,
521    /// The target the measurement is compared against.
522    pub target_rows_per_sec: f64,
523    /// `measured / target`. ≥ 1.0 means the target was established.
524    pub fraction_of_target: f64,
525    /// True iff `measured_rows_per_sec >= target_rows_per_sec`.
526    pub meets_target: bool,
527}
528
529impl GpuThroughputVerdict {
530    /// Build a verdict from a measured throughput against
531    /// [`GPU_THROUGHPUT_TARGET_ROWS_PER_SEC`]. A non-finite or non-positive
532    /// measurement can never meet the target (it is not a usable measurement).
533    #[inline]
534    pub fn from_measurement(measured_rows_per_sec: f64) -> Self {
535        Self::from_measurement_against(measured_rows_per_sec, GPU_THROUGHPUT_TARGET_ROWS_PER_SEC)
536    }
537
538    /// Build a verdict against an explicit target (used by tests that probe the
539    /// comparison logic without depending on the global target constant).
540    #[inline]
541    pub fn from_measurement_against(measured_rows_per_sec: f64, target_rows_per_sec: f64) -> Self {
542        let usable = measured_rows_per_sec.is_finite() && measured_rows_per_sec > 0.0;
543        let fraction_of_target = if usable && target_rows_per_sec > 0.0 {
544            measured_rows_per_sec / target_rows_per_sec
545        } else {
546            0.0
547        };
548        Self {
549            measured_rows_per_sec,
550            target_rows_per_sec,
551            fraction_of_target,
552            meets_target: usable && measured_rows_per_sec >= target_rows_per_sec,
553        }
554    }
555}
556
557/// Why a Stage-3 encode deployment decision could not be made from a real device
558/// measurement (#988, #1412). Each variant is a state in which the
559/// `100_000` rows/sec/GPU target was neither established NOR refuted on a
560/// device — the decision is blocked on hardware, not green-washed from a CPU
561/// proxy.
562#[derive(Clone, Copy, Debug, PartialEq, Eq)]
563pub enum EncodeDecisionBlocked {
564    /// No CUDA device on this host: the exact encode could not be measured on a
565    /// device at all (a CPU rate cannot substitute — that was the #1412 defect).
566    NoDevice,
567    /// A device is present but there is no device-resident *exact-encode* kernel,
568    /// so the FULL per-row encode cannot be measured on the device. (The resident
569    /// normal-equations solve in [`crate::encode_throughput`] is only ONE
570    /// component of the encode, not the encode; a component measurement cannot
571    /// decide the encode surrogate question — #988.)
572    NoDeviceEncodeKernel,
573    /// A device is present and a measurement was attempted, but the device path
574    /// did not engage (false routing) — refused rather than reported as a pass.
575    DeviceNotEngaged,
576}
577
578/// Tri-state Stage-3 encode deployment / amortized-surrogate decision
579/// (#988, #1412).
580///
581/// The decision the throughput gate exists to make is empirical: does the EXACT
582/// per-row encode clear the `100_000` rows/sec/GPU deployment target on a real
583/// device? Only a real device measurement can answer it:
584///   * [`Self::Met`] — a device measurement CLEARED the target: ship the exact
585///     encode; the certified amortized surrogate is NOT needed.
586///   * [`Self::Unmet`] — a device measurement MISSED the target: the certified
587///     amortized surrogate becomes justified.
588///   * [`Self::Undetermined`] — no device measurement is available. The decision
589///     is BLOCKED on hardware; it is neither "surrogate unneeded" nor "surrogate
590///     justified".
591///
592/// The critical anti-green-wash property (#1412): there is NO constructor that
593/// takes a CPU rate. A CPU measurement, however fast, can never move the decision
594/// out of [`Self::Undetermined`]. Projecting a CPU rate through an assumed
595/// CPU→GPU factor to declare the target met was the exact #1412 defect and is
596/// structurally impossible here — [`Self::Met`] / [`Self::Unmet`] come only from
597/// [`Self::from_device_measurement`] with `engaged == true`.
598#[derive(Clone, Copy, Debug, PartialEq)]
599pub enum EncodeDeploymentDecision {
600    /// A device measurement established the deployment target.
601    Met {
602        /// The measured device rows/sec that cleared the target.
603        measured_rows_per_sec: f64,
604        /// The target it was compared against.
605        target_rows_per_sec: f64,
606    },
607    /// A device measurement fell short of the deployment target.
608    Unmet {
609        /// The measured device rows/sec that missed the target.
610        measured_rows_per_sec: f64,
611        /// The target it was compared against.
612        target_rows_per_sec: f64,
613    },
614    /// No device measurement is available; the decision is blocked on hardware.
615    Undetermined {
616        /// Why no device measurement could be made.
617        reason: EncodeDecisionBlocked,
618    },
619}
620
621impl EncodeDeploymentDecision {
622    /// The ONLY path to a `Met`/`Unmet` decision: a device measurement that
623    /// actually engaged the device and produced a usable rate. `engaged == false`
624    /// (false routing / CPU decline) or a non-finite / non-positive rate yields
625    /// [`Self::Undetermined`] — never a fabricated pass or fail.
626    #[must_use]
627    pub fn from_device_measurement(engaged: bool, measured_rows_per_sec: f64) -> Self {
628        Self::from_device_measurement_against(
629            engaged,
630            measured_rows_per_sec,
631            GPU_THROUGHPUT_TARGET_ROWS_PER_SEC,
632        )
633    }
634
635    /// [`Self::from_device_measurement`] against an explicit target (for tests
636    /// that probe the decision logic without the global target constant).
637    #[must_use]
638    pub fn from_device_measurement_against(
639        engaged: bool,
640        measured_rows_per_sec: f64,
641        target_rows_per_sec: f64,
642    ) -> Self {
643        let usable = measured_rows_per_sec.is_finite() && measured_rows_per_sec > 0.0;
644        if !engaged || !usable {
645            return Self::Undetermined {
646                reason: EncodeDecisionBlocked::DeviceNotEngaged,
647            };
648        }
649        if measured_rows_per_sec >= target_rows_per_sec {
650            Self::Met {
651                measured_rows_per_sec,
652                target_rows_per_sec,
653            }
654        } else {
655            Self::Unmet {
656                measured_rows_per_sec,
657                target_rows_per_sec,
658            }
659        }
660    }
661
662    /// Construct the blocked decision for a host that cannot measure the exact
663    /// encode on a device. This is the honest CPU-only / no-device-kernel outcome
664    /// — the deployment target is left undetermined rather than projected.
665    #[must_use]
666    pub fn blocked(reason: EncodeDecisionBlocked) -> Self {
667        Self::Undetermined { reason }
668    }
669
670    /// True ONLY when a device measurement cleared the target: the exact encode
671    /// ships and no surrogate is built. Never true from a CPU proxy.
672    #[must_use]
673    pub fn surrogate_unneeded(&self) -> bool {
674        matches!(self, Self::Met { .. })
675    }
676
677    /// True ONLY when a device measurement missed the target: the certified
678    /// amortized surrogate becomes justified. Never true without a measurement.
679    #[must_use]
680    pub fn surrogate_justified(&self) -> bool {
681        matches!(self, Self::Unmet { .. })
682    }
683
684    /// True when no device measurement is available and the decision is blocked
685    /// on hardware (neither [`Self::surrogate_unneeded`] nor
686    /// [`Self::surrogate_justified`]).
687    #[must_use]
688    pub fn is_undetermined(&self) -> bool {
689        matches!(self, Self::Undetermined { .. })
690    }
691}
692
693/// Which `(response, link)` family the Stage 3.3 device-resident PIRLS loop
694/// can evaluate without going through the Level-B raw-body NVRTC path.
695///
696/// Mirrors `PirlsRowFamily::ALL` at the policy layer so the predicate stays
697/// linkable from the CPU PIRLS entry without dragging a Linux-only enum into
698/// every host compilation unit.
699#[derive(Clone, Copy, Debug, Eq, PartialEq)]
700pub enum PirlsLoopFamilyKind {
701    BernoulliLogit,
702    BernoulliProbit,
703    BernoulliCLogLog,
704    PoissonLog,
705    GaussianIdentity,
706    GammaLog,
707}
708
709#[derive(Clone, Copy, Debug, Eq, PartialEq)]
710pub enum PirlsLoopCurvatureKind {
711    Fisher,
712    Observed,
713}
714
715/// Inputs to [`should_run_reml_outer_on_device`]. The admission predicate
716/// for routing the *outer* REML BFGS-over-ρ loop onto a fully device-resident
717/// driver (rather than the host orchestrator that hops out per step).
718///
719/// Fields are intentionally lifted from data the CPU REML entry has on hand
720/// before it touches the seed generator or the inner P-IRLS loop, so the
721/// admission check is allocation-free and can short-circuit before any
722/// device call.
723#[derive(Clone, Copy, Debug)]
724pub struct RemlOuterAdmission {
725    /// Active design rows (post-transform).
726    pub n: usize,
727    /// Active design columns / penalised-Hessian dimension.
728    pub p: usize,
729    /// Number of smoothing parameters ρ the outer BFGS optimises over.
730    pub num_rho: usize,
731    /// Inner family / link pair the device-resident PIRLS loop can evaluate.
732    /// `None` means the family does not map onto the six JIT-cached row
733    /// kernels — the outer loop must stay on the host orchestrator because
734    /// the inner step would already hop out anyway.
735    pub family: Option<PirlsLoopFamilyKind>,
736    /// Curvature surface the inner loop will use; tied to `family` via
737    /// `pirls_loop_curvature_for`.
738    pub curvature: PirlsLoopCurvatureKind,
739    /// True when the CUDA runtime is initialised on this host.
740    pub gpu_available: bool,
741}
742
743/// Inputs to [`should_use_gpu_pirls_loop`]. Each field comes from data the
744/// CPU PIRLS entry has on hand before it touches the eigendecomposition
745/// engine, so the admission check itself is allocation-free and can short-
746/// circuit before any heavy work happens.
747#[derive(Clone, Copy, Debug)]
748pub struct PirlsLoopAdmission {
749    /// Number of rows in the active (post-transform) design matrix.
750    pub n: usize,
751    /// Number of columns in the active design (i.e. `p` of `Xᵀ X`).
752    pub p: usize,
753    /// `Some(_)` when the inner family maps onto one of the six JIT-cached
754    /// `PirlsRowFamily` variants; `None` for custom families that still
755    /// require Stage 6 Level B and have not yet been admitted here.
756    pub family: Option<PirlsLoopFamilyKind>,
757    /// Curvature surface the inner loop will use; the GPU loop has Fisher +
758    /// Observed kernels, anything else (e.g. expected-projection surrogates)
759    /// is not admitted.
760    pub curvature: PirlsLoopCurvatureKind,
761    /// True when the CUDA runtime is initialised on this host (i.e.
762    /// `GpuRuntime::global().is_some()`).
763    pub gpu_available: bool,
764}
765
766impl GpuDispatchPolicy {
767    /// Minimum design column count for the device-resident inner/outer loops.
768    ///
769    /// Below this width the per-iteration `XᵀWX + Cholesky` is dominated by
770    /// launch latency and PCIe staging rather than arithmetic, so the host LM
771    /// loop (which populates the full `PirlsResult` surface as a free
772    /// side-effect) is strictly cheaper. Shared by both the inner PIRLS and
773    /// outer REML admission predicates so they cannot drift apart.
774    pub const DEVICE_LOOP_MIN_P: usize = 32;
775
776    /// Conservative admission predicate for routing
777    /// `fit_model_for_fixed_rho_with_adaptive_kkt` through the Stage 3.3
778    /// device-resident PIRLS loop instead of the CPU LM loop.
779    ///
780    /// The threshold is the dense `XᵀWX` work estimate, not row count alone:
781    /// LLM/SAE fits can have only a few thousand rows but thousands of columns,
782    /// so `2*n*p^2` already dwarfs launch/staging overhead. Smaller fits stay on
783    /// the CPU LM loop where the full `PirlsResult` surface (firth, EDF,
784    /// per-row weights, …) is already populated as a free side-effect of the
785    /// iteration.
786    pub const fn should_use_gpu_pirls_loop(&self, adm: PirlsLoopAdmission) -> bool {
787        if !adm.gpu_available {
788            return false;
789        }
790        if !self.dense_hessian_work_target_is_gpu(adm.n, adm.p) {
791            return false;
792        }
793        match adm.family {
794            Some(_) => true,
795            None => false,
796        }
797    }
798
799    /// Admission predicate for routing the outer REML BFGS-over-ρ loop onto
800    /// a device-resident driver that keeps the BFGS state (ρ, gradient,
801    /// Hessian approx) on-device and only downloads the per-step scalar
802    /// metrics (objective value, gradient norm, convergence flag).
803    ///
804    /// The dense-work threshold piggybacks on the existing inner-PIRLS admission
805    /// predicate because the device-resident outer loop calls
806    /// `pirls_loop_on_stream` per step and must not pay the host hop for small
807    /// fits the inner loop would have rejected anyway. The
808    /// `num_rho ≥ 2` floor rules out the trivial single-smoother case where
809    /// host orchestration is already negligible and the device BFGS state
810    /// (one length-`num_rho` gradient + a `num_rho × num_rho` Hessian
811    /// approx) collapses to a couple of scalars not worth keeping on device.
812    pub const fn should_run_reml_outer_on_device(&self, adm: RemlOuterAdmission) -> bool {
813        if !adm.gpu_available {
814            return false;
815        }
816        if !self.dense_hessian_work_target_is_gpu(adm.n, adm.p) {
817            return false;
818        }
819        if adm.num_rho < 2 {
820            return false;
821        }
822        match adm.family {
823            Some(_) => true,
824            None => false,
825        }
826    }
827}
828
829#[cfg(test)]
830mod refinement_policy_tests {
831    use super::*;
832
833    #[test]
834    fn refinement_policy_admits_large_p() {
835        let pol = GpuDispatchPolicy::default();
836        // Default policy is Refinement; large p should be admitted.
837        assert!(pol.iterative_refinement_should_attempt(512));
838        assert!(pol.iterative_refinement_should_attempt(GpuDispatchPolicy::REFINEMENT_MIN_P));
839    }
840
841    #[test]
842    fn refinement_policy_rejects_small_p() {
843        let pol = GpuDispatchPolicy::default();
844        assert!(!pol.iterative_refinement_should_attempt(GpuDispatchPolicy::REFINEMENT_MIN_P - 1));
845        assert!(!pol.iterative_refinement_should_attempt(0));
846    }
847
848    #[test]
849    fn off_policy_never_attempts_refinement() {
850        let pol = GpuDispatchPolicy {
851            mixed_precision: GpuMixedPrecisionPolicy::Off,
852            ..Default::default()
853        };
854        assert!(!pol.iterative_refinement_should_attempt(1024));
855    }
856
857    #[test]
858    fn never_policy_never_attempts_refinement() {
859        let pol = GpuDispatchPolicy {
860            mixed_precision: GpuMixedPrecisionPolicy::Never,
861            ..Default::default()
862        };
863        assert!(!pol.iterative_refinement_should_attempt(1024));
864    }
865}
866
867#[cfg(test)]
868mod reduced_schur_matvec_offload_tests {
869    use super::*;
870
871    /// The LLM/SAE shape the whole #1017 Phase-1 re-keying targets: a few
872    /// thousand row blocks, a *wide* border (decoder atom count in the
873    /// thousands), a modest per-row frame depth, and a realistic CG budget.
874    /// The row-count gate (50k) and the dense-Direct flop floor both miss this
875    /// "thousands of tiny dense ops" shape; the work-amortised matvec gate must
876    /// fire on it.
877    #[test]
878    fn admits_llm_sae_matvec_shape() {
879        let pol = GpuDispatchPolicy::default();
880        // n≈2000 rows, k≈2048 atoms, M≈8 frame depth — n is far below the 50k
881        // row gate, yet the summed CG matvec work is large.
882        assert!(pol.reduced_schur_matvec_should_offload(
883            2_000,
884            2_048,
885            8,
886            GpuDispatchPolicy::MATVEC_OFFLOAD_MIN_CG_ITERS,
887        ));
888        // The same shape would be rejected by the row-count-style dense gate,
889        // confirming the re-keying is what admits it.
890        assert!(!pol.dense_hessian_work_target_is_gpu(2_000, 8));
891    }
892
893    /// Even with only a single conservative CG iteration the wide LLM border
894    /// clears the breakeven (the per-apply work alone is `2_000·(2·8·2_048 +
895    /// 8²) ≈ 6.6e7` flops > 1e7 by the conservative `n·(2·d·k + d²)` model;
896    /// the true `n·(4·d·k + d²)` arithmetic is ≈1.3e8),
897    /// so the gate is not relying on an inflated iteration count.
898    #[test]
899    fn admits_llm_shape_with_one_cg_iter() {
900        let pol = GpuDispatchPolicy::default();
901        assert!(pol.reduced_schur_matvec_should_offload(2_000, 2_048, 8, 1));
902    }
903
904    /// #1783: the primary manifold-SAE regime is a `d_atom = 1` curve
905    /// dictionary.  Its scalar row frames have much lower staging cost than the
906    /// general framed matvec, so realistic token blocks must not be stranded on
907    /// the CPU merely because the conservative admission lower bound is thin in
908    /// `d`.
909    #[test]
910    fn admits_thin_curve_atoms_at_realistic_scale() {
911        let pol = GpuDispatchPolicy::default();
912        assert!(pol.reduced_schur_matvec_should_offload(24_576, 64, 1, 1));
913        assert!(pol.reduced_schur_matvec_should_offload(40_456, 256, 1, 1));
914        assert!(!pol.reduced_schur_matvec_should_offload(300, 6, 1, 8));
915    }
916
917    /// Tiny shapes where the host↔device transfer dominates must stay on the
918    /// CPU: a handful of rows, a narrow border, shallow frames. The summed
919    /// matvec work is orders of magnitude below the staging breakeven.
920    #[test]
921    fn rejects_tiny_shape_where_transfer_dominates() {
922        let pol = GpuDispatchPolicy::default();
923        assert!(!pol.reduced_schur_matvec_should_offload(
924            30,
925            8,
926            2,
927            GpuDispatchPolicy::MATVEC_OFFLOAD_MIN_CG_ITERS,
928        ));
929        // The 300×8 shape the production seam tests use as the "stay CPU"
930        // canary is rejected here too.
931        assert!(!pol.reduced_schur_matvec_should_offload(300, 8, 4, 16));
932    }
933
934    /// A narrow border (k below the device-loop floor) is rejected regardless
935    /// of how much row/iteration work is piled on: per-apply launch latency
936    /// dominates a sub-`DEVICE_LOOP_MIN_P` border.
937    #[test]
938    fn rejects_narrow_border_even_with_huge_row_count() {
939        let pol = GpuDispatchPolicy::default();
940        let narrow = GpuDispatchPolicy::DEVICE_LOOP_MIN_P - 1;
941        assert!(!pol.reduced_schur_matvec_should_offload(1_000_000, narrow, 64, 64));
942    }
943
944    /// Degenerate dimensions are never offloaded (no work, or no solve).
945    #[test]
946    fn rejects_degenerate_dimensions() {
947        let pol = GpuDispatchPolicy::default();
948        assert!(!pol.reduced_schur_matvec_should_offload(0, 2_048, 8, 8));
949        assert!(!pol.reduced_schur_matvec_should_offload(2_000, 0, 8, 8));
950        assert!(!pol.reduced_schur_matvec_should_offload(2_000, 2_048, 0, 8));
951        assert!(!pol.reduced_schur_matvec_should_offload(2_000, 2_048, 8, 0));
952    }
953
954    /// The gate is monotone in the CG budget: once a shape is admitted at a
955    /// given iteration count it stays admitted for any larger count (more
956    /// applies over the same resident frames only improves amortization), and
957    /// a borderline shape crosses the breakeven as iterations grow.
958    #[test]
959    fn monotone_in_cg_iters() {
960        let pol = GpuDispatchPolicy::default();
961        // A border at the floor with shallow frames and few rows: per-apply
962        // work ~ n·(2·d·k + d²). Choose a shape that is below breakeven at 1
963        // iter but above it once enough iterations accumulate.
964        let (n, k, d) = (200usize, GpuDispatchPolicy::DEVICE_LOOP_MIN_P, 4usize);
965        // per_apply ≈ 200·(2·4·32 + 16) = 200·272 = 54_400 flops.
966        assert!(!pol.reduced_schur_matvec_should_offload(n, k, d, 1));
967        // Once the summed work clears 1e7 the gate fires; ~184 iters here.
968        assert!(pol.reduced_schur_matvec_should_offload(n, k, d, 1_000));
969        // Monotonicity: admitted at 1_000 ⇒ admitted at every larger budget.
970        assert!(pol.reduced_schur_matvec_should_offload(n, k, d, 5_000));
971    }
972
973    /// The admission lower bound must stay strictly below the true per-apply
974    /// work `n·(4·d·k + d²)` for any non-degenerate cross-block shape (it drops
975    /// the transpose GEMV). Treating the lower bound as a flop count would
976    /// over-report device speedups, so this asserts the gap is real.
977    #[test]
978    fn admission_lower_bound_undercounts_actual_work() {
979        for &(n, k, d) in &[
980            (2_000usize, 2_048usize, 8usize),
981            (200, GpuDispatchPolicy::DEVICE_LOOP_MIN_P, 4),
982            (1, 1, 1),
983        ] {
984            let lower = GpuDispatchPolicy::admission_work_lower_bound(n, k, d);
985            // True per-apply work models the full forward+transpose GEMV pair
986            // plus the d×d solve: n·(4·d·k + d²).
987            let actual = (n as u128) * (4 * (d as u128) * (k as u128) + (d as u128) * (d as u128));
988            assert!(
989                lower < actual,
990                "admission lower bound {lower} must undercount actual work {actual} for ({n},{k},{d})"
991            );
992        }
993    }
994}
995
996#[cfg(test)]
997mod arrow_border_solve_plan_tests {
998    use super::*;
999
1000    /// The #1017 color arm — few rows, shallow per-row depth, a very wide border
1001    /// (`k = 15360 = 3 × 5120`). The dense `k³/3` Cholesky (`≈ 1.2e12` flops)
1002    /// dwarfs a matrix-free PCG solve at any realistic CG budget, and the border
1003    /// is grossly rank-deficient (`n·d = 360 ≪ k`). The plan must recommend
1004    /// `ReducedIterative` and flag the rank deficiency.
1005    #[test]
1006    fn color_arm_recommends_reduced_iterative_and_flags_rank_deficiency() {
1007        let pol = GpuDispatchPolicy::default();
1008        let plan = pol.arrow_border_solve_plan(180, 15_360, 2, 30);
1009        assert_eq!(plan.recommended, ArrowBorderStrategy::ReducedIterative);
1010        assert!(plan.dense_border_rank_deficient);
1011        assert_eq!(plan.data_fit_rank, 360);
1012        // The dense path is orders of magnitude more expensive here.
1013        assert!(plan.dense_direct_flops > plan.reduced_iterative_flops * 100);
1014        // The recommended (iterative) path is device-favorable at this shape:
1015        // the wide border × summed CG work clears the matvec offload floor.
1016        assert!(plan.device_favorable);
1017    }
1018
1019    /// A modest, near-square border where the data-fit rank is comparable to `k`
1020    /// and the `k³/3` Cholesky is cheap: dense Direct is the right call.
1021    #[test]
1022    fn small_square_border_recommends_dense_direct() {
1023        let pol = GpuDispatchPolicy::default();
1024        // n·d = 400 > k = 64: not rank-deficient; a 64³/3 Cholesky is trivial.
1025        let plan = pol.arrow_border_solve_plan(200, 64, 2, 8);
1026        assert_eq!(plan.recommended, ArrowBorderStrategy::DenseDirect);
1027        assert!(!plan.dense_border_rank_deficient);
1028        assert_eq!(plan.data_fit_rank, 64);
1029    }
1030
1031    /// The rank-deficiency flag is exactly `n·d < k`, and `data_fit_rank` is
1032    /// clamped at `k` (the border can carry no more than `k` data directions).
1033    #[test]
1034    fn rank_flag_and_clamp_track_n_d_versus_k() {
1035        let pol = GpuDispatchPolicy::default();
1036        // n·d == k exactly: full-rank border, not deficient.
1037        let exact = pol.arrow_border_solve_plan(50, 100, 2, 8);
1038        assert!(!exact.dense_border_rank_deficient);
1039        assert_eq!(exact.data_fit_rank, 100);
1040        // n·d one below k: deficient.
1041        let deficient = pol.arrow_border_solve_plan(49, 100, 2, 8);
1042        assert!(deficient.dense_border_rank_deficient);
1043        assert_eq!(deficient.data_fit_rank, 98);
1044    }
1045
1046    /// The recommendation is monotone toward `ReducedIterative` as the border
1047    /// widens at fixed row work: once the dense `k³` term overtakes the linear-
1048    /// in-`k` iterative cost, growing `k` keeps it recommending iterative.
1049    #[test]
1050    fn wider_border_only_moves_toward_iterative() {
1051        let pol = GpuDispatchPolicy::default();
1052        let narrow = pol.arrow_border_solve_plan(200, 128, 4, 16);
1053        let wide = pol.arrow_border_solve_plan(200, 8_192, 4, 16);
1054        // The wide border must recommend iterative.
1055        assert_eq!(wide.recommended, ArrowBorderStrategy::ReducedIterative);
1056        // If the narrow one already recommends iterative, the wide one still
1057        // does (monotone); if not, the wide one is a strict switch. Either way
1058        // the wide border's dense/iterative flop ratio exceeds the narrow one's.
1059        let narrow_ratio = narrow.dense_direct_flops as f64 / narrow.reduced_iterative_flops as f64;
1060        let wide_ratio = wide.dense_direct_flops as f64 / wide.reduced_iterative_flops as f64;
1061        assert!(wide_ratio > narrow_ratio);
1062    }
1063
1064    /// A larger CG budget makes the iterative path more expensive, so the
1065    /// crossover can only move toward `DenseDirect`, never away from it. If a
1066    /// shape is `DenseDirect` at a small budget it stays `DenseDirect` at a
1067    /// larger one.
1068    #[test]
1069    fn larger_cg_budget_never_switches_away_from_dense() {
1070        let pol = GpuDispatchPolicy::default();
1071        let shape = (200usize, 96usize, 3usize);
1072        let small = pol.arrow_border_solve_plan(shape.0, shape.1, shape.2, 4);
1073        let large = pol.arrow_border_solve_plan(shape.0, shape.1, shape.2, 400);
1074        if small.recommended == ArrowBorderStrategy::DenseDirect {
1075            assert_eq!(large.recommended, ArrowBorderStrategy::DenseDirect);
1076        }
1077        assert!(large.reduced_iterative_flops >= small.reduced_iterative_flops);
1078    }
1079
1080    /// Degenerate shapes yield an all-zero plan on the trivial `DenseDirect`
1081    /// path and are never device-favorable.
1082    #[test]
1083    fn degenerate_shapes_are_trivial_dense_and_not_device_favorable() {
1084        let pol = GpuDispatchPolicy::default();
1085        for shape in [(0usize, 100usize, 2usize), (100, 0, 2), (100, 100, 0)] {
1086            let plan = pol.arrow_border_solve_plan(shape.0, shape.1, shape.2, 8);
1087            assert_eq!(plan.recommended, ArrowBorderStrategy::DenseDirect);
1088            assert!(!plan.device_favorable);
1089            assert_eq!(plan.dense_direct_flops, 0);
1090            assert_eq!(plan.reduced_iterative_flops, 0);
1091        }
1092    }
1093
1094    /// A zero CG budget is treated as one apply (a plan must still be
1095    /// comparable), never a divide-by-zero or an all-free iterative path.
1096    #[test]
1097    fn zero_cg_budget_is_treated_as_one_apply() {
1098        let pol = GpuDispatchPolicy::default();
1099        let plan = pol.arrow_border_solve_plan(180, 15_360, 2, 0);
1100        assert_eq!(plan.cg_iters, 1);
1101        assert!(plan.reduced_iterative_flops > 0);
1102    }
1103}
1104
1105#[cfg(test)]
1106mod encode_deployment_decision_tests {
1107    use super::*;
1108
1109    /// #1412 anti-green-wash core: a CPU rate can NEVER produce a `Met`/`Unmet`
1110    /// decision. The only Met/Unmet constructor requires `engaged == true`; a
1111    /// CPU-only host has no device measurement, so it can only ever be
1112    /// `Undetermined`, no matter how fast the CPU is.
1113    #[test]
1114    fn cpu_rate_can_never_meet_or_refute_the_target() {
1115        // Even a CPU rate a thousand times the target cannot certify the gate:
1116        // there is simply no `from_cpu_measurement` — the type has no such door.
1117        // The blocked constructor is the only CPU-side option.
1118        let cpu_only = EncodeDeploymentDecision::blocked(EncodeDecisionBlocked::NoDevice);
1119        assert!(cpu_only.is_undetermined());
1120        assert!(!cpu_only.surrogate_unneeded());
1121        assert!(!cpu_only.surrogate_justified());
1122
1123        // A "device" measurement that did not engage (false routing) is refused —
1124        // it becomes Undetermined even with a huge rate.
1125        let false_routed = EncodeDeploymentDecision::from_device_measurement(false, 1.0e9);
1126        assert!(false_routed.is_undetermined());
1127        assert!(!false_routed.surrogate_unneeded());
1128    }
1129
1130    #[test]
1131    fn engaged_measurement_decides_by_the_number() {
1132        let target = GPU_THROUGHPUT_TARGET_ROWS_PER_SEC;
1133        // Clears the target => Met => surrogate unneeded.
1134        let met = EncodeDeploymentDecision::from_device_measurement(true, target * 2.0);
1135        assert!(matches!(met, EncodeDeploymentDecision::Met { .. }));
1136        assert!(met.surrogate_unneeded());
1137        assert!(!met.surrogate_justified());
1138        assert!(!met.is_undetermined());
1139
1140        // Misses the target => Unmet => surrogate justified.
1141        let unmet = EncodeDeploymentDecision::from_device_measurement(true, target * 0.25);
1142        assert!(matches!(unmet, EncodeDeploymentDecision::Unmet { .. }));
1143        assert!(unmet.surrogate_justified());
1144        assert!(!unmet.surrogate_unneeded());
1145
1146        // Exact boundary meets the target.
1147        let boundary = EncodeDeploymentDecision::from_device_measurement(true, target);
1148        assert!(boundary.surrogate_unneeded());
1149    }
1150
1151    #[test]
1152    fn engaged_but_non_usable_rate_is_undetermined_not_a_pass() {
1153        for bad in [0.0, -1.0, f64::NAN, f64::INFINITY] {
1154            let d = EncodeDeploymentDecision::from_device_measurement(true, bad);
1155            assert!(
1156                d.is_undetermined(),
1157                "an engaged-but-unusable rate {bad} must be Undetermined, not a decision"
1158            );
1159            assert!(!d.surrogate_unneeded());
1160            assert!(!d.surrogate_justified());
1161        }
1162    }
1163
1164    #[test]
1165    fn blocked_reasons_are_all_undetermined() {
1166        for reason in [
1167            EncodeDecisionBlocked::NoDevice,
1168            EncodeDecisionBlocked::NoDeviceEncodeKernel,
1169            EncodeDecisionBlocked::DeviceNotEngaged,
1170        ] {
1171            let d = EncodeDeploymentDecision::blocked(reason);
1172            assert!(d.is_undetermined());
1173            assert!(!d.surrogate_unneeded());
1174            assert!(!d.surrogate_justified());
1175        }
1176    }
1177}