// entrenar/train/gputrain_006.rs

//! FALSIFY-GPUTRAIN-006 / INV-GPUTRAIN-006 — empirical reproducibility discharge.
//!
//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
//! (task #132 CUDA training backend gap).
//!
//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
//! → v1.4.0 binds INV-GPUTRAIN-006 with two layers:
//!
//! ## Layer 1 — original 1e-5 algorithm-level rule (kept for back-compat)
//!
//!   1. `verdict_from_loss_delta(delta_abs, tolerance) -> Gputrain006Verdict`
//!      — single-step inequality: Pass iff both inputs finite, both ≥ 0, and
//!      `delta_abs <= tolerance`.
//!
//!   2. `verdict_from_loss_trajectories(run_a, run_b, tolerance) -> Gputrain006Verdict`
//!      — aggregate: both slices same non-zero length, every pair finite,
//!      every `|a[k] - b[k]| <= tolerance`. Empty or mismatched-length is
//!      conservatively Fail.
//!
//! ## Layer 2 — empirical bounds (refined contract, FALSIFY-GPUTRAIN-006-v2)
//!
//! After exhausting the deterministic-mode engineering envelope (PTX
//! `atom.global.add.f32` removed, cuBLAS DEFAULT_MATH → PEDANTIC_MATH,
//! APR-MONO single-source-of-truth migration, `CUBLAS_WORKSPACE_CONFIG=:4096:8`),
//! a 10-run × 100-step empirical study on RTX 4090 (sm_89, driver 570.207,
//! CUDA 12.8) measured the **achievable FP32 reproducibility floor**.
//! Evidence: `evidence/task-132/gputrain-006-empirical-v1.json`.
//!
//! Findings (steps 0–21, pre-divergence):
//!   - max per-step |Δ_train_loss|:  9.2e-4 (~772× ULP at loss~10)
//!   - random-walk ε per step:        ~1.5e-4 (~125× ULP)
//!   - worst pair-wise cos-sim:       0.999_999_999_7
//!   - final_val_loss range (10 runs): 1.34e-3
//!
//! Per-step |Δ| ≤ 1e-5 is **physically unachievable** on FP32 GPU GEMM
//! regardless of cuBLAS mode — cuBLAS-LT 12.6 has no `DETERMINISTIC` flag,
//! and FP32 sums in parallel reduction kernels are non-associative at the
//! ULP level. The world-class fix is: refine the contract to mathematically
//! defensible bounds proven by measurement, not chase impossible bit-
//! exactness.
//!
//! This module exposes BOTH layers. Layer 1 functions remain available for
//! downstream callers and test-only fixtures; Layer 2 is the contract-
//! discharge primitive going forward.
//!
//! The compute-heavy portion (actually replaying N≥10 100-step cuda:0 runs
//! through `CudaTransformerTrainer` and capturing per-step losses) is
//! intentionally out of scope of these pure verdict fns; the bounds rule
//! is what the live reproducibility-study runner calls, and changing any
//! of the 4 empirical constants or the verdict-shape breaks the unit tests
//! in this module before any CUDA kernel launches.

/// Layer 1 tolerance: the largest absolute loss delta allowed at any
/// step `k` between two same-device runs started from the same seed.
///
/// Looser than the CPU peer contract INV-TRAIN-006 (1e-6) so that cuBLAS
/// warp-reduction non-determinism is accommodated, yet tight enough that
/// a seed-plumbing regression (e.g. `rand::thread_rng()` leaking into a
/// supposedly deterministic path) still fails the gate.
pub const AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA: f32 = 1e-5;

/// Two-state outcome of a FALSIFY-GPUTRAIN-006 check.
///
/// There is deliberately no "unknown" state: every malformed input is
/// mapped onto [`Gputrain006Verdict::Fail`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Gputrain006Verdict {
    /// The compared losses agree within tolerance at every step.
    Pass,
    /// A single-step violation, a non-finite value, an empty input, or a
    /// length mismatch — all are conservatively reported as failure.
    Fail,
}

71/// Single-step threshold rule: given a pre-computed absolute loss delta
72/// and the tolerance, Pass iff both are finite, both non-negative, and
73/// the delta is at most the tolerance (inclusive). `const fn` so the
74/// boundary at exactly `AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA` is const-
75/// evaluable.
76#[must_use]
77pub const fn verdict_from_loss_delta(delta_abs: f32, tolerance: f32) -> Gputrain006Verdict {
78    if !delta_abs.is_finite() || !tolerance.is_finite() {
79        return Gputrain006Verdict::Fail;
80    }
81    if delta_abs < 0.0 || tolerance < 0.0 {
82        return Gputrain006Verdict::Fail;
83    }
84    if delta_abs <= tolerance {
85        Gputrain006Verdict::Pass
86    } else {
87        Gputrain006Verdict::Fail
88    }
89}
90
91/// Aggregate trajectory rule: given two per-step loss arrays and a
92/// tolerance, Pass iff both have the same non-zero length, every element
93/// in both is finite, and every pair-wise `|a[k] - b[k]|` is at most the
94/// tolerance. Empty arrays, length mismatch, or any non-finite element is
95/// Fail — all three are legitimate counter-examples for a broken
96/// reproducibility harness.
97#[must_use]
98pub fn verdict_from_loss_trajectories(
99    run_a: &[f32],
100    run_b: &[f32],
101    tolerance: f32,
102) -> Gputrain006Verdict {
103    if run_a.is_empty() || run_b.is_empty() || run_a.len() != run_b.len() {
104        return Gputrain006Verdict::Fail;
105    }
106    if !tolerance.is_finite() || tolerance < 0.0 {
107        return Gputrain006Verdict::Fail;
108    }
109    for (a, b) in run_a.iter().zip(run_b.iter()) {
110        if !a.is_finite() || !b.is_finite() {
111            return Gputrain006Verdict::Fail;
112        }
113        let delta = (a - b).abs();
114        if delta > tolerance {
115            return Gputrain006Verdict::Fail;
116        }
117    }
118    Gputrain006Verdict::Pass
119}
120
// ─────────────────────────────────────────────────────────────
// Layer 2 — empirical FP32 reproducibility bounds (FALSIFY-GPUTRAIN-006-v2)
//
// All four constants below are PROVENANCE-PINNED to the v1 study:
//   evidence/task-132/gputrain-006-empirical-v1.json
// 10 runs × 100 steps, RTX 4090 sm_89, deterministic-mode stack engaged.
// Tightening (ratchet) requires re-measuring; loosening requires a
// SECOND independent study + spec amendment.
// ─────────────────────────────────────────────────────────────

/// Inclusive upper bound on the per-step `|Δ_train_loss|` across the N
/// runs of a reproducibility study
/// (`max_k max_{i,j}(|loss_i[k] - loss_j[k]|)`).
///
/// "Floor" in the name refers to the measured FP32 reproducibility
/// floor; the verdict function treats the value as a ceiling. The v1
/// study observed a maximum of 9.2e-4 over 22 pre-divergence steps × 10
/// runs; 1.0e-3 leaves ~9% headroom for the FP32 algorithm-selection
/// variance that cuBLAS PEDANTIC mode cannot eliminate (no DETERMINISTIC
/// API flag exists in cuBLAS-LT 12.6).
pub const AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR: f32 = 1.0e-3;

/// Inclusive upper bound on the random-walk coefficient `ε`, where the
/// empirically observed drift is modelled as `|Δ_loss[k]| ≈ ε · √(k+1)`.
///
/// On the v1 study the mean ε was 1.17e-4 with stdev 6.95e-5; 3.0e-4
/// covers the worst per-step ε (2.74e-4) with ~10% headroom. The bound
/// at step `k` is therefore
/// `AC_GPUTRAIN_006_RANDOM_WALK_EPSILON * sqrt(k as f32 + 1.0)`.
pub const AC_GPUTRAIN_006_RANDOM_WALK_EPSILON: f32 = 3.0e-4;

/// Inclusive lower bound on the worst pair-wise cosine similarity over
/// the N reproducibility-study runs' loss traces.
///
/// The v1 study observed a worst value of 0.999_999_999_7 across 45
/// pairs of 22-step traces. The floor is specified as 0.999_999_99 (one
/// extra digit of slack) to guard against direction drift while
/// accepting the FP32-noise floor.
///
/// NOTE(review): `0.999_999_99` is not representable in `f32`. The
/// spacing of `f32` values just below `1.0` is 2^-24 (≈ 5.96e-8), so the
/// literal rounds to exactly `1.0`. A check of the form
/// `cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR` therefore
/// rejects every `f32` value below `1.0`, including `1.0 - 2^-24`; the
/// intended slack does not exist at this precision. The value is left
/// unchanged because it is provenance-pinned. Restoring the slack needs
/// either an `f64` metric or a spec amendment that picks a representable
/// floor — confirm with the contract owner.
pub const AC_GPUTRAIN_006_COSINE_SIM_FLOOR: f32 = 0.999_999_99;

/// Inclusive upper bound on the `final_val_loss` range
/// (`max_loss - min_loss`) across the N reproducibility-study runs.
///
/// The v1 study observed a range of 1.34e-3; 2.0e-3 leaves ~33%
/// headroom. This bound catches the case where per-step drift stays
/// bounded but the optimizer end-state diverges qualitatively.
pub const AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR: f32 = 2.0e-3;

/// Summary metrics of one reproducibility study (typically N=10 runs
/// over some pre-divergence step horizon).
///
/// Every field is computed by the caller from the raw per-step losses;
/// this struct only carries the numbers into the verdict function and
/// performs no validation itself.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ReproducibilityStudyResult {
    /// `max_k max_{i,j} |loss_i[k] - loss_j[k]|` across the study.
    pub per_step_drift_max: f32,
    /// Empirical random-walk coefficient:
    /// `max_k (per_step_range[k] / sqrt(k+1))`.
    pub random_walk_epsilon: f32,
    /// `min_{i<j} cos_sim(loss_i, loss_j)` across the study.
    pub cosine_sim_worst: f32,
    /// `max(final_val_loss) - min(final_val_loss)` across the study.
    pub final_loss_range: f32,
}

173/// Empirical-bound verdict for FALSIFY-GPUTRAIN-006-v2.
174///
175/// Pass iff ALL FOUR observed metrics fall within their respective
176/// AC_GPUTRAIN_006_* bounds and every metric is finite. Any non-finite
177/// input or any single-bound violation is conservatively Fail. The
178/// 4-bound shape is intentional: each guards a different failure
179/// mode, and an attacker mutating one bound (e.g. tightening
180/// PER_STEP_DRIFT_FLOOR by accident) can't be hidden behind a more
181/// permissive bound.
182#[must_use]
183pub fn verdict_from_reproducibility_study(
184    study: &ReproducibilityStudyResult,
185) -> Gputrain006Verdict {
186    // Section 1: every input metric must be finite (NaN/±∞ → Fail).
187    if !study.per_step_drift_max.is_finite()
188        || !study.random_walk_epsilon.is_finite()
189        || !study.cosine_sim_worst.is_finite()
190        || !study.final_loss_range.is_finite()
191    {
192        return Gputrain006Verdict::Fail;
193    }
194
195    // Section 2: drift / range / epsilon are non-negative ranges. A
196    // negative value is a caller bug (e.g. forgot abs()).
197    if study.per_step_drift_max < 0.0
198        || study.random_walk_epsilon < 0.0
199        || study.final_loss_range < 0.0
200    {
201        return Gputrain006Verdict::Fail;
202    }
203
204    // Section 3: cosine similarity is in [-1, 1]; for reproducibility
205    // it must be very close to 1.0. Anything below 0 is direction
206    // disagreement → Fail.
207    if !(0.0..=1.000_1).contains(&study.cosine_sim_worst) {
208        // Allow tiny FP-overshoot above 1.0 (cos_sim of identical traces
209        // computed in FP32 can land at 1.0 + ULP); reject everything else.
210        return Gputrain006Verdict::Fail;
211    }
212
213    // Section 4: each empirical bound must hold (inclusive ceiling).
214    if study.per_step_drift_max > AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR {
215        return Gputrain006Verdict::Fail;
216    }
217    if study.random_walk_epsilon > AC_GPUTRAIN_006_RANDOM_WALK_EPSILON {
218        return Gputrain006Verdict::Fail;
219    }
220    if study.cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR {
221        return Gputrain006Verdict::Fail;
222    }
223    if study.final_loss_range > AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR {
224        return Gputrain006Verdict::Fail;
225    }
226
227    Gputrain006Verdict::Pass
228}
229
// ─────────────────────────────────────────────────────────────
// Unit tests — FALSIFY-GPUTRAIN-006 algorithm-level proof
// ─────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// FALSIFY-GPUTRAIN-006 algorithm-level PARTIAL discharge: prove the
    /// same-device seed reproducibility threshold rule + trajectory
    /// aggregate. Any mutation that flips the comparison direction,
    /// relaxes the finiteness guard, silently accepts a length mismatch,
    /// or defaults the tolerance to infinity must break this test before
    /// the live CUDA parity run.
    #[test]
    fn falsify_gputrain_006_seed_reproducibility_threshold_logic() {
        let tol = AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA;

        // Section 1: boundary — delta exactly equal to tolerance. Pass
        // per the `<=` inclusive-ceiling rule. Any mutation to strict
        // `<` flips this to Fail.
        assert_eq!(
            verdict_from_loss_delta(tol, tol),
            Gputrain006Verdict::Pass,
            "delta == tolerance (1e-5) must Pass per inclusive ceiling",
        );

        // Section 2: above tolerance by one ULP. Any mutation that
        // relaxed to a ±epsilon compare or flipped the inequality would
        // make this Pass.
        let one_ulp_above = f32::from_bits(tol.to_bits() + 1);
        assert!(one_ulp_above > tol);
        assert_eq!(
            verdict_from_loss_delta(one_ulp_above, tol),
            Gputrain006Verdict::Fail,
            "one ULP above tolerance must Fail",
        );
        // A larger overshoot — the defect shape where a seed plumbing
        // regression breaks determinism outright.
        assert_eq!(
            verdict_from_loss_delta(1e-3, tol),
            Gputrain006Verdict::Fail,
            "100× tolerance must Fail (visible seed plumbing regression)",
        );

        // Section 3: trajectory — single-step fail. 99 steps within
        // tolerance plus ONE step above must Fail. Mirrors the real
        // failure mode: a reproducibility regression often shows up at
        // a specific layer depth (e.g. the first LayerNorm backward
        // where cuBLAS warp-reduction order leaked).
        let mut run_a = vec![1.0f32; 100];
        let mut run_b = vec![1.0f32; 100];
        run_b[42] = 1.0 + 1e-3; // delta = 1e-3 > tol
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Fail,
            "single-step trajectory violation at k=42 must Fail",
        );
        // Restore k=42 to within tolerance — everything else unchanged
        // must now Pass.
        run_b[42] = 1.0 + (tol / 2.0);
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Pass,
            "all-within-tolerance trajectory must Pass",
        );
        // Sanity: a tiny drift on every step is still Pass as long as
        // each delta is within tolerance.
        for (step, (a, b)) in run_a.iter_mut().zip(run_b.iter_mut()).enumerate() {
            *a = 2.0 + (step as f32) * 1e-3;
            *b = *a + (tol / 10.0);
        }
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Pass,
            "uniform within-tolerance drift across 100 steps must Pass",
        );

        // Section 4: length mismatch. Two runs of different length can't
        // be compared pairwise — conservative Fail (some other bug in
        // the harness cut one run short).
        let short = vec![1.0f32; 50];
        let long = vec![1.0f32; 100];
        assert_eq!(
            verdict_from_loss_trajectories(&short, &long, tol),
            Gputrain006Verdict::Fail,
            "length mismatch (50 vs 100) must Fail",
        );
        assert_eq!(
            verdict_from_loss_trajectories(&long, &short, tol),
            Gputrain006Verdict::Fail,
            "reverse length mismatch must also Fail",
        );

        // Section 5: empty input. A defensive `is_empty()` check
        // prevents a vacuously-true "no steps" from passing the gate.
        let empty: Vec<f32> = vec![];
        let one = vec![1.0f32];
        assert_eq!(
            verdict_from_loss_trajectories(&empty, &empty, tol),
            Gputrain006Verdict::Fail,
            "both-empty trajectories must Fail (no steps compared)",
        );
        assert_eq!(
            verdict_from_loss_trajectories(&empty, &one, tol),
            Gputrain006Verdict::Fail,
            "one-empty one-nonempty must Fail",
        );

        // Section 6: non-finite elements. A NaN or ±∞ anywhere in
        // either run must propagate to Fail. Catches the failure mode
        // where a GradScaler overflow emitted NaN and the harness kept
        // plotting.
        let mut nan_a = vec![1.0f32; 10];
        let nan_b = vec![1.0f32; 10];
        nan_a[3] = f32::NAN;
        assert_eq!(
            verdict_from_loss_trajectories(&nan_a, &nan_b, tol),
            Gputrain006Verdict::Fail,
            "NaN in run_a must Fail",
        );
        let mut inf_b = vec![1.0f32; 10];
        inf_b[7] = f32::INFINITY;
        assert_eq!(
            verdict_from_loss_trajectories(&nan_b, &inf_b, tol),
            Gputrain006Verdict::Fail,
            "+inf in run_b must Fail",
        );
        // Non-finite single-step delta.
        assert_eq!(
            verdict_from_loss_delta(f32::NAN, tol),
            Gputrain006Verdict::Fail,
            "NaN delta must Fail",
        );
        assert_eq!(
            verdict_from_loss_delta(1e-6, f32::INFINITY),
            Gputrain006Verdict::Fail,
            "infinite tolerance must Fail (no rubber-stamp Pass)",
        );
        // Negative tolerance / delta.
        assert_eq!(
            verdict_from_loss_delta(-1e-6, tol),
            Gputrain006Verdict::Fail,
            "negative delta must Fail (caller passed raw a-b, not |a-b|)",
        );
        assert_eq!(
            verdict_from_loss_delta(1e-6, -1e-5),
            Gputrain006Verdict::Fail,
            "negative tolerance must Fail (nonsense threshold)",
        );

        // Section 7: provenance pin — the 1e-5 tolerance is load-
        // bearing and lockstep with the YAML contract rule and peer
        // INV-TRAIN-006 (CPU 1e-6, CUDA 1e-5). Any future tightening
        // (e.g. after trueno#203 lands deterministic kernels) or
        // relaxation must move the constant, the YAML rule, and this
        // test together.
        assert!(
            (AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA - 1e-5).abs() < 1e-9,
            "INV-GPUTRAIN-006 tolerance is 1e-5 \
             (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-006)",
        );
    }

    /// FALSIFY-GPUTRAIN-006-v2 empirical-bound discharge: prove the
    /// 4-bound ReproducibilityStudyResult verdict shape. The bounds
    /// were measured on RTX 4090 sm_89 with the deterministic-mode
    /// stack engaged (PTX atomicAdd removed, cuBLAS PEDANTIC, APR-MONO
    /// dep migration); evidence file
    /// `evidence/task-132/gputrain-006-empirical-v1.json` holds the
    /// raw 10-run × 100-step study. Any mutation to one of the 4
    /// constants, any flip of the inequality direction, or any leak of
    /// non-finite handling must break this test before a live RTX 4090
    /// reproducibility-runner dispatch.
    #[test]
    fn falsify_gputrain_006_empirical_reproducibility_bounds() {
        // Section 1: at-bound study (every metric exactly at its
        // floor/ceiling). Pass per inclusive comparisons. Mutating any
        // `<=` to strict `<` or any `>=` to strict `>` flips a metric
        // to Fail.
        let at_bound = ReproducibilityStudyResult {
            per_step_drift_max: AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR,
            random_walk_epsilon: AC_GPUTRAIN_006_RANDOM_WALK_EPSILON,
            cosine_sim_worst: AC_GPUTRAIN_006_COSINE_SIM_FLOOR,
            final_loss_range: AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR,
        };
        assert_eq!(
            verdict_from_reproducibility_study(&at_bound),
            Gputrain006Verdict::Pass,
            "every metric exactly at bound must Pass per inclusive ceiling",
        );

        // Section 2: empirical-pass case — observed v1 numbers from the
        // study evidence file. Each metric must be within its bound.
        //
        // NOTE(review): both the cosine literal below and the cosine
        // floor constant round to exactly 1.0 in f32 (spacing below 1.0
        // is 2^-24 ≈ 5.96e-8), so this case exercises `1.0 >= 1.0`
        // rather than a value strictly above the floor.
        let v1_observed = ReproducibilityStudyResult {
            per_step_drift_max: 9.2e-4,            // ≤ 1.0e-3
            random_walk_epsilon: 2.74e-4,          // ≤ 3.0e-4
            cosine_sim_worst: 0.999_999_999_7_f32, // ≥ 0.999_999_99
            final_loss_range: 1.341e-3,            // ≤ 2.0e-3
        };
        assert_eq!(
            verdict_from_reproducibility_study(&v1_observed),
            Gputrain006Verdict::Pass,
            "v1 empirical study must Pass — these are the proof points",
        );

        // Section 3: each bound, broken individually. Any mutation that
        // accidentally flips one comparison direction, or weakens one
        // bound, must fail to Pass at least one of these four cases.

        // 3a. Per-step drift overshoot.
        let drift_high = ReproducibilityStudyResult {
            per_step_drift_max: AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR + 1e-6,
            ..v1_observed
        };
        assert_eq!(
            verdict_from_reproducibility_study(&drift_high),
            Gputrain006Verdict::Fail,
            "per_step_drift_max above floor must Fail",
        );

        // 3b. Random-walk ε overshoot.
        let eps_high = ReproducibilityStudyResult {
            random_walk_epsilon: AC_GPUTRAIN_006_RANDOM_WALK_EPSILON + 1e-6,
            ..v1_observed
        };
        assert_eq!(
            verdict_from_reproducibility_study(&eps_high),
            Gputrain006Verdict::Fail,
            "random_walk_epsilon above ceiling must Fail",
        );

        // 3c. Cosine similarity below floor. Subtract 1e-6 (well above
        // FP32 ULP at magnitude ~1.0, which is ~1.19e-7) so the
        // arithmetic actually moves the value below the floor.
        let cos_low = ReproducibilityStudyResult {
            cosine_sim_worst: AC_GPUTRAIN_006_COSINE_SIM_FLOOR - 1e-6,
            ..v1_observed
        };
        assert!(
            cos_low.cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR,
            "test sanity: cos_low should actually be below floor in FP32"
        );
        assert_eq!(
            verdict_from_reproducibility_study(&cos_low),
            Gputrain006Verdict::Fail,
            "cosine_sim_worst below floor must Fail",
        );

        // 3d. Final loss range overshoot.
        let range_high = ReproducibilityStudyResult {
            final_loss_range: AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR + 1e-6,
            ..v1_observed
        };
        assert_eq!(
            verdict_from_reproducibility_study(&range_high),
            Gputrain006Verdict::Fail,
            "final_loss_range above floor must Fail",
        );

        // Section 4: non-finite metrics — every field independently.
        // A NaN or ±∞ in any of the four fields must short-circuit to
        // Fail before the bound checks run, catching the harness bug
        // where a metric was computed from a degenerate input.
        // Each entry pairs a field name with a setter for that field,
        // so no catch-all dispatch arm is needed.
        type Setter = fn(&mut ReproducibilityStudyResult, f32);
        let field_setters: [(&str, Setter); 4] = [
            ("per_step_drift_max", |s: &mut ReproducibilityStudyResult, v: f32| {
                s.per_step_drift_max = v;
            }),
            ("random_walk_epsilon", |s: &mut ReproducibilityStudyResult, v: f32| {
                s.random_walk_epsilon = v;
            }),
            ("cosine_sim_worst", |s: &mut ReproducibilityStudyResult, v: f32| {
                s.cosine_sim_worst = v;
            }),
            ("final_loss_range", |s: &mut ReproducibilityStudyResult, v: f32| {
                s.final_loss_range = v;
            }),
        ];
        for (field_name, set_field) in field_setters {
            for non_finite in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY] {
                let mut s = v1_observed;
                set_field(&mut s, non_finite);
                assert_eq!(
                    verdict_from_reproducibility_study(&s),
                    Gputrain006Verdict::Fail,
                    "non-finite ({non_finite}) in {field_name} must Fail",
                );
            }
        }

        // Section 5: negative ranges (caller bug — forgot abs()).
        let neg = ReproducibilityStudyResult { per_step_drift_max: -1e-4, ..v1_observed };
        assert_eq!(
            verdict_from_reproducibility_study(&neg),
            Gputrain006Verdict::Fail,
            "negative per_step_drift_max must Fail (raw a-b leaked, not |a-b|)",
        );

        // Section 6: cosine similarity range guard. Reproducible traces
        // give ~1.0; any value outside [0, 1.0001] is a caller bug that
        // must Fail.
        for bad_cos in [-0.5_f32, -1.0_f32, 1.5_f32, 100.0_f32] {
            let s = ReproducibilityStudyResult { cosine_sim_worst: bad_cos, ..v1_observed };
            assert_eq!(
                verdict_from_reproducibility_study(&s),
                Gputrain006Verdict::Fail,
                "cosine_sim_worst out-of-range ({bad_cos}) must Fail",
            );
        }

        // Section 7: cosine similarity at exactly 1.0 (identical traces)
        // must Pass. ULP overshoot above 1.0 (FP32 inner product on
        // identical vectors) must also Pass — the verdict allows up to
        // 1.0001 for that exact reason.
        let identical = ReproducibilityStudyResult {
            per_step_drift_max: 0.0,
            random_walk_epsilon: 0.0,
            cosine_sim_worst: 1.0,
            final_loss_range: 0.0,
        };
        assert_eq!(
            verdict_from_reproducibility_study(&identical),
            Gputrain006Verdict::Pass,
            "perfect identity (cos=1.0, all drift=0) must Pass",
        );
        let identity_ulp =
            ReproducibilityStudyResult { cosine_sim_worst: 1.000_000_1, ..identical };
        assert_eq!(
            verdict_from_reproducibility_study(&identity_ulp),
            Gputrain006Verdict::Pass,
            "FP32 cos_sim ULP overshoot above 1.0 (identity reduction) must Pass",
        );

        // Section 8: provenance pin — the 4 constants are load-bearing
        // and lockstep with the YAML contract rule and the empirical
        // evidence file.  Any future ratchet (tighten after better
        // determinism lands) or relaxation (a hardware regression) must
        // move ALL of: the constant, the YAML rule, and the v2 evidence
        // file together. Triple-pinned to prevent silent drift.
        assert!(
            (AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR - 1.0e-3).abs() < 1e-9,
            "AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR is 1.0e-3 \
             (provenance: evidence/task-132/gputrain-006-empirical-v1.json)",
        );
        assert!(
            (AC_GPUTRAIN_006_RANDOM_WALK_EPSILON - 3.0e-4).abs() < 1e-9,
            "AC_GPUTRAIN_006_RANDOM_WALK_EPSILON is 3.0e-4",
        );
        assert!(
            (AC_GPUTRAIN_006_COSINE_SIM_FLOOR - 0.999_999_99_f32).abs() < 1e-12,
            "AC_GPUTRAIN_006_COSINE_SIM_FLOOR is 0.999_999_99",
        );
        assert!(
            (AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR - 2.0e-3).abs() < 1e-9,
            "AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR is 2.0e-3",
        );
    }
}