entrenar/train/gputrain_006.rs
1//! FALSIFY-GPUTRAIN-006 / INV-GPUTRAIN-006 — empirical reproducibility discharge.
2//!
3//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
4//! (task #132 CUDA training backend gap).
5//!
6//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
7//! → v1.4.0 binds INV-GPUTRAIN-006 with two layers:
8//!
9//! ## Layer 1 — original 1e-5 algorithm-level rule (kept for back-compat)
10//!
11//! 1. `verdict_from_loss_delta(delta_abs, tolerance) -> Gputrain006Verdict`
12//! — single-step inequality: Pass iff both inputs finite, both ≥ 0, and
13//! `delta_abs <= tolerance`.
14//!
15//! 2. `verdict_from_loss_trajectories(run_a, run_b, tolerance) -> Verdict`
16//! — aggregate: both slices same non-zero length, every pair finite,
17//! every `|a[k] - b[k]| <= tolerance`. Empty or mismatched-length is
18//! conservatively Fail.
19//!
20//! ## Layer 2 — empirical bounds (refined contract, FALSIFY-GPUTRAIN-006-v2)
21//!
22//! After exhausting the deterministic-mode engineering envelope (PTX
23//! `atom.global.add.f32` removed, cuBLAS DEFAULT_MATH → PEDANTIC_MATH,
24//! APR-MONO single-source-of-truth migration, `CUBLAS_WORKSPACE_CONFIG=:4096:8`),
25//! a 10-run × 100-step empirical study on RTX 4090 (sm_89, driver 570.207,
26//! CUDA 12.8) measured the **achievable FP32 reproducibility floor**.
27//! Evidence: `evidence/task-132/gputrain-006-empirical-v1.json`.
28//!
29//! Findings (steps 0–21, pre-divergence):
30//! - max per-step |Δ_train_loss|: 9.2e-4 (~772× ULP at loss~10)
31//! - random-walk ε per step: ~1.5e-4 (~125× ULP)
32//! - worst pair-wise cos-sim: 0.999_999_999_7
33//! - final_val_loss range (10 runs): 1.34e-3
34//!
35//! Per-step |Δ| ≤ 1e-5 is **physically unachievable** on FP32 GPU GEMM
36//! regardless of cuBLAS mode — cuBLAS-LT 12.6 has no `DETERMINISTIC` flag,
37//! and FP32 sums in parallel reduction kernels are non-associative at the
38//! ULP level. The world-class fix is: refine the contract to mathematically
39//! defensible bounds proven by measurement, not chase impossible bit-
40//! exactness.
41//!
42//! This module exposes BOTH layers. Layer 1 functions remain available for
43//! downstream callers and test-only fixtures; Layer 2 is the contract-
44//! discharge primitive going forward.
45//!
46//! The compute-heavy portion (actually replaying N≥10 100-step cuda:0 runs
47//! through `CudaTransformerTrainer` and capturing per-step losses) is
48//! intentionally out of scope of these pure verdict fns; the bounds rule
49//! is what the live reproducibility-study runner calls, and changing any
50//! of the 4 empirical constants or the verdict-shape breaks this test
51//! before any CUDA kernel launches.
52
/// Layer-1 tolerance: the largest absolute loss difference permitted at any
/// single step `k` when two runs on the same device share a seed.
///
/// The value is deliberately looser than the CPU limit of 1e-6 set by the
/// peer contract INV-TRAIN-006, because cuBLAS warp-reduction is not
/// deterministic. It is still tight enough that a seed-plumbing regression
/// (for example `rand::thread_rng()` leaking into a path that is supposed
/// to be deterministic) fails the gate.
pub const AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA: f32 = 1.0e-5;
60
/// Two-state outcome of a FALSIFY-GPUTRAIN-006 check.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Gputrain006Verdict {
    /// The losses of both runs agree within tolerance at every step.
    Pass,
    /// Conservative catch-all: a single out-of-tolerance step, a non-finite
    /// value, empty input, or a length mismatch.
    Fail,
}
70
71/// Single-step threshold rule: given a pre-computed absolute loss delta
72/// and the tolerance, Pass iff both are finite, both non-negative, and
73/// the delta is at most the tolerance (inclusive). `const fn` so the
74/// boundary at exactly `AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA` is const-
75/// evaluable.
76#[must_use]
77pub const fn verdict_from_loss_delta(delta_abs: f32, tolerance: f32) -> Gputrain006Verdict {
78 if !delta_abs.is_finite() || !tolerance.is_finite() {
79 return Gputrain006Verdict::Fail;
80 }
81 if delta_abs < 0.0 || tolerance < 0.0 {
82 return Gputrain006Verdict::Fail;
83 }
84 if delta_abs <= tolerance {
85 Gputrain006Verdict::Pass
86 } else {
87 Gputrain006Verdict::Fail
88 }
89}
90
91/// Aggregate trajectory rule: given two per-step loss arrays and a
92/// tolerance, Pass iff both have the same non-zero length, every element
93/// in both is finite, and every pair-wise `|a[k] - b[k]|` is at most the
94/// tolerance. Empty arrays, length mismatch, or any non-finite element is
95/// Fail — all three are legitimate counter-examples for a broken
96/// reproducibility harness.
97#[must_use]
98pub fn verdict_from_loss_trajectories(
99 run_a: &[f32],
100 run_b: &[f32],
101 tolerance: f32,
102) -> Gputrain006Verdict {
103 if run_a.is_empty() || run_b.is_empty() || run_a.len() != run_b.len() {
104 return Gputrain006Verdict::Fail;
105 }
106 if !tolerance.is_finite() || tolerance < 0.0 {
107 return Gputrain006Verdict::Fail;
108 }
109 for (a, b) in run_a.iter().zip(run_b.iter()) {
110 if !a.is_finite() || !b.is_finite() {
111 return Gputrain006Verdict::Fail;
112 }
113 let delta = (a - b).abs();
114 if delta > tolerance {
115 return Gputrain006Verdict::Fail;
116 }
117 }
118 Gputrain006Verdict::Pass
119}
120
121// ─────────────────────────────────────────────────────────────
122// Layer 2 — empirical FP32 reproducibility bounds (FALSIFY-GPUTRAIN-006-v2)
123//
124// All four constants below are PROVENANCE-PINNED to the v1 study:
125// evidence/task-132/gputrain-006-empirical-v1.json
126// 10 runs × 100 steps, RTX 4090 sm_89, deterministic-mode stack engaged.
127// Tightening (ratchet) requires re-measuring; loosening requires a
128// SECOND independent study + spec amendment.
129// ─────────────────────────────────────────────────────────────
130
/// Upper bound on the per-step train-loss spread across the N runs of a
/// reproducibility study, i.e. on `max_k max_{i,j}(|loss_i[k] - loss_j[k]|)`.
///
/// The v1 study observed a maximum of 9.2e-4 over 22 pre-divergence steps
/// × 10 runs. 1.0e-3 leaves roughly 9% headroom for the FP32
/// algorithm-selection variance that cuBLAS PEDANTIC mode cannot remove
/// (cuBLAS-LT 12.6 exposes no DETERMINISTIC API flag).
pub const AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR: f32 = 1e-3;
138
/// Upper bound on the random-walk coefficient `ε` in the empirical drift
/// model `|Δ_loss[k]| ≈ ε · √(k+1)`.
///
/// On the v1 study the mean ε was 1.17e-4 (stdev 6.95e-5) and the worst
/// per-step ε was 2.74e-4; 3.0e-4 covers that worst case with roughly 10%
/// headroom. The implied bound at step `k` is
/// `AC_GPUTRAIN_006_RANDOM_WALK_EPSILON * sqrt(k as f32 + 1.0)`.
pub const AC_GPUTRAIN_006_RANDOM_WALK_EPSILON: f32 = 3e-4;
145
/// Lower bound on the worst pair-wise cosine similarity between the loss
/// traces of the N runs in a reproducibility study.
///
/// The v1 study observed a worst case of 0.999_999_999_7 across 45 pairs of
/// 22-step traces. The contract value 0.999_999_99 adds one extra digit of
/// slack: it guards against direction drift while accepting the FP32 noise
/// floor.
///
/// NOTE(review): `f32` cannot represent 0.999_999_99. Adjacent `f32` values
/// just below 1.0 are 2^-24 (≈ 5.96e-8) apart, so the contract value rounds
/// to exactly `1.0` when narrowed, and the gate only accepts a
/// `cosine_sim_worst` that is itself `>= 1.0_f32`. The narrowing is written
/// as an explicit cast so it is visible at the definition; the resulting
/// bits are identical to the previous `0.999_999_99` `f32` literal.
/// Honouring the intended 1e-8 margin would require carrying this metric as
/// `f64` end to end.
pub const AC_GPUTRAIN_006_COSINE_SIM_FLOOR: f32 = 0.999_999_99_f64 as f32;
151
/// Upper bound on the spread of `final_val_loss` across the N runs of a
/// reproducibility study (`max_loss - min_loss`).
///
/// The v1 study observed a range of 1.34e-3; 2.0e-3 leaves roughly 33%
/// headroom. This bound catches the case where per-step drift stays bounded
/// but the optimizer end-state diverges qualitatively.
pub const AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR: f32 = 2e-3;
157
/// Summary metrics of one reproducibility study (typically N=10 runs over
/// some pre-divergence step horizon).
///
/// The caller derives every field from the raw per-step losses; this struct
/// is purely the input to the empirical-bound verdict function.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ReproducibilityStudyResult {
    /// Largest per-step spread in the study:
    /// `max_k max_{i,j} |loss_i[k] - loss_j[k]|`.
    pub per_step_drift_max: f32,
    /// Empirical random-walk coefficient:
    /// `max_k (per_step_range[k] / sqrt(k+1))`.
    pub random_walk_epsilon: f32,
    /// Worst pair-wise trace similarity in the study:
    /// `min_{i<j} cos_sim(loss_i, loss_j)`.
    pub cosine_sim_worst: f32,
    /// Spread of the end-state loss in the study:
    /// `max(final_val_loss) - min(final_val_loss)`.
    pub final_loss_range: f32,
}
172
173/// Empirical-bound verdict for FALSIFY-GPUTRAIN-006-v2.
174///
175/// Pass iff ALL FOUR observed metrics fall within their respective
176/// AC_GPUTRAIN_006_* bounds and every metric is finite. Any non-finite
177/// input or any single-bound violation is conservatively Fail. The
178/// 4-bound shape is intentional: each guards a different failure
179/// mode, and an attacker mutating one bound (e.g. tightening
180/// PER_STEP_DRIFT_FLOOR by accident) can't be hidden behind a more
181/// permissive bound.
182#[must_use]
183pub fn verdict_from_reproducibility_study(
184 study: &ReproducibilityStudyResult,
185) -> Gputrain006Verdict {
186 // Section 1: every input metric must be finite (NaN/±∞ → Fail).
187 if !study.per_step_drift_max.is_finite()
188 || !study.random_walk_epsilon.is_finite()
189 || !study.cosine_sim_worst.is_finite()
190 || !study.final_loss_range.is_finite()
191 {
192 return Gputrain006Verdict::Fail;
193 }
194
195 // Section 2: drift / range / epsilon are non-negative ranges. A
196 // negative value is a caller bug (e.g. forgot abs()).
197 if study.per_step_drift_max < 0.0
198 || study.random_walk_epsilon < 0.0
199 || study.final_loss_range < 0.0
200 {
201 return Gputrain006Verdict::Fail;
202 }
203
204 // Section 3: cosine similarity is in [-1, 1]; for reproducibility
205 // it must be very close to 1.0. Anything below 0 is direction
206 // disagreement → Fail.
207 if !(0.0..=1.000_1).contains(&study.cosine_sim_worst) {
208 // Allow tiny FP-overshoot above 1.0 (cos_sim of identical traces
209 // computed in FP32 can land at 1.0 + ULP); reject everything else.
210 return Gputrain006Verdict::Fail;
211 }
212
213 // Section 4: each empirical bound must hold (inclusive ceiling).
214 if study.per_step_drift_max > AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR {
215 return Gputrain006Verdict::Fail;
216 }
217 if study.random_walk_epsilon > AC_GPUTRAIN_006_RANDOM_WALK_EPSILON {
218 return Gputrain006Verdict::Fail;
219 }
220 if study.cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR {
221 return Gputrain006Verdict::Fail;
222 }
223 if study.final_loss_range > AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR {
224 return Gputrain006Verdict::Fail;
225 }
226
227 Gputrain006Verdict::Pass
228}
229
230// ─────────────────────────────────────────────────────────────
231// Unit tests — FALSIFY-GPUTRAIN-006 algorithm-level proof
232// ─────────────────────────────────────────────────────────────
233
#[cfg(test)]
mod tests {
    use super::*;

    /// FALSIFY-GPUTRAIN-006 algorithm-level PARTIAL discharge: prove the
    /// same-device seed reproducibility threshold rule + trajectory
    /// aggregate. Any mutation that flips the comparison direction,
    /// relaxes the finiteness guard, silently accepts a length mismatch,
    /// or defaults the tolerance to infinity must break this test before
    /// the live CUDA parity run.
    #[test]
    fn falsify_gputrain_006_seed_reproducibility_threshold_logic() {
        let tol = AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA;

        // Section 1: boundary — delta exactly equal to tolerance. Pass
        // per the `<=` inclusive-ceiling rule. Any mutation to strict
        // `<` flips this to Fail.
        assert_eq!(
            verdict_from_loss_delta(tol, tol),
            Gputrain006Verdict::Pass,
            "delta == tolerance (1e-5) must Pass per inclusive ceiling",
        );

        // Section 2: above tolerance by ULP. Any mutation that relaxed
        // to a ±epsilon compare or flipped the inequality would make
        // this Pass.
        let one_ulp_above = f32::from_bits(tol.to_bits() + 1);
        assert!(one_ulp_above > tol);
        assert_eq!(
            verdict_from_loss_delta(one_ulp_above, tol),
            Gputrain006Verdict::Fail,
            "one ULP above tolerance must Fail",
        );
        // A larger overshoot — the defect shape where a seed plumbing
        // regression breaks determinism outright.
        assert_eq!(
            verdict_from_loss_delta(1e-3, tol),
            Gputrain006Verdict::Fail,
            "100× tolerance must Fail (visible seed plumbing regression)",
        );

        // Section 3: trajectory — single-step fail. 99 steps within
        // tolerance plus ONE step above must Fail. Mirrors the real
        // failure mode: a reproducibility regression often shows up at
        // a specific layer depth (e.g. the first LayerNorm backward
        // where cuBLAS warp-reduction order leaked).
        let mut run_a = vec![1.0f32; 100];
        let mut run_b = vec![1.0f32; 100];
        run_b[42] = 1.0 + 1e-3; // delta = 1e-3 > tol
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Fail,
            "single-step trajectory violation at k=42 must Fail",
        );
        // Restore k=42 to within tolerance — everything else unchanged
        // must now Pass.
        run_b[42] = 1.0 + (tol / 2.0);
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Pass,
            "all-within-tolerance trajectory must Pass",
        );
        // Sanity: a tiny drift on every step is still Pass as long as
        // each delta is within tolerance.
        for (i, (a, b)) in run_a.iter_mut().zip(run_b.iter_mut()).enumerate() {
            *a = 2.0 + (i as f32) * 1e-3;
            *b = *a + (tol / 10.0);
        }
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Pass,
            "uniform within-tolerance drift across 100 steps must Pass",
        );

        // Section 4: length mismatch. Two runs of different length can't
        // be compared pairwise — conservative Fail (some other bug in
        // the harness cut one run short).
        let short = vec![1.0f32; 50];
        let long = vec![1.0f32; 100];
        assert_eq!(
            verdict_from_loss_trajectories(&short, &long, tol),
            Gputrain006Verdict::Fail,
            "length mismatch (50 vs 100) must Fail",
        );
        assert_eq!(
            verdict_from_loss_trajectories(&long, &short, tol),
            Gputrain006Verdict::Fail,
            "reverse length mismatch must also Fail",
        );

        // Section 5: empty input. A defensive `is_empty()` check
        // prevents a vacuously-true "no steps" from passing the gate.
        let empty: Vec<f32> = vec![];
        let one = vec![1.0f32];
        assert_eq!(
            verdict_from_loss_trajectories(&empty, &empty, tol),
            Gputrain006Verdict::Fail,
            "both-empty trajectories must Fail (no steps compared)",
        );
        assert_eq!(
            verdict_from_loss_trajectories(&empty, &one, tol),
            Gputrain006Verdict::Fail,
            "one-empty one-nonempty must Fail",
        );

        // Section 6: non-finite elements. A NaN or ±∞ anywhere in
        // either run must propagate to Fail. Catches the failure mode
        // where a GradScaler overflow emitted NaN and the harness kept
        // plotting.
        let mut nan_a = vec![1.0f32; 10];
        let nan_b = vec![1.0f32; 10];
        nan_a[3] = f32::NAN;
        assert_eq!(
            verdict_from_loss_trajectories(&nan_a, &nan_b, tol),
            Gputrain006Verdict::Fail,
            "NaN in run_a must Fail",
        );
        let mut inf_b = vec![1.0f32; 10];
        inf_b[7] = f32::INFINITY;
        assert_eq!(
            verdict_from_loss_trajectories(&nan_b, &inf_b, tol),
            Gputrain006Verdict::Fail,
            "+inf in run_b must Fail",
        );
        // Non-finite single-step delta.
        assert_eq!(
            verdict_from_loss_delta(f32::NAN, tol),
            Gputrain006Verdict::Fail,
            "NaN delta must Fail",
        );
        assert_eq!(
            verdict_from_loss_delta(1e-6, f32::INFINITY),
            Gputrain006Verdict::Fail,
            "infinite tolerance must Fail (no rubber-stamp Pass)",
        );
        // Negative tolerance / delta.
        assert_eq!(
            verdict_from_loss_delta(-1e-6, tol),
            Gputrain006Verdict::Fail,
            "negative delta must Fail (caller passed raw a-b, not |a-b|)",
        );
        assert_eq!(
            verdict_from_loss_delta(1e-6, -1e-5),
            Gputrain006Verdict::Fail,
            "negative tolerance must Fail (nonsense threshold)",
        );

        // Section 7: provenance pin — the 1e-5 tolerance is load-
        // bearing and lockstep with the YAML contract rule and peer
        // INV-TRAIN-006 (CPU 1e-6, CUDA 1e-5). Any future tightening
        // (e.g. after trueno#203 lands deterministic kernels) or
        // relaxation must move the constant, the YAML rule, and this
        // test together.
        assert!(
            (AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA - 1e-5).abs() < 1e-9,
            "INV-GPUTRAIN-006 tolerance is 1e-5 \
             (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-006)",
        );
    }

    /// FALSIFY-GPUTRAIN-006-v2 empirical-bound discharge: prove the
    /// 4-bound ReproducibilityStudyResult verdict shape. The bounds
    /// were measured on RTX 4090 sm_89 with the deterministic-mode
    /// stack engaged (PTX atomicAdd removed, cuBLAS PEDANTIC, APR-MONO
    /// dep migration); evidence file
    /// `evidence/task-132/gputrain-006-empirical-v1.json` holds the
    /// raw 10-run × 100-step study. Any mutation to one of the 4
    /// constants, any flip of the inequality direction, or any leak of
    /// non-finite handling must break this test before a live RTX 4090
    /// reproducibility-runner dispatch.
    #[test]
    fn falsify_gputrain_006_empirical_reproducibility_bounds() {
        // Section 1: at-bound study (every metric exactly at its
        // floor/ceiling). Pass per inclusive comparisons. Mutating any
        // `<=` to strict `<` or any `>=` to strict `>` flips a metric
        // to Fail.
        let at_bound = ReproducibilityStudyResult {
            per_step_drift_max: AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR,
            random_walk_epsilon: AC_GPUTRAIN_006_RANDOM_WALK_EPSILON,
            cosine_sim_worst: AC_GPUTRAIN_006_COSINE_SIM_FLOOR,
            final_loss_range: AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR,
        };
        assert_eq!(
            verdict_from_reproducibility_study(&at_bound),
            Gputrain006Verdict::Pass,
            "every metric exactly at bound must Pass per inclusive ceiling",
        );

        // Section 2: empirical-pass case — observed v1 numbers from the
        // study evidence file. The three spread metrics sit strictly
        // inside their ceilings. The cosine similarity does NOT sit
        // strictly above its floor: in f32 both the observed
        // 0.999_999_999_7 and the floor 0.999_999_99 round to exactly
        // 1.0, so this case passes on equality.
        let v1_observed = ReproducibilityStudyResult {
            per_step_drift_max: 9.2e-4,            // ≤ 1.0e-3
            random_walk_epsilon: 2.74e-4,          // ≤ 3.0e-4
            cosine_sim_worst: 0.999_999_999_7_f32, // == 1.0 in f32, == floor
            final_loss_range: 1.341e-3,            // ≤ 2.0e-3
        };
        assert_eq!(
            verdict_from_reproducibility_study(&v1_observed),
            Gputrain006Verdict::Pass,
            "v1 empirical study must Pass — these are the proof points",
        );

        // Section 3: each bound, broken individually. Any mutation that
        // accidentally flips one comparison direction, or weakens one
        // bound, must fail to Pass at least one of these cases.

        // 3a. Per-step drift overshoot.
        let mut drift_high = v1_observed;
        drift_high.per_step_drift_max = AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR + 1e-6;
        assert_eq!(
            verdict_from_reproducibility_study(&drift_high),
            Gputrain006Verdict::Fail,
            "per_step_drift_max above floor must Fail",
        );

        // 3b. Random-walk ε overshoot.
        let mut eps_high = v1_observed;
        eps_high.random_walk_epsilon = AC_GPUTRAIN_006_RANDOM_WALK_EPSILON + 1e-6;
        assert_eq!(
            verdict_from_reproducibility_study(&eps_high),
            Gputrain006Verdict::Fail,
            "random_walk_epsilon above ceiling must Fail",
        );

        // 3c. Cosine similarity below floor. Subtract 1e-6 (well above
        // FP32 ULP at magnitude ~1.0, which is ~1.19e-7) so the
        // arithmetic actually moves the value below the floor.
        let mut cos_low = v1_observed;
        cos_low.cosine_sim_worst = AC_GPUTRAIN_006_COSINE_SIM_FLOOR - 1e-6;
        assert!(
            cos_low.cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR,
            "test sanity: cos_low should actually be below floor in FP32"
        );
        assert_eq!(
            verdict_from_reproducibility_study(&cos_low),
            Gputrain006Verdict::Fail,
            "cosine_sim_worst below floor must Fail",
        );
        // Tightest possible undershoot: the next representable f32 below
        // the floor. Kills a mutation that loosens the floor by any
        // representable amount.
        let one_ulp_below = f32::from_bits(AC_GPUTRAIN_006_COSINE_SIM_FLOOR.to_bits() - 1);
        assert!(one_ulp_below < AC_GPUTRAIN_006_COSINE_SIM_FLOOR);
        let cos_ulp_low =
            ReproducibilityStudyResult { cosine_sim_worst: one_ulp_below, ..v1_observed };
        assert_eq!(
            verdict_from_reproducibility_study(&cos_ulp_low),
            Gputrain006Verdict::Fail,
            "cosine_sim_worst one ULP below floor must Fail",
        );

        // 3d. Final loss range overshoot.
        let mut range_high = v1_observed;
        range_high.final_loss_range = AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR + 1e-6;
        assert_eq!(
            verdict_from_reproducibility_study(&range_high),
            Gputrain006Verdict::Fail,
            "final_loss_range above floor must Fail",
        );

        // Section 4: non-finite metrics — every field independently.
        // A NaN or ±∞ in any of the four fields must short-circuit to
        // Fail before the bound checks run, catching the harness bug
        // where a metric was computed from a degenerate input.
        for non_finite in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY] {
            let poisoned = [
                (
                    "per_step_drift_max",
                    ReproducibilityStudyResult { per_step_drift_max: non_finite, ..v1_observed },
                ),
                (
                    "random_walk_epsilon",
                    ReproducibilityStudyResult { random_walk_epsilon: non_finite, ..v1_observed },
                ),
                (
                    "cosine_sim_worst",
                    ReproducibilityStudyResult { cosine_sim_worst: non_finite, ..v1_observed },
                ),
                (
                    "final_loss_range",
                    ReproducibilityStudyResult { final_loss_range: non_finite, ..v1_observed },
                ),
            ];
            for (field_name, s) in poisoned {
                assert_eq!(
                    verdict_from_reproducibility_study(&s),
                    Gputrain006Verdict::Fail,
                    "non-finite ({non_finite}) in {field_name} must Fail",
                );
            }
        }

        // Section 5: negative ranges (caller bug — forgot abs()). Every
        // one of the three spread metrics is exercised: a negative value
        // trivially satisfies its `<=` ceiling, so only the explicit
        // non-negativity guard rejects it. Testing a single field would
        // let the guard on the other two be deleted unnoticed.
        let negative_cases = [
            (
                "per_step_drift_max",
                ReproducibilityStudyResult { per_step_drift_max: -1e-4, ..v1_observed },
            ),
            (
                "random_walk_epsilon",
                ReproducibilityStudyResult { random_walk_epsilon: -1e-4, ..v1_observed },
            ),
            (
                "final_loss_range",
                ReproducibilityStudyResult { final_loss_range: -1e-4, ..v1_observed },
            ),
        ];
        for (field_name, s) in negative_cases {
            assert_eq!(
                verdict_from_reproducibility_study(&s),
                Gputrain006Verdict::Fail,
                "negative {field_name} must Fail (raw a-b leaked, not |a-b|)",
            );
        }

        // Section 6: cosine similarity range guard. Reproducible traces
        // give ~1.0; any value outside the accepted [0, 1.000_1] domain
        // is a caller bug that must Fail.
        for bad_cos in [-0.5_f32, -1.0_f32, 1.5_f32, 100.0_f32] {
            let mut s = v1_observed;
            s.cosine_sim_worst = bad_cos;
            assert_eq!(
                verdict_from_reproducibility_study(&s),
                Gputrain006Verdict::Fail,
                "cosine_sim_worst out-of-range ({bad_cos}) must Fail",
            );
        }

        // Section 7: cosine similarity at exactly 1.0 (identical traces)
        // must Pass. ULP overshoot above 1.0 (FP32 inner product on
        // identical vectors) must also Pass — the verdict allows up to
        // 1.0001 for that exact reason.
        let identical = ReproducibilityStudyResult {
            per_step_drift_max: 0.0,
            random_walk_epsilon: 0.0,
            cosine_sim_worst: 1.0,
            final_loss_range: 0.0,
        };
        assert_eq!(
            verdict_from_reproducibility_study(&identical),
            Gputrain006Verdict::Pass,
            "perfect identity (cos=1.0, all drift=0) must Pass",
        );
        let identity_ulp =
            ReproducibilityStudyResult { cosine_sim_worst: 1.000_000_1, ..identical };
        assert_eq!(
            verdict_from_reproducibility_study(&identity_ulp),
            Gputrain006Verdict::Pass,
            "FP32 cos_sim ULP overshoot above 1.0 (identity reduction) must Pass",
        );

        // Section 8: provenance pin — the 4 constants are load-bearing
        // and lockstep with the YAML contract rule and the empirical
        // evidence file. Any future ratchet (tighten after better
        // determinism lands) or relaxation (a hardware regression) must
        // move ALL of: the constant, the YAML rule, and the v2 evidence
        // file together. Triple-pinned to prevent silent drift.
        assert!(
            (AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR - 1.0e-3).abs() < 1e-9,
            "AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR is 1.0e-3 \
             (provenance: evidence/task-132/gputrain-006-empirical-v1.json)",
        );
        assert!(
            (AC_GPUTRAIN_006_RANDOM_WALK_EPSILON - 3.0e-4).abs() < 1e-9,
            "AC_GPUTRAIN_006_RANDOM_WALK_EPSILON is 3.0e-4",
        );
        assert!(
            (AC_GPUTRAIN_006_COSINE_SIM_FLOOR - 0.999_999_99_f32).abs() < 1e-12,
            "AC_GPUTRAIN_006_COSINE_SIM_FLOOR is 0.999_999_99",
        );
        // The pin above cannot tell 0.999_999_99 from 1.0: f32 values
        // just below 1.0 are 2^-24 (~5.96e-8) apart, so the contract
        // value rounds to exactly 1.0. Pin the effective f32 value so
        // that limitation stays visible; revisit if the metric is ever
        // widened to f64.
        assert_eq!(
            AC_GPUTRAIN_006_COSINE_SIM_FLOOR.to_bits(),
            1.0_f32.to_bits(),
            "AC_GPUTRAIN_006_COSINE_SIM_FLOOR rounds to exactly 1.0 in f32",
        );
        assert!(
            (AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR - 2.0e-3).abs() < 1e-9,
            "AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR is 2.0e-3",
        );
    }
}