aprender-train 0.41.0

Training & Optimization library with autograd, LoRA, quantization, and model merging
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
//! FALSIFY-GPUTRAIN-006 / INV-GPUTRAIN-006 — empirical reproducibility discharge.
//!
//! Spec: `docs/specifications/aprender-train/ship-two-models-spec.md` §14
//! (task #132 CUDA training backend gap).
//!
//! Contract: `contracts/entrenar/gpu-training-backend-v1.yaml` v1.0.0 → v1.1.0
//! → v1.4.0 binds INV-GPUTRAIN-006 with two layers:
//!
//! ## Layer 1 — original 1e-5 algorithm-level rule (kept for back-compat)
//!
//!   1. `verdict_from_loss_delta(delta_abs, tolerance) -> Gputrain006Verdict`
//!      — single-step inequality: Pass iff both inputs finite, both ≥ 0, and
//!      `delta_abs <= tolerance`.
//!
//!   2. `verdict_from_loss_trajectories(run_a, run_b, tolerance) -> Verdict`
//!      — aggregate: both slices same non-zero length, every pair finite,
//!      every `|a[k] - b[k]| <= tolerance`. Empty or mismatched-length is
//!      conservatively Fail.
//!
//! ## Layer 2 — empirical bounds (refined contract, FALSIFY-GPUTRAIN-006-v2)
//!
//! After exhausting the deterministic-mode engineering envelope (PTX
//! `atom.global.add.f32` removed, cuBLAS DEFAULT_MATH → PEDANTIC_MATH,
//! APR-MONO single-source-of-truth migration, `CUBLAS_WORKSPACE_CONFIG=:4096:8`),
//! a 10-run × 100-step empirical study on RTX 4090 (sm_89, driver 570.207,
//! CUDA 12.8) measured the **achievable FP32 reproducibility floor**.
//! Evidence: `evidence/task-132/gputrain-006-empirical-v1.json`.
//!
//! Findings (steps 0–21, pre-divergence):
//!   - max per-step |Δ_train_loss|:  9.2e-4 (~772× ULP at loss~10)
//!   - random-walk ε per step:        ~1.5e-4 (~125× ULP)
//!   - worst pair-wise cos-sim:       0.999_999_999_7
//!   - final_val_loss range (10 runs): 1.34e-3
//!
//! Per-step |Δ| ≤ 1e-5 is **physically unachievable** on FP32 GPU GEMM
//! regardless of cuBLAS mode — cuBLAS-LT 12.6 has no `DETERMINISTIC` flag,
//! and FP32 sums in parallel reduction kernels are non-associative at the
//! ULP level. The fix adopted here is to refine the contract to
//! mathematically defensible bounds backed by measurement, rather than
//! chase impossible bit-exactness.
//!
//! This module exposes BOTH layers. Layer 1 functions remain available for
//! downstream callers and test-only fixtures; Layer 2 is the contract-
//! discharge primitive going forward.
//!
//! The compute-heavy portion (actually replaying N≥10 100-step cuda:0 runs
//! through `CudaTransformerTrainer` and capturing per-step losses) is
//! intentionally out of scope of these pure verdict fns; the bounds rule
//! is what the live reproducibility-study runner calls, and changing any
//! of the 4 empirical constants or the verdict-shape breaks this test
//! before any CUDA kernel launches.

/// Layer 1 tolerance: the maximum tolerated absolute loss delta at any
/// step `k` between two same-device runs at the same seed.
///
/// Looser than CPU's 1e-6 per peer contract INV-TRAIN-006 to accommodate
/// cuBLAS warp-reduction non-determinism, but tight enough that a
/// seed-plumbing regression (e.g. `rand::thread_rng()` leaked into a
/// supposedly deterministic path) will fail the gate.
///
/// The verdict functions take the tolerance as an explicit argument; this
/// constant is the value the contract pins for that argument.
pub const AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA: f32 = 1e-5;

/// Binary verdict for FALSIFY-GPUTRAIN-006.
///
/// Every verdict function in this module returns this type. There is no
/// "inconclusive" state: any input the rules cannot vouch for is reported
/// as [`Gputrain006Verdict::Fail`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Gputrain006Verdict {
    /// Every checked quantity was finite, well-formed, and within its
    /// tolerance / bound.
    Pass,
    /// Any single violation, any non-finite value, empty input, or
    /// length mismatch — all conservatively Fail.
    Fail,
}

/// Layer 1 single-step gate over a caller-supplied absolute loss delta.
///
/// Returns [`Gputrain006Verdict::Pass`] only when all of the following
/// hold:
///
/// - `delta_abs` and `tolerance` are both finite (NaN and ±∞ are rejected);
/// - neither value is negative (a negative delta means the caller passed a
///   raw difference rather than its magnitude);
/// - `delta_abs <= tolerance` — a delta exactly equal to the tolerance is
///   accepted.
///
/// Every other combination is [`Gputrain006Verdict::Fail`]. The function is
/// a `const fn`, so the verdict at the exact
/// `AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA` boundary can be evaluated in a
/// const context.
#[must_use]
pub const fn verdict_from_loss_delta(delta_abs: f32, tolerance: f32) -> Gputrain006Verdict {
    let inputs_finite = delta_abs.is_finite() && tolerance.is_finite();
    let inputs_non_negative = delta_abs >= 0.0 && tolerance >= 0.0;

    if inputs_finite && inputs_non_negative && delta_abs <= tolerance {
        Gputrain006Verdict::Pass
    } else {
        Gputrain006Verdict::Fail
    }
}

/// Layer 1 aggregate gate over two per-step loss trajectories.
///
/// Returns [`Gputrain006Verdict::Pass`] only when:
///
/// - `run_a` and `run_b` have the same, non-zero length;
/// - `tolerance` is finite and non-negative;
/// - every element of both slices is finite; and
/// - every pair-wise `|run_a[k] - run_b[k]|` is at most `tolerance`
///   (a delta equal to the tolerance is accepted).
///
/// Empty slices, a length mismatch, an unusable tolerance, or any
/// non-finite element yield [`Gputrain006Verdict::Fail`] — each is a
/// legitimate counter-example for a broken reproducibility harness rather
/// than something to be silently skipped.
#[must_use]
pub fn verdict_from_loss_trajectories(
    run_a: &[f32],
    run_b: &[f32],
    tolerance: f32,
) -> Gputrain006Verdict {
    // Equal lengths plus a non-empty `run_a` imply `run_b` is non-empty too.
    let comparable = !run_a.is_empty() && run_a.len() == run_b.len();
    let tolerance_usable = tolerance.is_finite() && tolerance >= 0.0;
    if !(comparable && tolerance_usable) {
        return Gputrain006Verdict::Fail;
    }

    // `all` stops at the first offending step, like an early-return loop.
    let every_step_agrees = run_a.iter().zip(run_b).all(|(&lhs, &rhs)| {
        lhs.is_finite() && rhs.is_finite() && (lhs - rhs).abs() <= tolerance
    });

    if every_step_agrees {
        Gputrain006Verdict::Pass
    } else {
        Gputrain006Verdict::Fail
    }
}

// ─────────────────────────────────────────────────────────────
// Layer 2 — empirical FP32 reproducibility bounds (FALSIFY-GPUTRAIN-006-v2)
//
// All four constants below are PROVENANCE-PINNED to the v1 study:
//   evidence/task-132/gputrain-006-empirical-v1.json
// 10 runs × 100 steps, RTX 4090 sm_89, deterministic-mode stack engaged.
// Tightening (ratchet) requires re-measuring; loosening requires a
// SECOND independent study + spec amendment.
// ─────────────────────────────────────────────────────────────

/// Per-step `|Δ_train_loss|` upper bound across N reproducibility-study
/// runs (`max_k max_{i,j}(|loss_i[k] - loss_j[k]|)`).
///
/// Despite the `_FLOOR` suffix (it names the measured reproducibility
/// floor), the value is applied as a ceiling:
/// `verdict_from_reproducibility_study` fails a study whose
/// `per_step_drift_max` exceeds it.
///
/// Observed maximum on the v1 study was 9.2e-4 over 22 pre-divergence
/// steps × 10 runs; 1.0e-3 leaves ~9% headroom for the FP32
/// algorithm-selection variance that cuBLAS PEDANTIC mode cannot
/// eliminate (no DETERMINISTIC API flag exists in cuBLAS-LT 12.6).
pub const AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR: f32 = 1.0e-3;

/// Upper bound on the random-walk coefficient `ε`, where empirically
/// observed drift fits `|Δ_loss[k]| ≈ ε · √(k+1)`.
///
/// Mean ε on the v1 study was 1.17e-4 with stdev 6.95e-5; 3.0e-4 covers
/// the worst per-step ε (2.74e-4) with ~10% headroom. Bound at step k is
/// then `AC_GPUTRAIN_006_RANDOM_WALK_EPSILON * sqrt(k as f32 + 1.0)`.
///
/// NOTE(review): the module-level docs quote "~1.5e-4" for the per-step
/// random-walk ε while this comment quotes a 1.17e-4 mean — confirm
/// against the evidence file which statistic each figure refers to.
pub const AC_GPUTRAIN_006_RANDOM_WALK_EPSILON: f32 = 3.0e-4;

/// Lower bound on the worst-case pair-wise cosine similarity over N
/// reproducibility-study runs' loss traces.
///
/// Observed worst was 0.999_999_999_7 across 45 pairs of 22-step traces.
/// Floor at 0.999_999_99 (one extra digit of slack) guards against
/// direction drift while accepting the FP32-noise floor.
///
/// NOTE(review): the decimal value 0.999_999_99 is not representable in
/// `f32`. Representable values just below 1.0 are spaced 2^-24 (≈ 5.96e-8)
/// apart, so this literal rounds to exactly `1.0_f32` (clippy's
/// `excessive_precision` lint reports literals like this). As written, the
/// effective floor is therefore 1.0: any `f32` cosine similarity strictly
/// below 1.0 (the largest is ≈ 0.999_999_94) is rejected, and the intended
/// digit of slack does not exist at this precision. The observed
/// 0.999_999_999_7 likewise rounds to 1.0 when stored as `f32`. Honouring
/// the documented slack would require either an `f64` metric/bound or an
/// `f32`-representable floor (e.g. `1.0 - f32::EPSILON`), changed in
/// lockstep with the contract and the provenance-pin test.
pub const AC_GPUTRAIN_006_COSINE_SIM_FLOOR: f32 = 0.999_999_99;

/// Upper bound on the `final_val_loss` range across N
/// reproducibility-study runs (`max_loss - min_loss`).
///
/// Despite the `_FLOOR` suffix, the value is applied as a ceiling:
/// `verdict_from_reproducibility_study` fails a study whose
/// `final_loss_range` exceeds it.
///
/// Observed range on the v1 study was 1.34e-3; 2.0e-3 leaves ~33%
/// headroom (the observed range is ~67% of the bound). Catches the case
/// where per-step drift stays bounded but the optimizer end-state
/// diverges qualitatively.
pub const AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR: f32 = 2.0e-3;

/// Aggregate result of a reproducibility study (typically N=10 runs ×
/// some pre-divergence step horizon).
///
/// All fields are caller-computed from the raw per-step losses; this
/// struct is the sole input of `verdict_from_reproducibility_study`, which
/// performs no aggregation of its own. Every field is an `f32`, so the
/// bounds are compared at single precision.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ReproducibilityStudyResult {
    /// `max_k max_{i,j} |loss_i[k] - loss_j[k]|` across the study.
    /// Expected to be non-negative; a negative value fails the verdict.
    pub per_step_drift_max: f32,
    /// Empirical random-walk coefficient: `max_k (per_step_range[k] / sqrt(k+1))`.
    /// Expected to be non-negative; a negative value fails the verdict.
    pub random_walk_epsilon: f32,
    /// `min_{i<j} cos_sim(loss_i, loss_j)` across the study.
    /// The verdict accepts only values in `[0.0, 1.000_1]`.
    pub cosine_sim_worst: f32,
    /// `max(final_val_loss) - min(final_val_loss)` across the study.
    /// Expected to be non-negative; a negative value fails the verdict.
    pub final_loss_range: f32,
}

/// Empirical-bound verdict for FALSIFY-GPUTRAIN-006-v2.
///
/// Pass iff ALL FOUR observed metrics fall within their respective
/// AC_GPUTRAIN_006_* bounds and every metric is finite. Any non-finite
/// input or any single-bound violation is conservatively Fail. The
/// 4-bound shape is intentional: each guards a different failure
/// mode, and an attacker mutating one bound (e.g. tightening
/// PER_STEP_DRIFT_FLOOR by accident) can't be hidden behind a more
/// permissive bound.
#[must_use]
pub fn verdict_from_reproducibility_study(
    study: &ReproducibilityStudyResult,
) -> Gputrain006Verdict {
    // Section 1: every input metric must be finite (NaN/±∞ → Fail).
    if !study.per_step_drift_max.is_finite()
        || !study.random_walk_epsilon.is_finite()
        || !study.cosine_sim_worst.is_finite()
        || !study.final_loss_range.is_finite()
    {
        return Gputrain006Verdict::Fail;
    }

    // Section 2: drift / range / epsilon are non-negative ranges. A
    // negative value is a caller bug (e.g. forgot abs()).
    if study.per_step_drift_max < 0.0
        || study.random_walk_epsilon < 0.0
        || study.final_loss_range < 0.0
    {
        return Gputrain006Verdict::Fail;
    }

    // Section 3: cosine similarity is in [-1, 1]; for reproducibility
    // it must be very close to 1.0. Anything below 0 is direction
    // disagreement → Fail.
    if !(0.0..=1.000_1).contains(&study.cosine_sim_worst) {
        // Allow tiny FP-overshoot above 1.0 (cos_sim of identical traces
        // computed in FP32 can land at 1.0 + ULP); reject everything else.
        return Gputrain006Verdict::Fail;
    }

    // Section 4: each empirical bound must hold (inclusive ceiling).
    if study.per_step_drift_max > AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR {
        return Gputrain006Verdict::Fail;
    }
    if study.random_walk_epsilon > AC_GPUTRAIN_006_RANDOM_WALK_EPSILON {
        return Gputrain006Verdict::Fail;
    }
    if study.cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR {
        return Gputrain006Verdict::Fail;
    }
    if study.final_loss_range > AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR {
        return Gputrain006Verdict::Fail;
    }

    Gputrain006Verdict::Pass
}

// ─────────────────────────────────────────────────────────────
// Unit tests — FALSIFY-GPUTRAIN-006 algorithm-level proof
// ─────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// FALSIFY-GPUTRAIN-006 algorithm-level PARTIAL discharge: prove the
    /// same-device seed reproducibility threshold rule + trajectory
    /// aggregate. Any mutation that flips the comparison direction,
    /// relaxes the finiteness guard, silently accepts a length mismatch,
    /// or defaults the tolerance to infinity must break this test before
    /// the live CUDA parity run.
    #[test]
    fn falsify_gputrain_006_seed_reproducibility_threshold_logic() {
        let tol = AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA;

        // Section 1: boundary — delta exactly equal to tolerance. Pass
        // per the `<=` inclusive-ceiling rule. Any mutation to strict
        // `<` flips this to Fail.
        assert_eq!(
            verdict_from_loss_delta(tol, tol),
            Gputrain006Verdict::Pass,
            "delta == tolerance (1e-5) must Pass per inclusive ceiling",
        );

        // Section 2: above tolerance by ULP. Any mutation that relaxed
        // to a ±epsilon compare or flipped the inequality would make
        // this Pass.
        let one_ulp_above = f32::from_bits(tol.to_bits() + 1);
        assert!(one_ulp_above > tol);
        assert_eq!(
            verdict_from_loss_delta(one_ulp_above, tol),
            Gputrain006Verdict::Fail,
            "one ULP above tolerance must Fail",
        );
        // A larger overshoot — the defect shape where a seed plumbing
        // regression breaks determinism outright.
        assert_eq!(
            verdict_from_loss_delta(1e-3, tol),
            Gputrain006Verdict::Fail,
            "100× tolerance must Fail (visible seed plumbing regression)",
        );

        // Section 3: trajectory — single-step fail. 99 steps within
        // tolerance plus ONE step above must Fail. Mirrors the real
        // failure mode: a reproducibility regression often shows up at
        // a specific layer depth (e.g. the first LayerNorm backward
        // where cuBLAS warp-reduction order leaked).
        let run_a = vec![1.0f32; 100];
        let mut run_b = vec![1.0f32; 100];
        run_b[42] = 1.0 + 1e-3; // delta = 1e-3 > tol
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Fail,
            "single-step trajectory violation at k=42 must Fail",
        );
        // Restore k=42 to within tolerance — everything else unchanged
        // must now Pass.
        run_b[42] = 1.0 + (tol / 2.0);
        assert_eq!(
            verdict_from_loss_trajectories(&run_a, &run_b, tol),
            Gputrain006Verdict::Pass,
            "all-within-tolerance trajectory must Pass",
        );
        // Sanity: a tiny drift on every step is still Pass as long as
        // each delta is within tolerance. Built with iterators instead
        // of an index loop; the step index fits losslessly in `u16`.
        let drift_a: Vec<f32> = (0..100u16).map(|k| 2.0 + f32::from(k) * 1e-3).collect();
        let drift_b: Vec<f32> = drift_a.iter().map(|loss| loss + (tol / 10.0)).collect();
        assert_eq!(
            verdict_from_loss_trajectories(&drift_a, &drift_b, tol),
            Gputrain006Verdict::Pass,
            "uniform within-tolerance drift across 100 steps must Pass",
        );

        // Section 4: length mismatch. Two runs of different length can't
        // be compared pairwise — conservative Fail (some other bug in
        // the harness cut one run short).
        let short = vec![1.0f32; 50];
        let long = vec![1.0f32; 100];
        assert_eq!(
            verdict_from_loss_trajectories(&short, &long, tol),
            Gputrain006Verdict::Fail,
            "length mismatch (50 vs 100) must Fail",
        );
        assert_eq!(
            verdict_from_loss_trajectories(&long, &short, tol),
            Gputrain006Verdict::Fail,
            "reverse length mismatch must also Fail",
        );

        // Section 5: empty input. A defensive `is_empty()` check
        // prevents a vacuously-true "no steps" from passing the gate.
        let empty: Vec<f32> = Vec::new();
        let one = vec![1.0f32];
        assert_eq!(
            verdict_from_loss_trajectories(&empty, &empty, tol),
            Gputrain006Verdict::Fail,
            "both-empty trajectories must Fail (no steps compared)",
        );
        assert_eq!(
            verdict_from_loss_trajectories(&empty, &one, tol),
            Gputrain006Verdict::Fail,
            "one-empty one-nonempty must Fail",
        );

        // Section 6: non-finite elements. A NaN or ±∞ anywhere in
        // either run must propagate to Fail. Catches the failure mode
        // where a GradScaler overflow emitted NaN and the harness kept
        // plotting.
        let mut nan_a = vec![1.0f32; 10];
        let nan_b = vec![1.0f32; 10];
        nan_a[3] = f32::NAN;
        assert_eq!(
            verdict_from_loss_trajectories(&nan_a, &nan_b, tol),
            Gputrain006Verdict::Fail,
            "NaN in run_a must Fail",
        );
        let mut inf_b = vec![1.0f32; 10];
        inf_b[7] = f32::INFINITY;
        assert_eq!(
            verdict_from_loss_trajectories(&nan_b, &inf_b, tol),
            Gputrain006Verdict::Fail,
            "+inf in run_b must Fail",
        );
        // Non-finite single-step delta.
        assert_eq!(
            verdict_from_loss_delta(f32::NAN, tol),
            Gputrain006Verdict::Fail,
            "NaN delta must Fail",
        );
        assert_eq!(
            verdict_from_loss_delta(1e-6, f32::INFINITY),
            Gputrain006Verdict::Fail,
            "infinite tolerance must Fail (no rubber-stamp Pass)",
        );
        // Negative tolerance / delta.
        assert_eq!(
            verdict_from_loss_delta(-1e-6, tol),
            Gputrain006Verdict::Fail,
            "negative delta must Fail (caller passed raw a-b, not |a-b|)",
        );
        assert_eq!(
            verdict_from_loss_delta(1e-6, -1e-5),
            Gputrain006Verdict::Fail,
            "negative tolerance must Fail (nonsense threshold)",
        );
        // The trajectory rule has its own tolerance guard. Identical
        // runs have delta == 0 at every step, so without that guard a
        // NaN / infinite / negative tolerance could never produce a
        // per-step violation that the remaining checks would catch for
        // all of these values; the guard must reject them up front.
        for bad_tol in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY, -1e-5_f32] {
            assert_eq!(
                verdict_from_loss_trajectories(&nan_b, &nan_b, bad_tol),
                Gputrain006Verdict::Fail,
                "unusable trajectory tolerance ({bad_tol}) must Fail",
            );
        }

        // Section 7: provenance pin — the 1e-5 tolerance is load-
        // bearing and lockstep with the YAML contract rule and peer
        // INV-TRAIN-006 (CPU 1e-6, CUDA 1e-5). Any future tightening
        // (e.g. after trueno#203 lands deterministic kernels) or
        // relaxation must move the constant, the YAML rule, and this
        // test together.
        assert!(
            (AC_GPUTRAIN_006_MAX_SEED_LOSS_DELTA - 1e-5).abs() < 1e-9,
            "INV-GPUTRAIN-006 tolerance is 1e-5 \
             (spec §14.4 / gpu-training-backend-v1 INV-GPUTRAIN-006)",
        );
    }

    /// FALSIFY-GPUTRAIN-006-v2 empirical-bound discharge: prove the
    /// 4-bound ReproducibilityStudyResult verdict shape. The bounds
    /// were measured on RTX 4090 sm_89 with the deterministic-mode
    /// stack engaged (PTX atomicAdd removed, cuBLAS PEDANTIC, APR-MONO
    /// dep migration); evidence file
    /// `evidence/task-132/gputrain-006-empirical-v1.json` holds the
    /// raw 10-run × 100-step study. Any mutation to one of the 4
    /// constants, any flip of the inequality direction, or any leak of
    /// non-finite handling must break this test before a live RTX 4090
    /// reproducibility-runner dispatch.
    #[test]
    fn falsify_gputrain_006_empirical_reproducibility_bounds() {
        // Section 1: at-bound study (every metric exactly at its
        // floor/ceiling). Pass per inclusive comparisons. Mutating any
        // `<=` to strict `<` or any `>=` to strict `>` flips a metric
        // to Fail.
        let at_bound = ReproducibilityStudyResult {
            per_step_drift_max: AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR,
            random_walk_epsilon: AC_GPUTRAIN_006_RANDOM_WALK_EPSILON,
            cosine_sim_worst: AC_GPUTRAIN_006_COSINE_SIM_FLOOR,
            final_loss_range: AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR,
        };
        assert_eq!(
            verdict_from_reproducibility_study(&at_bound),
            Gputrain006Verdict::Pass,
            "every metric exactly at bound must Pass per inclusive ceiling",
        );

        // Section 2: empirical-pass case — observed v1 numbers from the
        // study evidence file. The three spread metrics sit strictly
        // inside their bounds.
        // NOTE(review): the cosine metric does not. Both
        // 0.999_999_999_7 and the 0.999_999_99 floor round to 1.0 when
        // stored as f32 (values just below 1.0 are spaced 2^-24 apart),
        // so this case exercises equality at the floor rather than
        // slack above it.
        let v1_observed = ReproducibilityStudyResult {
            per_step_drift_max: 9.2e-4,            // ≤ 1.0e-3
            random_walk_epsilon: 2.74e-4,          // ≤ 3.0e-4
            cosine_sim_worst: 0.999_999_999_7_f32, // ≥ 0.999_999_99
            final_loss_range: 1.341e-3,            // ≤ 2.0e-3
        };
        assert_eq!(
            verdict_from_reproducibility_study(&v1_observed),
            Gputrain006Verdict::Pass,
            "v1 empirical study must Pass — these are the proof points",
        );

        // Section 3: each bound, broken individually. Any mutation that
        // accidentally flips one comparison direction, or weakens one
        // bound, must fail to Pass at least one of these four cases.

        // 3a. Per-step drift overshoot.
        let drift_high = ReproducibilityStudyResult {
            per_step_drift_max: AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR + 1e-6,
            ..v1_observed
        };
        assert_eq!(
            verdict_from_reproducibility_study(&drift_high),
            Gputrain006Verdict::Fail,
            "per_step_drift_max above floor must Fail",
        );

        // 3b. Random-walk ε overshoot.
        let eps_high = ReproducibilityStudyResult {
            random_walk_epsilon: AC_GPUTRAIN_006_RANDOM_WALK_EPSILON + 1e-6,
            ..v1_observed
        };
        assert_eq!(
            verdict_from_reproducibility_study(&eps_high),
            Gputrain006Verdict::Fail,
            "random_walk_epsilon above ceiling must Fail",
        );

        // 3c. Cosine similarity below floor. Subtract 1e-6 (well above
        // FP32 ULP at magnitude ~1.0, which is ~1.19e-7) so the
        // arithmetic actually moves the value below the floor.
        let cos_low = ReproducibilityStudyResult {
            cosine_sim_worst: AC_GPUTRAIN_006_COSINE_SIM_FLOOR - 1e-6,
            ..v1_observed
        };
        assert!(
            cos_low.cosine_sim_worst < AC_GPUTRAIN_006_COSINE_SIM_FLOOR,
            "test sanity: cos_low should actually be below floor in FP32"
        );
        assert_eq!(
            verdict_from_reproducibility_study(&cos_low),
            Gputrain006Verdict::Fail,
            "cosine_sim_worst below floor must Fail",
        );

        // 3d. Final loss range overshoot.
        let range_high = ReproducibilityStudyResult {
            final_loss_range: AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR + 1e-6,
            ..v1_observed
        };
        assert_eq!(
            verdict_from_reproducibility_study(&range_high),
            Gputrain006Verdict::Fail,
            "final_loss_range above floor must Fail",
        );

        // Section 4: non-finite metrics — every field independently.
        // A NaN or ±∞ in any of the four fields must short-circuit to
        // Fail before the bound checks run, catching the harness bug
        // where a metric was computed from a degenerate input. Each
        // case is a copy of the passing study with one field poisoned.
        for non_finite in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY] {
            let poisoned = [
                (
                    "per_step_drift_max",
                    ReproducibilityStudyResult { per_step_drift_max: non_finite, ..v1_observed },
                ),
                (
                    "random_walk_epsilon",
                    ReproducibilityStudyResult { random_walk_epsilon: non_finite, ..v1_observed },
                ),
                (
                    "cosine_sim_worst",
                    ReproducibilityStudyResult { cosine_sim_worst: non_finite, ..v1_observed },
                ),
                (
                    "final_loss_range",
                    ReproducibilityStudyResult { final_loss_range: non_finite, ..v1_observed },
                ),
            ];
            for (field_name, study) in poisoned {
                assert_eq!(
                    verdict_from_reproducibility_study(&study),
                    Gputrain006Verdict::Fail,
                    "non-finite ({non_finite}) in {field_name} must Fail",
                );
            }
        }

        // Section 5: negative ranges (caller bug — forgot abs()). A
        // negative value is trivially below every ceiling, so only the
        // dedicated non-negativity guard can reject it; each of the
        // three spread fields is exercised so that dropping any one
        // clause of that guard breaks this test.
        let mut neg = v1_observed;
        neg.per_step_drift_max = -1e-4;
        assert_eq!(
            verdict_from_reproducibility_study(&neg),
            Gputrain006Verdict::Fail,
            "negative per_step_drift_max must Fail (raw a-b leaked, not |a-b|)",
        );
        let neg_eps = ReproducibilityStudyResult { random_walk_epsilon: -1e-4, ..v1_observed };
        assert_eq!(
            verdict_from_reproducibility_study(&neg_eps),
            Gputrain006Verdict::Fail,
            "negative random_walk_epsilon must Fail",
        );
        let neg_range = ReproducibilityStudyResult { final_loss_range: -1e-4, ..v1_observed };
        assert_eq!(
            verdict_from_reproducibility_study(&neg_range),
            Gputrain006Verdict::Fail,
            "negative final_loss_range must Fail",
        );

        // Section 6: cosine similarity range guard. Reproducible traces
        // give ~1.0; any value outside [0, 1+ULP] is a caller bug that
        // must Fail.
        for bad_cos in [-0.5_f32, -1.0_f32, 1.5_f32, 100.0_f32] {
            let s = ReproducibilityStudyResult { cosine_sim_worst: bad_cos, ..v1_observed };
            assert_eq!(
                verdict_from_reproducibility_study(&s),
                Gputrain006Verdict::Fail,
                "cosine_sim_worst out-of-range ({bad_cos}) must Fail",
            );
        }

        // Section 7: cosine similarity at exactly 1.0 (identical traces)
        // must Pass. ULP overshoot above 1.0 (FP32 inner product on
        // identical vectors) must also Pass — the verdict allows up to
        // 1.0001 for that exact reason.
        let identical = ReproducibilityStudyResult {
            per_step_drift_max: 0.0,
            random_walk_epsilon: 0.0,
            cosine_sim_worst: 1.0,
            final_loss_range: 0.0,
        };
        assert_eq!(
            verdict_from_reproducibility_study(&identical),
            Gputrain006Verdict::Pass,
            "perfect identity (cos=1.0, all drift=0) must Pass",
        );
        let identity_ulp =
            ReproducibilityStudyResult { cosine_sim_worst: 1.000_000_1, ..identical };
        assert_eq!(
            verdict_from_reproducibility_study(&identity_ulp),
            Gputrain006Verdict::Pass,
            "FP32 cos_sim ULP overshoot above 1.0 (identity reduction) must Pass",
        );

        // Section 8: provenance pin — the 4 constants are load-bearing
        // and lockstep with the YAML contract rule and the empirical
        // evidence file.  Any future ratchet (tighten after better
        // determinism lands) or relaxation (a hardware regression) must
        // move ALL of: the constant, the YAML rule, and the v2 evidence
        // file together. Triple-pinned to prevent silent drift.
        assert!(
            (AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR - 1.0e-3).abs() < 1e-9,
            "AC_GPUTRAIN_006_PER_STEP_DRIFT_FLOOR is 1.0e-3 \
             (provenance: evidence/task-132/gputrain-006-empirical-v1.json)",
        );
        assert!(
            (AC_GPUTRAIN_006_RANDOM_WALK_EPSILON - 3.0e-4).abs() < 1e-9,
            "AC_GPUTRAIN_006_RANDOM_WALK_EPSILON is 3.0e-4",
        );
        assert!(
            (AC_GPUTRAIN_006_COSINE_SIM_FLOOR - 0.999_999_99_f32).abs() < 1e-12,
            "AC_GPUTRAIN_006_COSINE_SIM_FLOOR is 0.999_999_99",
        );
        assert!(
            (AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR - 2.0e-3).abs() < 1e-9,
            "AC_GPUTRAIN_006_FINAL_LOSS_RANGE_FLOOR is 2.0e-3",
        );
    }
}