1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
// SHIP-TWO-001 §6 Compound Ship Gates — GATE-SHIP-004 algorithm-level
// PARTIAL discharge.
//
// Spec: docs/specifications/aprender-train/ship-two-models-spec.md §6 row
// `GATE-SHIP-004 | HumanEval harness produces identical score on two
// consecutive runs (seed=0) | AC-005, AC-008`.
// Contract: contracts/compound-ship-gates-v1.yaml v1.0.0 PROPOSED
// (FALSIFY-GATE-SHIP-004 — wired in the same PR as this file lands).
//
// GATE-SHIP-004 is the *determinism* gate for HumanEval: with seed=0
// and greedy decoding, two consecutive `apr eval --benchmark humaneval`
// runs must produce BITWISE-IDENTICAL pass@1 percentages. This is
// STRICTER than FALSIFY-SHIP-023 (which allows ≤ 1.2 pp drift across
// two days); GATE-SHIP-004 enforces bit-for-bit determinism within a
// single session. If two seed=0 runs produce different bytes, either
// (a) the sampling path is non-deterministic (hidden entropy source),
// or (b) the eval harness has a race.
//
// This file discharges the *decision rule* at `PARTIAL_ALGORITHM_LEVEL`:
// given two pass@1 percentages (f32), the verdict is `Pass` iff
// `to_bits()` returns identical u32 AND both values are finite AND in
// `[0.0, 100.0]`. A single-ULP drift (even within floating-point
// rounding noise) Fails — because a truly deterministic harness with
// seed=0 must produce byte-equal outputs on every invocation.
//
// Contrast with FALSIFY-SHIP-023 (two-day drift tolerance 1.2 pp):
// - SHIP-023 tolerates natural day-over-day noise (temperature,
// page cache, etc.) — semantically "stability across sessions".
// - GATE-SHIP-004 tolerates ZERO drift — semantically "pure
// determinism within a session".
// Both gates must pass to ship; GATE-SHIP-004 is the sharper one.
/// Outcome of the FALSIFY-GATE-SHIP-004 / GATE-SHIP-004 determinism check.
///
/// The verdict is two-valued: `Pass` requires both pass@1 scores to be
/// well-formed and to share the exact same `to_bits()` pattern; every
/// other situation is `Fail`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GateShip004Verdict {
    /// The two consecutive seed=0 HumanEval runs agree bit-for-bit:
    /// both scores are finite, both lie in `[0.0, 100.0]`, and
    /// `run_a.to_bits() == run_b.to_bits()`. The eval harness behaved
    /// deterministically.
    Pass,
    /// The determinism invariant does not hold, so publish is blocked.
    /// Raised when either score is non-finite, when either score is
    /// outside `[0.0, 100.0]`, or when the `to_bits()` patterns differ
    /// (a single-ULP gap is enough).
    Fail,
}
/// Decides FALSIFY-GATE-SHIP-004 / GATE-SHIP-004 for a pair of seed=0
/// HumanEval pass@1 percentages by demanding bitwise identity.
///
/// This is deliberately not the day-over-day rule
/// (`AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP = 1.2`). Two seed=0 runs inside
/// one session are expected to emit byte-equal output, so no drift at
/// all is tolerated: even `f32::from_bits(x.to_bits() + 1)` versus `x`
/// Fails. The goal is to expose hidden non-determinism (an entropy
/// source, a race, an accidental atomic-counter read), not to absorb
/// noise.
///
/// The function returns [`GateShip004Verdict::Pass`] only when all of
/// the following hold, and [`GateShip004Verdict::Fail`] otherwise:
///
/// 1. `run_a_pct` and `run_b_pct` are both finite (NaN and ±∞ are never
///    valid pass@1 scores).
/// 2. Both values lie in the closed range `[0.0, 100.0]` (pass@1 is a
///    percentage).
/// 3. `run_a_pct.to_bits() == run_b_pct.to_bits()`.
///
/// Because the final comparison is on bit patterns rather than `==`,
/// `0.0` versus `-0.0` Fails: `-0.0` satisfies the range check (it
/// compares equal to `0.0`) but its sign bit differs.
///
/// # Examples
///
/// ```
/// use aprender::format::gate_ship_004::{
///     verdict_from_identical_humaneval_scores, GateShip004Verdict,
/// };
///
/// // Identical seed=0 runs → Pass.
/// assert_eq!(
///     verdict_from_identical_humaneval_scores(86.0, 86.0),
///     GateShip004Verdict::Pass
/// );
///
/// // Close but not bitwise-equal (single ULP) → Fail.
/// let run_a = 86.0_f32;
/// let run_b = f32::from_bits(run_a.to_bits() + 1);
/// assert_eq!(
///     verdict_from_identical_humaneval_scores(run_a, run_b),
///     GateShip004Verdict::Fail
/// );
/// ```
#[must_use]
pub fn verdict_from_identical_humaneval_scores(
    run_a_pct: f32,
    run_b_pct: f32,
) -> GateShip004Verdict {
    // A score is well-formed when it is a finite percentage.
    let is_well_formed =
        |score: f32| score.is_finite() && (0.0_f32..=100.0_f32).contains(&score);

    let both_well_formed = is_well_formed(run_a_pct) && is_well_formed(run_b_pct);
    let bit_identical = run_a_pct.to_bits() == run_b_pct.to_bits();

    // Conservative-Fail: anything short of "well-formed AND bit-identical"
    // blocks the gate.
    if both_well_formed && bit_identical {
        GateShip004Verdict::Pass
    } else {
        GateShip004Verdict::Fail
    }
}
// ─────────────────────────────────────────────────────────────
// Unit tests — FALSIFY-GATE-SHIP-004 algorithm-level proof
// ─────────────────────────────────────────────────────────────
#[cfg(test)]
mod gate_ship_004_tests {
    use super::*;

    /// FALSIFY-GATE-SHIP-004 algorithm-level PARTIAL discharge: prove
    /// the bitwise-identity determinism rule binding two seed=0
    /// HumanEval pass@1 runs. Any edit that relaxes `to_bits()` to
    /// `==` (which treats +0.0 == -0.0), widens to drift-tolerance
    /// like SHIP-023, or skips the range guards must break this test.
    ///
    /// The `==` relaxation is caught by the signed-zero section: among
    /// finite, in-range inputs, +0.0 vs -0.0 is the only pair on which
    /// `==` and `to_bits()` equality disagree.
    #[test]
    fn falsify_gate_ship_004_humaneval_bitwise_determinism() {
        // Section 1: identical seed=0 runs → Pass.
        assert_eq!(
            verdict_from_identical_humaneval_scores(86.0, 86.0),
            GateShip004Verdict::Pass,
            "identical 86.0 scores must Pass",
        );

        // Section 2: single-ULP difference → Fail. This is the
        // sharpest counter-example: a relaxation to "close-enough
        // for floats" (e.g., `(a - b).abs() < 1e-7`) would flip this
        // to Pass. GATE-SHIP-004's whole purpose is to detect
        // hidden non-determinism that shows up as a single-ULP
        // difference when a thread-local RNG is accidentally seeded
        // with time, or when atomics expose a race.
        let run_a = 86.0_f32;
        let run_b = f32::from_bits(run_a.to_bits() + 1);
        assert_ne!(
            run_a.to_bits(),
            run_b.to_bits(),
            "harness sanity: single-ULP neighbours have different bits",
        );
        assert_eq!(
            verdict_from_identical_humaneval_scores(run_a, run_b),
            GateShip004Verdict::Fail,
            "single-ULP drift must Fail (this is the whole point of GATE-SHIP-004)",
        );

        // Section 3: close-but-not-equal — 86.0 vs 86.0001. Even
        // within a naive epsilon, this must Fail. Catches the class
        // where a reviewer thinks "eh, 1e-4 is indistinguishable" and
        // relaxes the comparison.
        //
        // The probe has to sit more than half an ULP away from 86.0
        // (the f32 ULP at 86.0 is 2^-17 ≈ 7.6e-6). A literal such as
        // 86.000001 rounds to exactly 86.0 and would make this section
        // vacuous, so the distinctness is asserted, not merely assumed.
        let close_a = 86.0_f32;
        let close_b = 86.000_1_f32;
        assert_ne!(
            close_a.to_bits(),
            close_b.to_bits(),
            "harness sanity: 86.0001 must not round to 86.0 in f32",
        );
        assert!(
            (close_a - close_b).abs() < 1e-3,
            "harness sanity: the pair is within a naive epsilon tolerance",
        );
        assert_eq!(
            verdict_from_identical_humaneval_scores(close_a, close_b),
            GateShip004Verdict::Fail,
            "86.0 vs 86.0001 must Fail (bits differ even if within tolerance)",
        );

        // Section 4: non-finite on either side → Fail. A harness bug
        // that emits NaN must not silently Pass just because NaN
        // compares equal to itself under to_bits (which it does, but
        // we guard earlier on is_finite).
        for &bad in &[f32::NAN, f32::INFINITY, f32::NEG_INFINITY] {
            assert_eq!(
                verdict_from_identical_humaneval_scores(bad, 86.0),
                GateShip004Verdict::Fail,
                "non-finite run_a ({bad}) must Fail conservatively",
            );
            assert_eq!(
                verdict_from_identical_humaneval_scores(86.0, bad),
                GateShip004Verdict::Fail,
                "non-finite run_b ({bad}) must Fail conservatively",
            );
            assert_eq!(
                verdict_from_identical_humaneval_scores(bad, bad),
                GateShip004Verdict::Fail,
                "both non-finite ({bad}) must Fail (NaN bit-equality is not determinism)",
            );
        }

        // Section 5: out-of-range values Fail. pass@1 is a percentage
        // in `[0.0, 100.0]`; -0.1 or 100.1 on either side is a harness
        // bug.
        for &oor in &[-0.1_f32, -1.0, -86.0, 100.1, 101.0, 1_000.0] {
            assert_eq!(
                verdict_from_identical_humaneval_scores(oor, 86.0),
                GateShip004Verdict::Fail,
                "out-of-range run_a ({oor}) must Fail",
            );
            assert_eq!(
                verdict_from_identical_humaneval_scores(86.0, oor),
                GateShip004Verdict::Fail,
                "out-of-range run_b ({oor}) must Fail",
            );
            // Even if both sides are OOR and bit-equal, the range
            // guard fires first — degenerate Pass path blocked.
            assert_eq!(
                verdict_from_identical_humaneval_scores(oor, oor),
                GateShip004Verdict::Fail,
                "both out-of-range ({oor}) must Fail (range-guard-first)",
            );
        }

        // Section 6: boundary values {0.0, 100.0} are legal pass@1
        // scores. Degenerate cases (pass@1 = 0% or pass@1 = 100%)
        // must Pass when both runs produce the same boundary value.
        assert_eq!(
            verdict_from_identical_humaneval_scores(0.0, 0.0),
            GateShip004Verdict::Pass,
            "both at 0.0% must Pass (degenerate but legal)",
        );
        assert_eq!(
            verdict_from_identical_humaneval_scores(100.0, 100.0),
            GateShip004Verdict::Pass,
            "both at 100.0% must Pass (degenerate but legal)",
        );
        // But 0.0 vs 100.0 is maximally non-deterministic → Fail.
        assert_eq!(
            verdict_from_identical_humaneval_scores(0.0, 100.0),
            GateShip004Verdict::Fail,
            "0.0 vs 100.0 must Fail (max drift across band)",
        );

        // Section 7: signed zero. -0.0 gets through the finite and
        // range guards (it compares equal to 0.0), so the ONLY thing
        // rejecting a +0.0 / -0.0 pair is the bit-pattern comparison.
        // Relaxing `to_bits()` equality to float `==` would flip both
        // assertions below to Pass.
        let pos_zero = 0.0_f32;
        let neg_zero = -0.0_f32;
        assert_ne!(
            pos_zero.to_bits(),
            neg_zero.to_bits(),
            "harness sanity: +0.0 and -0.0 have different bits",
        );
        assert_eq!(
            verdict_from_identical_humaneval_scores(pos_zero, neg_zero),
            GateShip004Verdict::Fail,
            "+0.0 vs -0.0 must Fail (bitwise identity, not float equality)",
        );
        assert_eq!(
            verdict_from_identical_humaneval_scores(neg_zero, pos_zero),
            GateShip004Verdict::Fail,
            "-0.0 vs +0.0 must Fail (bitwise identity, not float equality)",
        );

        // Section 8: provenance — GATE-SHIP-004 is DISTINCT from
        // FALSIFY-SHIP-023 (which uses `AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP
        // = 1.2` for day-over-day tolerance). Document the distinction
        // in a test assertion so a careless refactor that unifies
        // them gets caught.
        //
        // SHIP-023 rule: `(day1 - day2).abs() <= 1.2 pp` Pass.
        // GATE-SHIP-004 rule: `run_a.to_bits() == run_b.to_bits()`
        // Pass. A value that would Pass SHIP-023 (drift = 0.5 pp)
        // must Fail GATE-SHIP-004 (bits differ).
        let drift_05_a = 86.0_f32;
        let drift_05_b = 86.5_f32;
        assert_eq!(
            verdict_from_identical_humaneval_scores(drift_05_a, drift_05_b),
            GateShip004Verdict::Fail,
            "0.5 pp drift (Pass under SHIP-023) must Fail under GATE-SHIP-004 \
             (bitwise-identity is strictly stricter than 1.2 pp tolerance)",
        );
    }
}