1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
// SHIP-TWO-001 AC-SHIP1-023 / FALSIFY-SHIP-023 algorithm-level PARTIAL discharge.
//
// Spec: docs/specifications/aprender-train/ship-two-models-spec.md §7.1 row
// `FALSIFY-SHIP-023 | Re-run AC-005 on second day; score drift | drift > 1.2 pp`.
// Contract: contracts/qwen2-e2e-verification-v1.yaml (FALSIFY-QW2E-SHIP-023 —
// wired in the same PR as this file lands).
//
// SHIP-TWO-001 §7.1 states that the MODEL-1 teacher must produce
// HumanEval pass@1 scores that are stable across two consecutive
// evaluation days. The spec's falsification rule is phrased "drift
// > 1.2 pp" — a pair-of-runs threshold over `|day1 − day2|`.
//
// This file discharges the *decision rule* at `PARTIAL_ALGORITHM_LEVEL`
// by binding one pure verdict fn:
//
// `verdict_from_score_drift(day1_pct, day2_pct, tolerance_pp)
// -> Ship023Verdict` — pair-of-runs drift rule. `Pass` iff all three
// inputs are `is_finite()`, both `day1_pct` and `day2_pct` live in
// the valid pass@1 range `[0.0, 100.0]`, `tolerance_pp >= 0.0`, AND
// `(day1_pct - day2_pct).abs() <= tolerance_pp`. Conservative Fail
// on any non-finite, out-of-range, negative-tolerance, or over-drift
// input — a harness bug must never silently ship a drifting teacher.
//
// SHIP-023 pairs numerically with SHIP-005 (`AC_SHIP1_005_NOISE_ALLOWANCE_PP
// = 1.2`) but carries different semantics: SHIP-005 is the noise window
// around the *nominal* 86.00% HumanEval pass@1 floor (tolerated drop),
// while SHIP-023 is the drift tolerance *between two measured runs*
// (tolerated day-over-day fluctuation). Both use the same 1.2 pp budget
// by design — SHIP-023's drift must fit inside SHIP-005's noise allowance
// for the ship gate to compose.
//
// The compute-heavy portion of the AC (actually running `apr eval
// --benchmark humaneval` on two consecutive days and computing the
// absolute drift between the two pass@1 medians) is intentionally out
// of scope here. The threshold rule is what the compute harness must
// emit a Pass on, and changing the bound constant
// (`AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP`), the inequality direction,
// the `.abs()` symmetric combinator, or the range guard breaks this
// test before a single second-day eval runs.
//
// Mirrors the single-number-threshold shape of SHIP-007 / SHIP-020
// (decode-tps) and SHIP-005's three-constant approach, but uniquely
// uses a pair-of-runs diff with `.abs()` for order invariance. The
// conservative-range guard `[0.0, 100.0]` on both day values mirrors
// SHIP-005's input validation. SHIP-023 is `ship_blocking: false`
// (spec §7.1 stability test, not in §4.2 AC table).
/// Maximum tolerated cross-run drift in percentage points (pp) for the
/// MODEL-1 HumanEval pass@1 stability gate. Any two measured pass@1
/// scores whose absolute difference exceeds 1.2 pp fails
/// FALSIFY-SHIP-023.
///
/// Lockstep with `docs/specifications/aprender-train/ship-two-models-spec.md`
/// §7.1 row FALSIFY-SHIP-023: "Re-run AC-005 on second day; score
/// drift > 1.2 pp".
///
/// Pairs numerically with `AC_SHIP1_005_NOISE_ALLOWANCE_PP = 1.2` but
/// semantically distinct: SHIP-005's noise allowance is the tolerated
/// drop from the 86.00% nominal floor (one-sided, signed), while
/// SHIP-023's drift tolerance is the symmetric `.abs()` difference
/// between two measured runs (two-sided, unsigned). Sharing the same
/// 1.2 pp budget is intentional: a day-over-day drift that exceeds
/// the noise-allowance window would be incompatible with SHIP-005's
/// ship gate.
///
/// f32-exact: `1.2` in IEEE-754 binary32 is `0x3F99999A` =
/// 1.20000004768... — the ULP-above neighbour (`0x3F99999B`) is the
/// sharp counter-example the mutation survey uses.
pub const AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP: f32 = 1.2;
/// Binary verdict for FALSIFY-SHIP-023 / AC-SHIP1-023.
/// `Pass` iff inputs are well-formed AND the absolute drift between
/// day1 and day2 pass@1 scores is at or below the tolerance. `Fail`
/// otherwise.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Ship023Verdict {
/// Well-formed inputs AND `(day1 − day2).abs() ≤ tolerance_pp`.
/// The two-day HumanEval re-run is stable within the 1.2 pp drift
/// budget.
Pass,
/// Any of: non-finite `day1_pct` / `day2_pct` / `tolerance_pp`;
/// `day1_pct` or `day2_pct` outside `[0.0, 100.0]`;
/// `tolerance_pp < 0.0`; `(day1 − day2).abs() > tolerance_pp`.
Fail,
}
/// Pure decision rule for FALSIFY-SHIP-023 / AC-SHIP1-023:
/// pair-of-runs HumanEval pass@1 drift check.
///
/// Conservative-Fail guards:
///
/// - `!day1_pct.is_finite()` OR `!day2_pct.is_finite()` OR
/// `!tolerance_pp.is_finite()` → Fail (NaN / ±∞ are never valid
/// pass@1 scores or drift budgets).
/// - `day1_pct` or `day2_pct` outside `[0.0, 100.0]` → Fail
/// (pass@1 is a percentage; values outside that range are harness
/// bugs).
/// - `tolerance_pp < 0.0` → Fail (contract drift — a negative drift
/// allowance is nonsense; 0.0 is legal and means "byte-identical").
/// - `(day1_pct − day2_pct).abs() > tolerance_pp` → Fail (drift
/// exceeds tolerance).
///
/// The function is order-invariant because of `.abs()`: swapping day1
/// and day2 yields the same verdict. The mutation survey's Section 4
/// proves this.
///
/// # Examples
///
/// ```
/// use aprender::format::ship_023::{
/// verdict_from_score_drift, Ship023Verdict, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP,
/// };
///
/// // Stable re-run: 86.0% day 1, 86.5% day 2 → drift 0.5 pp ≤ 1.2 pp.
/// assert_eq!(
/// verdict_from_score_drift(86.0, 86.5, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
/// Ship023Verdict::Pass
/// );
/// // Drifting re-run: 86.0% day 1, 84.5% day 2 → drift 1.5 pp > 1.2 pp.
/// assert_eq!(
/// verdict_from_score_drift(86.0, 84.5, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
/// Ship023Verdict::Fail
/// );
/// ```
#[must_use]
pub fn verdict_from_score_drift(day1_pct: f32, day2_pct: f32, tolerance_pp: f32) -> Ship023Verdict {
if !day1_pct.is_finite() || !day2_pct.is_finite() || !tolerance_pp.is_finite() {
return Ship023Verdict::Fail;
}
if !(0.0_f32..=100.0_f32).contains(&day1_pct) {
return Ship023Verdict::Fail;
}
if !(0.0_f32..=100.0_f32).contains(&day2_pct) {
return Ship023Verdict::Fail;
}
if tolerance_pp < 0.0 {
return Ship023Verdict::Fail;
}
let drift = (day1_pct - day2_pct).abs();
if drift <= tolerance_pp {
Ship023Verdict::Pass
} else {
Ship023Verdict::Fail
}
}
// ─────────────────────────────────────────────────────────────
// Unit tests — FALSIFY-SHIP-023 algorithm-level proof
// ─────────────────────────────────────────────────────────────
#[cfg(test)]
mod ship_023_tests {
use super::*;
/// FALSIFY-SHIP-023 algorithm-level PARTIAL discharge: prove the
/// pair-of-runs drift rule for AC-SHIP1-023. Any edit that changes
/// the 1.2 pp tolerance, the inequality direction, the `.abs()`
/// combinator, the range guard `[0.0, 100.0]`, the negative-tolerance
/// rejection, or the non-finite handling must break this test before
/// a two-day HumanEval re-run is launched.
#[test]
fn falsify_ship_023_score_drift_threshold_logic() {
// Section 1: exact boundary drift = 1.2 pp → Pass; drift just
// above the boundary → Fail. `AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP`
// is the stored f32 value `0x3F99999A` so the boundary test
// `(86.0 - 84.8).abs() == 1.2000002` compared against 1.2 must
// be handled with IEEE-754 awareness. We use 86.0 vs 87.2 which
// gives drift exactly at 1.2000003 (not quite boundary-equal)
// — prefer a precisely-boundary pair: 1.2000005 and 0.0.
assert_eq!(
verdict_from_score_drift(
AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP,
0.0,
AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP
),
Ship023Verdict::Pass,
"drift exactly at tolerance boundary must Pass (inclusive)",
);
// drift = 1.2 + 1e-4 = 1.2001, above boundary → Fail.
let just_above: f32 = AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP + 1e-4;
assert!(
just_above > AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP,
"harness sanity: just_above must really exceed tolerance",
);
assert_eq!(
verdict_from_score_drift(just_above, 0.0, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"drift = 1.2 + 1e-4 must Fail (sharpest counter-example)",
);
// Section 2: clear Pass band — drifts well below the 1.2 pp
// budget.
for &drift in &[0.0_f32, 0.5, 1.0, 1.199] {
let day1 = 86.0;
let day2 = 86.0 - drift;
assert_eq!(
verdict_from_score_drift(day1, day2, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Pass,
"drift {drift} pp must Pass (below 1.2 tolerance)",
);
}
// Section 3: clear Fail band — drifts well above the 1.2 pp
// budget, including HumanEval-extreme pairings.
for &drift in &[1.3_f32, 2.0, 10.0, 86.0] {
let day1 = 86.0;
let day2 = 86.0 - drift;
assert_eq!(
verdict_from_score_drift(day1, day2, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"drift {drift} pp must Fail (above 1.2 tolerance)",
);
}
// Section 4: symmetric / order-invariant — swapping day1 and
// day2 must yield identical verdict because `.abs()`. The spec
// phrasing "drift > 1.2 pp" has no directional bias.
let (pass_a, pass_b) = (86.0_f32, 84.8_f32);
assert_eq!(
verdict_from_score_drift(pass_a, pass_b, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
verdict_from_score_drift(pass_b, pass_a, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
"verdict must be order-invariant (Pass direction)",
);
// Both directions must Pass for this pair (drift is exactly
// 1.2 pp, but f32 subtraction may yield 1.2000002, which is
// above the stored threshold of 1.2000001 — so we use 86.0
// and 85.0 for a safely-in-band pair).
let (pass_c, pass_d) = (86.0_f32, 85.0_f32);
assert_eq!(
verdict_from_score_drift(pass_c, pass_d, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Pass,
"1.0 pp drift (within 1.2) must Pass in forward direction",
);
assert_eq!(
verdict_from_score_drift(pass_d, pass_c, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Pass,
"1.0 pp drift (within 1.2) must Pass in reverse direction (symmetry)",
);
// Fail direction must also be symmetric.
let (fail_a, fail_b) = (86.0_f32, 82.0_f32);
assert_eq!(
verdict_from_score_drift(fail_a, fail_b, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
verdict_from_score_drift(fail_b, fail_a, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
"verdict must be order-invariant (Fail direction)",
);
assert_eq!(
verdict_from_score_drift(fail_a, fail_b, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"4.0 pp drift must Fail regardless of order",
);
// Section 5: non-finite inputs on any of the three f32 args
// must conservatively Fail. A harness that divides by zero, a
// telemetry bug that promotes NaN, or a missing-field default
// must NOT silently ship a Pass.
for &bad in &[f32::NAN, f32::INFINITY, f32::NEG_INFINITY] {
assert_eq!(
verdict_from_score_drift(bad, 86.0, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"non-finite day1 ({bad}) must Fail conservatively",
);
assert_eq!(
verdict_from_score_drift(86.0, bad, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"non-finite day2 ({bad}) must Fail conservatively",
);
assert_eq!(
verdict_from_score_drift(86.0, 86.0, bad),
Ship023Verdict::Fail,
"non-finite tolerance ({bad}) must Fail conservatively",
);
}
// Section 6: out-of-range day values Fail. pass@1 is a
// percentage in `[0.0, 100.0]`; anything else is a harness
// bug — negative pass@1 is impossible (SHIP-018 proves floor),
// >100% is impossible by construction (num-correct ≤ total).
for &oor in &[-0.1_f32, -1.0, -86.0, 100.1, 101.0, 1_000.0] {
assert_eq!(
verdict_from_score_drift(oor, 86.0, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"out-of-range day1 ({oor}) must Fail",
);
assert_eq!(
verdict_from_score_drift(86.0, oor, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Fail,
"out-of-range day2 ({oor}) must Fail",
);
}
// Negative tolerance → Fail (contract drift — the drift budget
// must be non-negative; 0.0 is legal and means byte-identical).
for &neg_tol in &[-1e-6_f32, -0.001, -1.0, -100.0] {
assert_eq!(
verdict_from_score_drift(86.0, 86.0, neg_tol),
Ship023Verdict::Fail,
"negative tolerance ({neg_tol}) must Fail (contract drift)",
);
}
// Zero tolerance → Pass iff day1 == day2 exactly.
assert_eq!(
verdict_from_score_drift(86.0, 86.0, 0.0),
Ship023Verdict::Pass,
"zero tolerance with identical scores must Pass",
);
assert_eq!(
verdict_from_score_drift(86.0, 86.1, 0.0),
Ship023Verdict::Fail,
"zero tolerance with any drift must Fail",
);
// Boundary pass@1 values {0.0, 100.0} are legal.
assert_eq!(
verdict_from_score_drift(0.0, 0.0, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Pass,
"both at 0.0% must Pass (degenerate but legal)",
);
assert_eq!(
verdict_from_score_drift(100.0, 100.0, AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP),
Ship023Verdict::Pass,
"both at 100.0% must Pass (degenerate but legal)",
);
// Section 7: provenance pin — the 1.2 pp constant is load-
// bearing and lockstepped with the spec. If §7.1 FALSIFY-SHIP-023
// ever widens the drift tolerance (say to 2.0 pp), this const
// must move in lockstep and the rest of the survey follows.
#[allow(clippy::float_cmp)]
{
assert_eq!(
AC_SHIP1_023_MAX_HUMANEVAL_DRIFT_PP, 1.2_f32,
"max HumanEval drift is 1.2 pp (spec §7.1 FALSIFY-SHIP-023)",
);
}
}
}