1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
// SHIP-TWO-001 AC-SHIP1-003 / FALSIFY-SHIP-003 algorithm-level PARTIAL discharge.
//
// Spec: docs/specifications/aprender-train/ship-two-models-spec.md
// Contract: contracts/qwen2-e2e-verification-v1.yaml (FALSIFY-QW2E-SHIP-003 —
// wired in the same PR as this file lands).
//
// AC-SHIP1-003 states that the MODEL-1 teacher must survive an
// `apr convert --quantize q4_k_m` round-trip with **every per-layer
// weight tensor preserving cosine similarity ≥ 0.999** between the
// original f32/f16 weights and the dequantized q4_k_m weights. The
// spec's falsification rule is phrased as "any layer cos < 0.999"
// — an aggregate-AND over the per-layer cosine vector.
//
// This file discharges the *decision rule* at `PARTIAL_ALGORITHM_LEVEL`
// by binding two pure verdict fns:
//
// 1. `verdict_from_cosine_similarity(sim, threshold) -> Ship003Verdict`
// — the single-layer rule. `Pass` iff `sim >= threshold` AND the
// similarity is a finite value in the cosine range `[-1.0, 1.0]`
// AND the threshold is a finite value in `[-1.0, 1.0]`. Conservative
// Fail on any non-finite or out-of-range input (a harness bug must
// never silently ship a broken quantizer).
//
// 2. `verdict_from_per_layer_cosines(sims, threshold) -> Ship003Verdict`
// — the aggregate-AND rule. `Pass` iff the per-layer vector is
// non-empty AND every layer's verdict is Pass. `Fail` on the
// first failing layer, or on the empty input (a round-trip that
// produced zero per-layer measurements is evidence of a broken
// harness, not a quantizer success).
//
// The compute-heavy portion of the AC (running `apr convert --quantize
// q4_k_m` against the 7B teacher and measuring per-layer cosines across
// all 28 transformer blocks × {q,k,v,o,gate,up,down,norm} projections)
// is intentionally out of scope here. The threshold rule and the
// aggregate combinator are what the compute harness must emit a Pass
// on, and changing either side of the bind (the 0.999 floor, the range
// guard, the aggregate-AND shape, or the non-empty requirement) breaks
// this test before a single quantization run is launched.
//
// Mirrors the single-number-threshold shape of SHIP-007 / SHIP-020
// (AC_SHIP?_???_MIN_DECODE_TPS_*) and the aggregate-AND shape of
// SHIP-016 (`verdict_from_qa_gates(&[bool])`). Unlike the integer-
// threshold ships (SHIP-005 pass@1, SHIP-017/002 syntax error count),
// the cosine-similarity floor is a float in `[0.0, 1.0]` and demands
// an explicit range guard on top of the usual non-finite / monotonicity
// rules. MODEL-1 is now at 6/10 AC-SHIP1 items touched (SHIP-008 +
// SHIP-009 + SHIP-006 + SHIP-007 + SHIP-005 + SHIP-010) before this
// lands; SHIP-003 brings it to 7/10.
/// Minimum per-layer cosine similarity threshold for the MODEL-1
/// `apr convert --quantize q4_k_m` round-trip. Any transformer-block
/// weight tensor whose cosine similarity between the original f32/f16
/// value and the dequantized q4_k_m value falls below 0.999 fails
/// FALSIFY-SHIP-003.
///
/// Lockstep with `docs/specifications/aprender-train/ship-two-models-spec.md`
/// §4.2 row AC-SHIP1-003: "Convert to APR via `apr convert --quantize
/// q4_k_m`; round-trip weights match (cos ≥ 0.999)".
///
/// f32-exact: `0.999` is representable in IEEE-754 binary32 as the
/// closest-round value `0x3F7FBE77` = 0.99900001287460327148... — the
/// ULP-below neighbour is `0x3F7FBE76` = 0.99899995326995849609...,
/// which the mutation survey uses as a sharp counter-example.
pub const AC_SHIP1_003_MIN_COSINE_SIMILARITY: f32 = 0.999;
/// Binary verdict for FALSIFY-SHIP-003 / AC-SHIP1-003.
/// `Pass` iff every input is well-formed AND the cosine similarity
/// is at or above the threshold (single-layer) OR every per-layer
/// verdict is Pass (aggregate-AND). `Fail` otherwise.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Ship003Verdict {
/// Well-formed inputs AND `sim ≥ threshold` (single-layer) OR
/// every layer Passes (aggregate). The round-trip preserves the
/// AC-SHIP1-003 floor on this layer / all layers.
Pass,
/// Any of: `sim < threshold`; non-finite `sim` or `threshold`;
/// `sim` or `threshold` outside `[-1.0, 1.0]`; empty per-layer
/// vector (aggregate); any layer Fails (aggregate).
Fail,
}
/// Single-layer verdict rule for FALSIFY-SHIP-003 / AC-SHIP1-003:
/// per-layer cosine similarity threshold check. Returns
/// [`Ship003Verdict::Fail`] conservatively for any malformed input so
/// that a harness bug (NaN propagation through the dot product, a
/// quantizer that silently emits a sentinel value, a telemetry bug
/// that clips to infinity) cannot silently ship a failing quantized
/// model.
///
/// Conservative-Fail guards:
///
/// - `!sim.is_finite()` → Fail (NaN / ±∞ are never valid cosines).
/// - `!threshold.is_finite()` → Fail (contract drift — no real
/// cosine floor can be non-finite).
/// - `sim` outside `[-1.0, 1.0]` → Fail (mathematical nonsense; a
/// real cosine between two vectors cannot exceed 1.0 or undercut
/// −1.0 — numerical drift slightly above 1.0 is still rejected
/// because it indicates a broken inner-product routine).
/// - `threshold` outside `[-1.0, 1.0]` → Fail (contract drift — the
/// threshold is a cosine and must live in the cosine range).
#[must_use]
pub fn verdict_from_cosine_similarity(sim: f32, threshold: f32) -> Ship003Verdict {
if !sim.is_finite() {
return Ship003Verdict::Fail;
}
if !threshold.is_finite() {
return Ship003Verdict::Fail;
}
if !(-1.0_f32..=1.0_f32).contains(&sim) {
return Ship003Verdict::Fail;
}
if !(-1.0_f32..=1.0_f32).contains(&threshold) {
return Ship003Verdict::Fail;
}
if sim >= threshold {
Ship003Verdict::Pass
} else {
Ship003Verdict::Fail
}
}
/// Aggregate-AND verdict rule for FALSIFY-SHIP-003 / AC-SHIP1-003:
/// "any layer cos < 0.999" is a ship-blocker, so the whole model is
/// a `Pass` iff **every** per-layer verdict is a `Pass`.
///
/// Conservative-Fail on empty input: a round-trip that produced zero
/// per-layer measurements is evidence of a broken
/// harness (the tensor-walker never ran, or the convert step silently
/// dropped every layer) — not a quantizer success. Mirrors the
/// aggregate-AND shape of SHIP-016 `verdict_from_qa_gates` and avoids
/// the "vacuously true" trap.
#[must_use]
pub fn verdict_from_per_layer_cosines(sims: &[f32], threshold: f32) -> Ship003Verdict {
if sims.is_empty() {
return Ship003Verdict::Fail;
}
for &sim in sims {
if verdict_from_cosine_similarity(sim, threshold) == Ship003Verdict::Fail {
return Ship003Verdict::Fail;
}
}
Ship003Verdict::Pass
}
// ─────────────────────────────────────────────────────────────
// Unit tests — FALSIFY-SHIP-003 algorithm-level proof
// ─────────────────────────────────────────────────────────────
#[cfg(test)]
mod ship_003_tests {
use super::*;
/// FALSIFY-SHIP-003 single-layer algorithm-level PARTIAL discharge:
/// prove the cosine-threshold rule for AC-SHIP1-003. Any edit that
/// changes the 0.999 floor, the inequality direction, the range
/// guard `[-1.0, 1.0]`, or the non-finite handling must break this
/// test before a teacher `apr convert --quantize q4_k_m` quality
/// run is launched.
#[test]
fn falsify_ship_003_cosine_similarity_threshold_logic() {
// Section 1: exact boundary 0.999 → Pass. `0.999` in f32 is
// `0x3F7FBE77` = 0.99900001287460327148... — this IS the const
// value, so the boundary test is `sim == threshold` and must
// Pass.
assert_eq!(
verdict_from_cosine_similarity(
AC_SHIP1_003_MIN_COSINE_SIMILARITY,
AC_SHIP1_003_MIN_COSINE_SIMILARITY
),
Ship003Verdict::Pass,
"sim == threshold at the boundary must Pass",
);
// Section 2: one ULP below the threshold → Fail. This is the
// sharpest single-layer counter-example — a quantizer that
// lands at the next representable float below 0.999 must fail
// the ship gate. `f32::from_bits` yields `0x3F7FBE76` =
// 0.99899995326995849609... which is strictly less than the
// stored value of `AC_SHIP1_003_MIN_COSINE_SIMILARITY`.
let one_ulp_below = f32::from_bits(AC_SHIP1_003_MIN_COSINE_SIMILARITY.to_bits() - 1);
assert!(
one_ulp_below < AC_SHIP1_003_MIN_COSINE_SIMILARITY,
"harness sanity: ULP-below must really be less than threshold",
);
assert_eq!(
verdict_from_cosine_similarity(one_ulp_below, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"one ULP below 0.999 must Fail (sharpest counter-example)",
);
// Section 3: safely above — 0.9999, 0.99999, 1.0 all Pass.
assert_eq!(
verdict_from_cosine_similarity(0.9999, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Pass,
"0.9999 must Pass the 0.999 floor",
);
assert_eq!(
verdict_from_cosine_similarity(1.0, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Pass,
"identical cosine 1.0 must Pass (identity quantizer)",
);
// Section 4: safely below — 0.998, 0.5, 0.0, -0.5, -1.0 all Fail.
assert_eq!(
verdict_from_cosine_similarity(0.998, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"0.998 must Fail (0.001 below floor)",
);
assert_eq!(
verdict_from_cosine_similarity(0.5, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"0.5 must Fail",
);
assert_eq!(
verdict_from_cosine_similarity(0.0, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"orthogonal (0.0) must Fail — worst-case quantizer",
);
assert_eq!(
verdict_from_cosine_similarity(-1.0, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"antipodal (-1.0) must Fail — sign-flipped quantizer",
);
// Section 5: monotonicity — sweep sim over a range crossing
// the threshold, verdict flips from Fail → Pass exactly once
// and never flips back. If a future refactor introduces a
// non-monotonic guard (e.g. accidentally rejecting sim == 1.0
// as "suspiciously perfect"), this test breaks.
let mut seen_pass = false;
// Sweep 0.990 .. 1.000 in 1e-4 steps (101 points — enough to
// straddle the 0.999 boundary and surface any non-monotonic
// mutant).
for i in 0..=100 {
#[allow(clippy::cast_precision_loss)]
let sim = 0.990 + (i as f32) * 0.0001;
let v = verdict_from_cosine_similarity(sim, AC_SHIP1_003_MIN_COSINE_SIMILARITY);
if v == Ship003Verdict::Pass {
seen_pass = true;
} else if seen_pass {
panic!("monotonicity broken: sim={sim} flipped back to Fail after Pass");
}
}
// Section 6: non-finite similarity guard. A dot-product
// routine that divides by zero norm, or a tensor with Inf
// entries, must produce a conservative Fail rather than
// silently flipping to Pass via NaN comparison semantics.
assert_eq!(
verdict_from_cosine_similarity(f32::NAN, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"NaN sim must Fail conservatively",
);
assert_eq!(
verdict_from_cosine_similarity(f32::INFINITY, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"+∞ sim must Fail conservatively",
);
assert_eq!(
verdict_from_cosine_similarity(f32::NEG_INFINITY, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"-∞ sim must Fail conservatively",
);
// Non-finite threshold (contract drift) must also Fail.
assert_eq!(
verdict_from_cosine_similarity(1.0, f32::NAN),
Ship003Verdict::Fail,
"NaN threshold must Fail conservatively",
);
assert_eq!(
verdict_from_cosine_similarity(1.0, f32::INFINITY),
Ship003Verdict::Fail,
"+∞ threshold must Fail conservatively",
);
// Section 7: range guard — cosine similarity must live in
// `[-1.0, 1.0]`. Any value outside that range is mathematical
// nonsense (typically a broken inner-product routine that
// didn't normalize, or a precision-loss bug) and must Fail
// even if it's "above" the threshold.
assert_eq!(
verdict_from_cosine_similarity(1.0001, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"sim > 1.0 must Fail (range guard, not Pass)",
);
assert_eq!(
verdict_from_cosine_similarity(2.0, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"sim = 2.0 must Fail (range guard)",
);
assert_eq!(
verdict_from_cosine_similarity(-1.0001, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"sim < -1.0 must Fail (range guard)",
);
// Out-of-range threshold (contract drift) must also Fail.
assert_eq!(
verdict_from_cosine_similarity(1.0, 1.5),
Ship003Verdict::Fail,
"threshold > 1.0 must Fail (contract drift)",
);
assert_eq!(
verdict_from_cosine_similarity(1.0, -2.0),
Ship003Verdict::Fail,
"threshold < -1.0 must Fail (contract drift)",
);
// Section 8: provenance pin — the 0.999 constant is load-
// bearing and lockstepped with the spec. If AC-SHIP1-003 ever
// changes the floor (say to 0.9995 for a tighter ship gate),
// this const must move in lockstep and the rest of the survey
// follows.
#[allow(clippy::float_cmp)]
{
assert_eq!(
AC_SHIP1_003_MIN_COSINE_SIMILARITY, 0.999,
"minimum cosine similarity is 0.999 (spec §4.2 AC-SHIP1-003)",
);
}
}
/// FALSIFY-SHIP-003 aggregate-AND algorithm-level PARTIAL discharge:
/// prove the per-layer aggregate combinator. Any edit that changes
/// the "any layer < 0.999 fails the whole model" shape, or the
/// conservative empty-Fail, must break this test.
#[test]
fn falsify_ship_003_per_layer_aggregate_and() {
// Section 1: all-Pass vector → Pass. Sweep includes exact
// boundary (0.999), well-above (0.9999), and identity (1.0).
assert_eq!(
verdict_from_per_layer_cosines(
&[
AC_SHIP1_003_MIN_COSINE_SIMILARITY,
0.9999,
1.0,
0.9995,
0.99999,
],
AC_SHIP1_003_MIN_COSINE_SIMILARITY
),
Ship003Verdict::Pass,
"vector of all Pass must be aggregate Pass",
);
// Section 2: single-layer failure → aggregate Fail. A
// realistic 28-layer Qwen2.5-7B model would have 28 blocks
// × 7 projections = 196 cosines; we simulate by inserting
// one Fail into an otherwise Pass vector.
let one_ulp_below = f32::from_bits(AC_SHIP1_003_MIN_COSINE_SIMILARITY.to_bits() - 1);
let mut mostly_pass = [AC_SHIP1_003_MIN_COSINE_SIMILARITY; 196];
mostly_pass[100] = one_ulp_below;
assert_eq!(
verdict_from_per_layer_cosines(&mostly_pass, AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"195 Pass + 1 Fail must be aggregate Fail (any-layer rule)",
);
// Section 3: all-Fail vector → aggregate Fail.
assert_eq!(
verdict_from_per_layer_cosines(
&[0.5, 0.7, 0.8, 0.9],
AC_SHIP1_003_MIN_COSINE_SIMILARITY
),
Ship003Verdict::Fail,
"all-Fail must be aggregate Fail",
);
// Section 4: empty vector → conservative Fail. A round-trip
// that produced zero per-layer measurements is a harness bug,
// not a quantizer success. Avoid the "vacuously true"
// aggregate-AND trap.
assert_eq!(
verdict_from_per_layer_cosines(&[], AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"empty per-layer vector must Fail conservatively",
);
// Section 5: single-element vector — behaves as single-layer.
// Smoke test that aggregate doesn't introduce off-by-one or
// special-case the len==1 path.
assert_eq!(
verdict_from_per_layer_cosines(&[1.0], AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Pass,
"single-element all-Pass must be aggregate Pass",
);
assert_eq!(
verdict_from_per_layer_cosines(&[0.5], AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"single-element Fail must be aggregate Fail",
);
// Section 6: first-layer Fail short-circuits — any tainted
// value (NaN, out-of-range) in position 0 must produce Fail
// without reading later entries. This also proves the
// aggregate routes through `verdict_from_cosine_similarity`
// rather than re-implementing the single-layer rule.
assert_eq!(
verdict_from_per_layer_cosines(
&[f32::NAN, 1.0, 1.0],
AC_SHIP1_003_MIN_COSINE_SIMILARITY
),
Ship003Verdict::Fail,
"first-layer NaN must Fail even if remaining layers Pass",
);
assert_eq!(
verdict_from_per_layer_cosines(&[2.0, 1.0, 1.0], AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"first-layer out-of-range must Fail even if remaining layers Pass",
);
// Section 7: last-layer Fail is still caught — proves the
// aggregate doesn't short-circuit to Pass after seeing any
// Pass (a classic mutation: replace `== Fail` with `== Pass`
// in the early-return, which would flip the whole aggregate).
assert_eq!(
verdict_from_per_layer_cosines(&[1.0, 1.0, 0.5], AC_SHIP1_003_MIN_COSINE_SIMILARITY),
Ship003Verdict::Fail,
"last-layer Fail must still Fail the aggregate",
);
}
/// FALSIFY-QW2E-SHIP-003 YAML binding: parses
/// `qwen2-e2e-verification-v1.yaml` and asserts the
/// FALSIFY-QW2E-SHIP-003 falsification block has been promoted from
/// `PARTIAL_ALGORITHM_LEVEL` (v1.4.0) → `DISCHARGED` (v1.10.0) via
/// live `apr diff` 339-tensor cosine sweep on noah-Lambda-Vector
/// RTX 4090. Falsifier: if the contract is edited to drop the
/// live-evidence block or downgrade the discharge marker, this
/// test fails before any compute runs.
#[test]
fn falsify_ship_003_yaml_binding_pins_discharged_status() {
const CONTRACT_YAML: &str =
include_str!("../../../../contracts/qwen2-e2e-verification-v1.yaml");
let doc: serde_yaml::Value = serde_yaml::from_str(CONTRACT_YAML)
.expect("qwen2-e2e-verification-v1.yaml must parse as YAML");
let falsifications = doc["falsification_tests"]
.as_sequence()
.expect("falsification_tests must be a sequence");
let block = falsifications
.iter()
.find(|b| b["id"].as_str() == Some("FALSIFY-QW2E-SHIP-003"))
.expect("FALSIFY-QW2E-SHIP-003 must exist in qwen2-e2e-verification-v1");
assert_eq!(
block["discharge_status"].as_str(),
Some("DISCHARGED"),
"FALSIFY-QW2E-SHIP-003 must advertise DISCHARGED \
(live `apr diff` 339-tensor cosine sweep at v1.10.0; \
previous PARTIAL_ALGORITHM_LEVEL at v1.4.0)",
);
assert!(
block["discharged_evidence"].is_mapping(),
"FALSIFY-QW2E-SHIP-003 DISCHARGED status requires a discharged_evidence \
block documenting host, command, cosine summary, and live verdicts",
);
assert_eq!(
block["discharged_evidence"]["host"].as_str(),
Some("noah-Lambda-Vector"),
"discharged_evidence.host must pin to lambda-labs RTX 4090",
);
assert_eq!(
block["discharged_evidence"]["aggregate_verdict"].as_str(),
Some("Pass"),
"discharged_evidence.aggregate_verdict must equal Pass",
);
assert_eq!(
block["discharged_evidence"]["tensors_compared"].as_u64(),
Some(339),
"discharged_evidence.tensors_compared must equal 339 (Qwen2.5-Coder-7B canonical)",
);
assert_eq!(
block["discharged_evidence"]["cosine_summary"]["below_threshold_count"].as_u64(),
Some(0),
"discharged_evidence.cosine_summary.below_threshold_count must equal 0 \
(zero tensors below the 0.999 floor)",
);
let live_evidence = block["discharged_evidence"]["evidence_discharged_by_live"]
.as_sequence()
.expect(
"FALSIFY-QW2E-SHIP-003 DISCHARGED requires \
discharged_evidence.evidence_discharged_by_live (live RTX 4090 evidence list)",
);
assert!(
!live_evidence.is_empty(),
"evidence_discharged_by_live must list at least one live observation",
);
}
}