1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
// SHIP-TWO-001 §6 Compound Ship Gates — GATE-SHIP-003 algorithm-level
// PARTIAL discharge.
//
// Spec: docs/specifications/aprender-train/ship-two-models-spec.md §6 row
// `GATE-SHIP-003 | Both models: apr qa Golden Output never regresses
// post-quantize | publish`.
// Contract: contracts/compound-ship-gates-v1.yaml v1.0.0 PROPOSED
// (FALSIFY-GATE-SHIP-003 — wired in the same PR as this file lands).
//
// GATE-SHIP-003 states that for both MODEL-1 and MODEL-2, the
// `apr qa` Golden Output gate MUST produce byte-identical emissions
// before and after a quantization round-trip (`apr convert --quantize
// q4_k_m`). Any drift in the emitted bytes is a ship-blocker: the
// Golden Output gate is the stack's last-line defence against silent
// quality regressions in the distilled/quantized checkpoint.
//
// This file discharges the *decision rule* at `PARTIAL_ALGORITHM_LEVEL`:
// given two byte slices representing pre-quantize and post-quantize
// Golden Output emissions, the verdict is `Pass` iff BOTH are non-empty
// (empty = no Golden Output recorded = no regression proof possible)
// AND they are byte-by-byte equal. The compute-heavy portion (actually
// running `apr qa <model>.apr` on the pre-quantize and post-quantize
// checkpoints to produce the two byte streams) is intentionally out of
// scope here; what this file proves is that the compound gate's
// *comparison shape* cannot be silently relaxed (e.g., to a Unicode-
// folded or case-insensitive compare) without breaking this test.
//
// Conservative-Fail rationale for empty inputs: if the Golden Output
// gate was SKIPPED (tokenizer missing, feature flag off, etc.), there
// is NO evidence that the model emits the canonical bytes. A missing
// Golden Output is treated as a ship-blocker per apr-model-qa-v1.yaml
// `FALSIFY-EX-001` / `--require-golden-output` promotion. Here we
// surface those semantics at the decision-rule layer: an empty slice
// on either side → Fail.
/// Two-state outcome of the FALSIFY-GATE-SHIP-003 / GATE-SHIP-003
/// decision rule.
///
/// The rule compares the Golden Output bytes captured before
/// quantization with the bytes captured after it. Only a non-empty,
/// byte-for-byte identical pair yields [`GateShip003Verdict::Pass`];
/// every other combination (missing bytes on either side, differing
/// lengths, or a differing byte) yields [`GateShip003Verdict::Fail`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GateShip003Verdict {
    /// Pre-quantize and post-quantize Golden Output are both present
    /// and byte-identical: the quantization round-trip preserved the
    /// emission, so the compound gate passes.
    Pass,
    /// The gate is not satisfied and MODEL-* publish is blocked. This
    /// covers an empty stream on either side (no Golden Output
    /// recorded — conservative Fail), a length difference, or a
    /// difference at any byte position.
    Fail,
}
/// Decides FALSIFY-GATE-SHIP-003 / GATE-SHIP-003 at the algorithm
/// level: does the post-quantize Golden Output reproduce the
/// pre-quantize Golden Output byte for byte?
///
/// Returns [`GateShip003Verdict::Pass`] only when every one of the
/// following holds, and [`GateShip003Verdict::Fail`] otherwise:
///
/// - Neither side is empty. An empty Golden Output means the `apr qa`
///   gate was SKIPPED (tokenizer missing, feature flag off, etc.), so
///   there is no evidence to prove "no regression" with; the publish
///   is conservatively blocked.
/// - Both sides have the same length. Unequal lengths could never be
///   byte-equal anyway; the explicit check keeps the cheap rejection
///   visible in the rule itself.
/// - Every byte position matches.
///
/// The comparison is strict *byte-identity*, never Unicode-folded or
/// otherwise normalised equality: whitespace drift, a trailing
/// newline, or a BOM promotion are all ship-blockers. Golden Output is
/// the single byte-exact guard against quantization-induced drift, and
/// relaxing this rule would let a quantizer that silently corrupts the
/// prompt template pass.
///
/// # Examples
///
/// ```
/// use aprender::format::gate_ship_003::{
///     verdict_from_golden_output_diff, GateShip003Verdict,
/// };
///
/// // Byte-identical Golden Output → Pass.
/// let pre = b"42\n".to_vec();
/// let post = b"42\n".to_vec();
/// assert_eq!(
///     verdict_from_golden_output_diff(&pre, &post),
///     GateShip003Verdict::Pass
/// );
///
/// // Single-byte drift → Fail.
/// let post_drift = b"43\n".to_vec();
/// assert_eq!(
///     verdict_from_golden_output_diff(&pre, &post_drift),
///     GateShip003Verdict::Fail
/// );
/// ```
#[must_use]
pub fn verdict_from_golden_output_diff(
    pre_quantize: &[u8],
    post_quantize: &[u8],
) -> GateShip003Verdict {
    // Short-circuit chain, cheapest checks first: evidence on both
    // sides, then equal lengths, then the full byte comparison.
    let byte_identical = !pre_quantize.is_empty()
        && !post_quantize.is_empty()
        && pre_quantize.len() == post_quantize.len()
        && pre_quantize == post_quantize;

    if byte_identical {
        GateShip003Verdict::Pass
    } else {
        GateShip003Verdict::Fail
    }
}
// ─────────────────────────────────────────────────────────────
// Unit tests — FALSIFY-GATE-SHIP-003 algorithm-level proof
// ─────────────────────────────────────────────────────────────
#[cfg(test)]
mod gate_ship_003_tests {
    use super::*;

    /// FALSIFY-GATE-SHIP-003 algorithm-level PARTIAL discharge: prove
    /// the byte-identity comparison rule binding pre-quantize and
    /// post-quantize Golden Output byte streams to the compound
    /// regression gate.
    ///
    /// The sections below are counter-examples for specific
    /// relaxations of the rule: accepting empty input (Sections 4–5),
    /// prefix / "close-enough" comparison (Sections 2–3), pointer-
    /// identity comparison (Section 6), case-insensitive comparison
    /// (Section 7) and whitespace-insensitive comparison (Section 8).
    /// Sections 7–8 use same-length inputs on purpose, so that a
    /// relaxed comparison cannot be masked by the length guard.
    #[test]
    fn falsify_gate_ship_003_golden_output_byte_identity() {
        // Section 1: byte-identical non-empty → Pass. Baseline.
        let canonical: &[u8] =
            b"```python\ndef fib(n):\n return n if n < 2 else fib(n-1)+fib(n-2)\n```\n";
        assert_eq!(
            verdict_from_golden_output_diff(canonical, canonical),
            GateShip003Verdict::Pass,
            "byte-identical canonical Golden Output must Pass",
        );

        // Section 2: length-mismatch — various drift shapes.
        let shorter = b"```python\ndef fib(n):\n return n if n < 2 else fib(n-1)+fib(n-2)\n``";
        let longer =
            b"```python\ndef fib(n):\n return n if n < 2 else fib(n-1)+fib(n-2)\n```\n\n";
        assert_eq!(
            verdict_from_golden_output_diff(canonical, shorter),
            GateShip003Verdict::Fail,
            "length mismatch (pre longer) must Fail",
        );
        assert_eq!(
            verdict_from_golden_output_diff(shorter, canonical),
            GateShip003Verdict::Fail,
            "length mismatch (post longer) must Fail",
        );
        assert_eq!(
            verdict_from_golden_output_diff(canonical, longer),
            GateShip003Verdict::Fail,
            "length mismatch (trailing newline added) must Fail",
        );

        // Section 3: single-byte flip at various positions — sharpest
        // possible Fail counter-examples. Any mutation that relaxes
        // `==` to "close-enough" or "starts-with" would flip these.
        for flip_pos in [0, 5, canonical.len() / 2, canonical.len() - 1] {
            let mut mutated = canonical.to_vec();
            mutated[flip_pos] ^= 0x01;
            assert_eq!(
                verdict_from_golden_output_diff(canonical, &mutated),
                GateShip003Verdict::Fail,
                "single-byte flip at position {flip_pos} must Fail",
            );
        }

        // Section 4: both-empty — conservative Fail. The rule is "prove
        // no regression"; empty Golden Output on both sides means the
        // `apr qa` gate was SKIPPED (tokenizer missing, feature flag
        // off). We cannot prove no regression without evidence, so we
        // conservatively Fail to block the publish. This mirrors
        // apr-model-qa-v1.yaml FALSIFY-EX-001 (`--require-golden-output`
        // promotes SKIPPED to Fail).
        assert_eq!(
            verdict_from_golden_output_diff(b"", b""),
            GateShip003Verdict::Fail,
            "both-empty must Fail — no Golden Output recorded = no regression proof",
        );

        // Section 5: one-empty — also conservative Fail. A partial
        // SKIP (pre recorded but post not, or vice versa) is still a
        // missing-evidence state.
        assert_eq!(
            verdict_from_golden_output_diff(canonical, b""),
            GateShip003Verdict::Fail,
            "post empty must Fail — missing post-quantize evidence",
        );
        assert_eq!(
            verdict_from_golden_output_diff(b"", canonical),
            GateShip003Verdict::Fail,
            "pre empty must Fail — missing pre-quantize evidence",
        );

        // Section 6: large identical — Pass for a 10_000-byte stream.
        // `large_copy` is a separate allocation holding equal bytes,
        // so a comparison by pointer identity instead of by content
        // would wrongly reject this pair (Section 1 alone cannot
        // detect that, since it passes the same slice on both sides).
        let large: Vec<u8> = (0..10_000).map(|i| (i & 0xFF) as u8).collect();
        let large_copy = large.clone();
        assert_eq!(
            verdict_from_golden_output_diff(&large, &large_copy),
            GateShip003Verdict::Pass,
            "10_000-byte identical streams must Pass (byte-by-byte depth guard)",
        );
        // A mid-stream single-byte flip in the 10_000-byte stream
        // must still Fail.
        let mut large_mutated = large.clone();
        large_mutated[5000] ^= 0x01;
        assert_eq!(
            verdict_from_golden_output_diff(&large, &large_mutated),
            GateShip003Verdict::Fail,
            "mid-stream (idx 5000) single-byte flip must Fail",
        );

        // Section 7: case-only drift at identical length. The `^ 0x01`
        // flips in Section 3 never produce a case-only change (ASCII
        // case differs by 0x20), so without this section a swap of
        // `==` for `eq_ignore_ascii_case` would go unnoticed.
        let case_folded = canonical.to_ascii_uppercase();
        assert!(
            case_folded.eq_ignore_ascii_case(canonical) && case_folded.as_slice() != canonical,
            "fixture must differ from canonical in ASCII case only",
        );
        assert_eq!(
            verdict_from_golden_output_diff(canonical, &case_folded),
            GateShip003Verdict::Fail,
            "case-only drift must Fail — comparison is byte-exact, not case-insensitive",
        );

        // Section 8: whitespace-kind drift at identical length (final
        // newline replaced by a space). Section 2 only covers
        // whitespace drift that also changes the length, which the
        // length guard rejects on its own; this pair reaches the byte
        // comparison and breaks a trim-/whitespace-insensitive compare.
        let mut whitespace_swapped = canonical.to_vec();
        let final_byte = whitespace_swapped
            .last_mut()
            .expect("canonical fixture is non-empty");
        assert_eq!(*final_byte, b'\n', "fixture must end in a newline");
        *final_byte = b' ';
        assert_eq!(
            verdict_from_golden_output_diff(canonical, &whitespace_swapped),
            GateShip003Verdict::Fail,
            "same-length whitespace substitution must Fail — no whitespace normalisation",
        );
    }
}