1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
// SHIP-TWO-001 MODEL-2 — `tokenizer-bpe-v1` (C-TOK-BPE-001)
// algorithm-level PARTIAL discharge for INV-BPE-005.
//
// Contract: `contracts/tokenizer-bpe-v1.yaml`.
// Spec: `docs/specifications/aprender-train/ship-two-models-spec.md`
// MODEL-2 tokenizer pipeline (§26.3), AC-SHIP2-003.
//
// ## What INV-BPE-005 says
//
// description: Unicode normalization is NFC and is applied BEFORE
// BPE encoding. Running nfc(nfc(text)) yields the
// same bytes as nfc(text) (NFC is idempotent — this
// catches double-normalization bugs).
// falsifier: For a test string containing composable sequences
// (e.g. "café" composed vs "café" decomposed),
// tokenizer.encode() must produce identical token
// IDs for both. If they differ, the tokenizer is
// NOT applying NFC pre-encode.
//
// ## What this file proves NOW (`PARTIAL_ALGORITHM_LEVEL`)
//
// Decision rule: given three byte slices — the post-NFC, pre-BPE
// form of the composed input, the post-NFC, pre-BPE form of the
// decomposed input, and the result of nfc(nfc(text)) as the
// double-application idempotence probe — Pass iff all three are
// non-empty AND:
//
// composed_nfc == decomposed_nfc (composable equivalence) AND
// composed_nfc == double_nfc (NFC idempotence)
//
// Both equalities are byte-level. Catches three regression classes:
// - Tokenizer not applying NFC at all (composed != decomposed).
// - Tokenizer applying NFD instead of NFC (different canonical
// form; also caught by composed != decomposed).
// - Double-NFC drift (NFC implementation has a non-idempotent bug).
/// Two-state outcome of the `INV-BPE-005` check.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BpeInv005Verdict {
    /// Every check held: the NFC bytes of the composed input equal the
    /// NFC bytes of the decomposed input (composable equivalence), and
    /// `nfc(nfc(text))` equals `nfc(text)` (idempotence).
    Pass,
    /// At least one check did not hold:
    /// - any of the three inputs is empty (degenerate caller error),
    /// - `composed_nfc != decomposed_nfc` (NFC was not applied, or the
    ///   two inputs ended up in different canonical forms),
    /// - `composed_nfc != double_nfc` (the NFC implementation is not
    ///   idempotent).
    Fail,
}
/// Decides `INV-BPE-005` from three already-normalized byte strings.
///
/// The function only compares the bytes it is handed; it performs no
/// normalization itself.
///
/// # Arguments
///
/// - `composed_nfc`: `nfc(text)` for a `text` that was supplied in
///   NFC-composed form (e.g., "café" spelled with U+00E9).
/// - `decomposed_nfc`: `nfc(text)` for a `text` that was supplied in
///   NFD-decomposed form (e.g., "café" spelled with U+0065 U+0301).
/// - `double_nfc`: `nfc(nfc(text))`, i.e. NFC applied to an already
///   NFC-normalized string (the idempotence probe).
///
/// # Returns
///
/// [`BpeInv005Verdict::Pass`] only when all of the following hold:
/// 1. none of the three slices is empty (an all-empty call must not
///    pass vacuously),
/// 2. `composed_nfc == decomposed_nfc` (composable equivalence),
/// 3. `composed_nfc == double_nfc` (idempotence on the composed
///    branch; because of rule 2 the same then holds for the
///    decomposed branch).
///
/// Any other combination yields [`BpeInv005Verdict::Fail`].
///
/// # Examples
///
/// All three NFC-equivalent — `Pass`:
/// ```
/// use aprender::format::bpe_inv_005::{
///     verdict_from_nfc_idempotence, BpeInv005Verdict,
/// };
/// // "café" composed: 5 bytes of UTF-8 (é is the 2-byte sequence C3 A9).
/// let nfc_form: &[u8] = "café".as_bytes();
/// let v = verdict_from_nfc_idempotence(nfc_form, nfc_form, nfc_form);
/// assert_eq!(v, BpeInv005Verdict::Pass);
/// ```
///
/// Composed != decomposed (NFC not applied) — `Fail`:
/// ```
/// use aprender::format::bpe_inv_005::{
///     verdict_from_nfc_idempotence, BpeInv005Verdict,
/// };
/// let composed: &[u8] = "café".as_bytes(); // U+00E9
/// let decomposed: &[u8] = "cafe\u{0301}".as_bytes(); // 'e' + combining acute
/// let v = verdict_from_nfc_idempotence(composed, decomposed, composed);
/// assert_eq!(v, BpeInv005Verdict::Fail);
/// ```
#[must_use]
pub fn verdict_from_nfc_idempotence(
    composed_nfc: &[u8],
    decomposed_nfc: &[u8],
    double_nfc: &[u8],
) -> BpeInv005Verdict {
    let inputs = [composed_nfc, decomposed_nfc, double_nfc];

    // Rule 1: refuse degenerate (empty) inputs so nothing passes vacuously.
    let non_degenerate = inputs.iter().all(|bytes| !bytes.is_empty());
    // Rules 2 and 3: every slice must match the composed branch byte-for-byte.
    let byte_identical = inputs.iter().all(|bytes| *bytes == composed_nfc);

    if non_degenerate && byte_identical {
        BpeInv005Verdict::Pass
    } else {
        BpeInv005Verdict::Fail
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Fixtures whose normalization form matters are spelled with `\u{…}`
    // escapes, so an editor or tool that re-normalizes source text cannot
    // silently change which form a test exercises.

    /// "café" with precomposed é (U+00E9); 5 bytes in UTF-8.
    const CAFE_COMPOSED: &str = "caf\u{00E9}";
    /// "café" as 'e' followed by combining acute (U+0301); 6 bytes in UTF-8.
    const CAFE_DECOMPOSED: &str = "cafe\u{0301}";
    /// Precomposed Hangul syllable "한" (U+D55C).
    const HAN_COMPOSED: &str = "\u{D55C}";
    /// The same syllable as conjoining jamo U+1112 U+1161 U+11AB.
    const HAN_DECOMPOSED: &str = "\u{1112}\u{1161}\u{11AB}";

    // -------------------------------------------------------------------------
    // Section 0: Fixture sanity — the two forms really are different bytes.
    // -------------------------------------------------------------------------

    #[test]
    fn fixtures_have_expected_utf8_lengths() {
        assert_eq!(CAFE_COMPOSED.len(), 5);
        assert_eq!(CAFE_DECOMPOSED.len(), 6);
        assert_ne!(CAFE_COMPOSED.as_bytes(), CAFE_DECOMPOSED.as_bytes());
        assert_ne!(HAN_COMPOSED.as_bytes(), HAN_DECOMPOSED.as_bytes());
    }

    // -------------------------------------------------------------------------
    // Section 1: Pass band — all three byte slices identical.
    // -------------------------------------------------------------------------

    #[test]
    fn pass_all_three_identical_ascii() {
        let nfc = b"hello";
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }

    #[test]
    fn pass_all_three_identical_cafe_composed() {
        // "café" with composed é (U+00E9): 'c', 'a', 'f' are one byte each
        // and é is two bytes, for 5 bytes of UTF-8 in total.
        let nfc: &[u8] = CAFE_COMPOSED.as_bytes();
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }

    #[test]
    fn pass_all_three_identical_cjk() {
        let nfc = "中文测试".as_bytes();
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }

    #[test]
    fn pass_all_three_identical_emoji() {
        // Single-codepoint emoji.
        let nfc = "🎉".as_bytes();
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }

    #[test]
    fn pass_all_three_identical_mathematical_symbols() {
        let nfc = "∑∫π√".as_bytes();
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }

    // -------------------------------------------------------------------------
    // Section 2: Fail band — composed != decomposed (NFC not applied).
    // -------------------------------------------------------------------------

    #[test]
    fn fail_cafe_composed_vs_decomposed() {
        // The classic regression: "café" U+00E9 vs "café" e+combining acute.
        let composed: &[u8] = CAFE_COMPOSED.as_bytes();
        let decomposed: &[u8] = CAFE_DECOMPOSED.as_bytes();
        // double_nfc matches composed (the post-NFC form).
        let v = verdict_from_nfc_idempotence(composed, decomposed, composed);
        assert_eq!(
            v,
            BpeInv005Verdict::Fail,
            "composed != decomposed must Fail (NFC not applied)"
        );
    }

    #[test]
    fn fail_completely_different_strings() {
        let a = b"hello";
        let b = b"world";
        let v = verdict_from_nfc_idempotence(a, b, a);
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    #[test]
    fn fail_single_byte_difference() {
        let a = b"hello";
        let b = b"hellp";
        let v = verdict_from_nfc_idempotence(a, b, a);
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 3: Fail band — idempotence violation (double_nfc drift).
    // -------------------------------------------------------------------------

    #[test]
    fn fail_double_nfc_differs() {
        // composed == decomposed, but nfc(nfc) != nfc — non-idempotent
        // NFC implementation.
        let nfc: &[u8] = CAFE_COMPOSED.as_bytes();
        let drifted: &[u8] = "cafe".as_bytes(); // dropped the accent
        let v = verdict_from_nfc_idempotence(nfc, nfc, drifted);
        assert_eq!(
            v,
            BpeInv005Verdict::Fail,
            "double-NFC drift must Fail (non-idempotent)"
        );
    }

    #[test]
    fn fail_double_nfc_off_by_one_byte() {
        let nfc = b"hello";
        let drifted = b"hellp"; // last byte differs
        let v = verdict_from_nfc_idempotence(nfc, nfc, drifted);
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 4: Fail band — both NFC violations and idempotence violation.
    // -------------------------------------------------------------------------

    #[test]
    fn fail_both_violations_combined() {
        let composed: &[u8] = CAFE_COMPOSED.as_bytes();
        let decomposed: &[u8] = CAFE_DECOMPOSED.as_bytes();
        let double = b"foo"; // Completely different from composed
        let v = verdict_from_nfc_idempotence(composed, decomposed, double);
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 5: Fail band — caller errors (empty inputs).
    // -------------------------------------------------------------------------

    #[test]
    fn fail_all_empty() {
        let v = verdict_from_nfc_idempotence(&[], &[], &[]);
        assert_eq!(
            v,
            BpeInv005Verdict::Fail,
            "all-empty inputs must Fail (vacuous Pass refused)"
        );
    }

    #[test]
    fn fail_composed_empty() {
        let v = verdict_from_nfc_idempotence(&[], b"abc", b"abc");
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    #[test]
    fn fail_decomposed_empty() {
        let v = verdict_from_nfc_idempotence(b"abc", &[], b"abc");
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    #[test]
    fn fail_double_empty() {
        let v = verdict_from_nfc_idempotence(b"abc", b"abc", &[]);
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 6: Symmetry / transitivity properties.
    // -------------------------------------------------------------------------

    #[test]
    fn pass_with_a_b_swapped_when_equal() {
        // Two separately allocated buffers with equal contents, so the swap
        // exchanges distinct arguments instead of repeating the same call.
        let a = b"hello".to_vec();
        let b = b"hello".to_vec();
        let v_ab = verdict_from_nfc_idempotence(&a, &b, &a);
        let v_ba = verdict_from_nfc_idempotence(&b, &a, &a);
        assert_eq!(v_ab, v_ba);
        assert_eq!(v_ab, BpeInv005Verdict::Pass);
    }

    #[test]
    fn fail_with_a_b_swapped_when_different() {
        // Which slot holds the mismatching input must not matter.
        let a = CAFE_COMPOSED.as_bytes();
        let b = CAFE_DECOMPOSED.as_bytes();
        let v_ab = verdict_from_nfc_idempotence(a, b, a);
        let v_ba = verdict_from_nfc_idempotence(b, a, a);
        assert_eq!(v_ab, v_ba);
        assert_eq!(v_ab, BpeInv005Verdict::Fail);
    }

    #[test]
    fn fail_three_way_distinct() {
        // a, b and c are pairwise distinct. Catches a regression that
        // somehow mismatches all three.
        let a = b"abc";
        let b = b"def";
        let c = b"ghi";
        let v = verdict_from_nfc_idempotence(a, b, c);
        assert_eq!(v, BpeInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 7: Realistic — multi-codepoint composables (e.g., Hangul).
    // -------------------------------------------------------------------------

    #[test]
    fn pass_hangul_precomposed_form() {
        // "한" precomposed Hangul syllable U+D55C.
        let nfc = HAN_COMPOSED.as_bytes();
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }

    #[test]
    fn fail_hangul_precomposed_vs_decomposed() {
        // Precomposed U+D55C vs decomposed jamo U+1112 + U+1161 + U+11AB.
        let composed = HAN_COMPOSED.as_bytes();
        let decomposed = HAN_DECOMPOSED.as_bytes();
        let v = verdict_from_nfc_idempotence(composed, decomposed, composed);
        assert_eq!(
            v,
            BpeInv005Verdict::Fail,
            "Hangul precomposed != decomposed must Fail"
        );
    }

    #[test]
    fn pass_long_well_formed_text() {
        let text = "The quick brown fox jumps over the lazy dog. \
                    Café au lait. 中文测试 🎉. ∑∫π. 한국어. 1234567890";
        let nfc = text.as_bytes();
        let v = verdict_from_nfc_idempotence(nfc, nfc, nfc);
        assert_eq!(v, BpeInv005Verdict::Pass);
    }
}