1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
// SHIP-TWO-001 MODEL-2 — `pretokenize-bin-v1` (C-PRETOK-BIN) algorithm-level
// PARTIAL discharge for INV-PRETOK-003.
//
// Contract: `contracts/pretokenize-bin-v1.yaml` v1.0.0 PROPOSED.
//
// ## What INV-PRETOK-003 says
//
// description: total_tokens declared in manifest equals the sum over
// shards of (file_size_bytes / 4). No drift between
// declared and actual.
//
// ## What this file proves NOW (`PARTIAL_ALGORITHM_LEVEL`)
//
// Decision rule: given a declared `total_tokens` count and a list of
// per-shard byte sizes, Pass iff:
//
// declared_total_tokens == sum(per_shard_bytes / 4)
//
// AND every per-shard size is u32-aligned (composes with INV-PRETOK-002,
// pretok_inv_002.rs in this same module). Sum-overflow protection via
// `checked_add` — a corrupted shard with size near u64::MAX cannot crash
// the verdict via wraparound.
//
// Future implementations cannot:
// - Drift the manifest's declared total by ±1 token (caught by exact ==).
// - Skip emitting one shard's bytes from the sum (caught by exact ==).
// - Allow a u64 wrap-around on the sum (caught by checked_add → Fail).
use super::pretok_inv_002::AC_PRETOK_INV_002_U32_BYTE_SIZE;
/// Binary verdict for `INV-PRETOK-003`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PretokInv003Verdict {
/// Declared `total_tokens` exactly matches the sum of
/// `(per_shard_byte_sizes[i] / 4)` across all shards. The shard list
/// is non-empty and every size is u32-aligned.
Pass,
/// One or more of:
/// - Empty shard list (caller error).
/// - Some shard size is not u32-aligned (composes with INV-PRETOK-002).
/// - Sum overflow (would imply absurd corpus size — conservative `Fail`).
/// - Declared `total_tokens` ≠ summed-actual.
Fail,
}
/// Pure verdict function for `INV-PRETOK-003`.
///
/// Inputs:
/// - `declared_total_tokens`: the manifest's claimed total token count.
/// - `per_shard_byte_sizes`: byte size of each emitted shard file.
///
/// Pass iff:
/// 1. The slice is non-empty.
/// 2. Every entry is `% 4 == 0` (u32-aligned, mirrors INV-PRETOK-002).
/// 3. Every entry is `>= 4` (at least one token).
/// 4. `sum(size / 4)` does not overflow u64.
/// 5. `sum(size / 4) == declared_total_tokens`.
///
/// Otherwise `Fail`.
///
/// # Examples
///
/// Manifest matches actual — `Pass`:
/// ```
/// use aprender::format::pretok_inv_003::{
/// verdict_from_manifest_vs_shards, PretokInv003Verdict,
/// };
/// let sizes = vec![1024_u64, 2048, 4096]; // 256 + 512 + 1024 = 1792 tokens
/// assert_eq!(
/// verdict_from_manifest_vs_shards(1792, &sizes),
/// PretokInv003Verdict::Pass,
/// );
/// ```
///
/// Manifest off by one — `Fail`:
/// ```
/// use aprender::format::pretok_inv_003::{
/// verdict_from_manifest_vs_shards, PretokInv003Verdict,
/// };
/// let sizes = vec![1024_u64, 2048, 4096]; // 1792 tokens, declared 1793
/// assert_eq!(
/// verdict_from_manifest_vs_shards(1793, &sizes),
/// PretokInv003Verdict::Fail,
/// );
/// ```
#[must_use]
pub fn verdict_from_manifest_vs_shards(
declared_total_tokens: u64,
per_shard_byte_sizes: &[u64],
) -> PretokInv003Verdict {
if per_shard_byte_sizes.is_empty() {
return PretokInv003Verdict::Fail;
}
let mut summed: u64 = 0;
for &size in per_shard_byte_sizes {
if size < AC_PRETOK_INV_002_U32_BYTE_SIZE {
return PretokInv003Verdict::Fail;
}
if size % AC_PRETOK_INV_002_U32_BYTE_SIZE != 0 {
return PretokInv003Verdict::Fail;
}
let tokens_in_shard = size / AC_PRETOK_INV_002_U32_BYTE_SIZE;
match summed.checked_add(tokens_in_shard) {
Some(new_sum) => summed = new_sum,
None => return PretokInv003Verdict::Fail,
}
}
if summed == declared_total_tokens {
PretokInv003Verdict::Pass
} else {
PretokInv003Verdict::Fail
}
}
#[cfg(test)]
mod tests {
use super::*;
// -------------------------------------------------------------------------
// Section 1: Pass band — declared matches actual.
// -------------------------------------------------------------------------
#[test]
fn pass_simple_three_shards() {
let sizes = vec![1024_u64, 2048, 4096];
// 1024/4 + 2048/4 + 4096/4 = 256 + 512 + 1024 = 1792
assert_eq!(
verdict_from_manifest_vs_shards(1792, &sizes),
PretokInv003Verdict::Pass
);
}
#[test]
fn pass_single_shard_one_token() {
let sizes = vec![4_u64];
assert_eq!(
verdict_from_manifest_vs_shards(1, &sizes),
PretokInv003Verdict::Pass
);
}
#[test]
fn pass_realistic_565m_token_corpus() {
// Per spec: MODEL-2 trained on 565.6M token Python+permissive corpus.
// Approximate by: 100 shards × 22_624_000 bytes (5.656M tokens each).
let sizes = vec![22_624_000_u64; 100];
let total = 100_u64 * (22_624_000 / 4);
assert_eq!(
verdict_from_manifest_vs_shards(total, &sizes),
PretokInv003Verdict::Pass
);
assert_eq!(total, 565_600_000);
}
#[test]
fn pass_uniform_shards() {
let sizes = vec![400_u64; 50]; // 100 tokens × 50 shards = 5000
assert_eq!(
verdict_from_manifest_vs_shards(5000, &sizes),
PretokInv003Verdict::Pass
);
}
// -------------------------------------------------------------------------
// Section 2: Fail band — declared ≠ actual (drift).
// -------------------------------------------------------------------------
#[test]
fn fail_off_by_one_high() {
let sizes = vec![1024_u64, 2048, 4096];
assert_eq!(
verdict_from_manifest_vs_shards(1793, &sizes), // actual = 1792
PretokInv003Verdict::Fail
);
}
#[test]
fn fail_off_by_one_low() {
let sizes = vec![1024_u64, 2048, 4096];
assert_eq!(
verdict_from_manifest_vs_shards(1791, &sizes), // actual = 1792
PretokInv003Verdict::Fail
);
}
#[test]
fn fail_declared_zero_with_real_data() {
let sizes = vec![1024_u64];
assert_eq!(
verdict_from_manifest_vs_shards(0, &sizes),
PretokInv003Verdict::Fail
);
}
#[test]
fn fail_declared_double_actual() {
let sizes = vec![1024_u64];
assert_eq!(
verdict_from_manifest_vs_shards(512, &sizes), // actual = 256
PretokInv003Verdict::Fail
);
}
// -------------------------------------------------------------------------
// Section 3: Fail band — alignment (composes with INV-PRETOK-002).
// -------------------------------------------------------------------------
#[test]
fn fail_misaligned_shard_short_circuit() {
// 1025 % 4 != 0 → Fail before manifest comparison.
let sizes = vec![1025_u64];
assert_eq!(
verdict_from_manifest_vs_shards(256, &sizes),
PretokInv003Verdict::Fail,
"alignment failure short-circuits manifest check"
);
}
#[test]
fn fail_zero_byte_shard() {
let sizes = vec![0_u64];
assert_eq!(
verdict_from_manifest_vs_shards(0, &sizes),
PretokInv003Verdict::Fail
);
}
#[test]
fn fail_size_below_one_token() {
for size in [1_u64, 2, 3] {
let sizes = vec![size];
assert_eq!(
verdict_from_manifest_vs_shards(0, &sizes),
PretokInv003Verdict::Fail
);
}
}
// -------------------------------------------------------------------------
// Section 4: Empty input — caller error → Fail.
// -------------------------------------------------------------------------
#[test]
fn fail_empty_shard_list_with_zero_declared() {
let sizes: Vec<u64> = vec![];
assert_eq!(
verdict_from_manifest_vs_shards(0, &sizes),
PretokInv003Verdict::Fail,
"even matching zeros must Fail on empty list"
);
}
#[test]
fn fail_empty_shard_list_with_nonzero_declared() {
let sizes: Vec<u64> = vec![];
assert_eq!(
verdict_from_manifest_vs_shards(100, &sizes),
PretokInv003Verdict::Fail
);
}
// -------------------------------------------------------------------------
// Section 5: Overflow protection — checked_add cannot wrap silently.
// -------------------------------------------------------------------------
#[test]
fn fail_overflow_via_two_max_shards() {
// u64::MAX is odd → first must catch via alignment.
// To force a sum overflow, use the largest u32-aligned u64.
let max_aligned: u64 = u64::MAX & !0b11; // = u64::MAX - 3
let sizes = vec![max_aligned, max_aligned];
// sum(max_aligned/4 + max_aligned/4) overflows u64.
// Expect Fail (checked_add returns None).
assert_eq!(
verdict_from_manifest_vs_shards(0, &sizes),
PretokInv003Verdict::Fail
);
}
#[test]
fn fail_one_max_aligned_within_range() {
// A single max_aligned shard does NOT overflow on its own
// (max_aligned/4 fits in u64). The verdict's manifest check
// will Fail unless declared exactly equals that max-aligned/4.
let max_aligned: u64 = u64::MAX & !0b11;
let sizes = vec![max_aligned];
let expected_tokens = max_aligned / 4;
assert_eq!(
verdict_from_manifest_vs_shards(expected_tokens, &sizes),
PretokInv003Verdict::Pass,
"single max-aligned shard with correct manifest must Pass"
);
assert_eq!(
verdict_from_manifest_vs_shards(expected_tokens - 1, &sizes),
PretokInv003Verdict::Fail
);
}
// -------------------------------------------------------------------------
// Section 6: Single-shard mutation — drift in any one shard fails.
// -------------------------------------------------------------------------
#[test]
fn single_shard_mutation_at_each_index_fails() {
// Baseline: 5 shards × 4 bytes = 5 tokens declared.
for bad_idx in [0_usize, 2, 4] {
let mut sizes = vec![4_u64; 5];
sizes[bad_idx] = 8; // adds one token to the actual sum
assert_eq!(
verdict_from_manifest_vs_shards(5, &sizes), // declared still 5
PretokInv003Verdict::Fail,
"mutation at index {bad_idx} must Fail"
);
}
}
// -------------------------------------------------------------------------
// Section 7: Sweep — declared probe around actual.
// -------------------------------------------------------------------------
#[test]
fn declared_sweep_around_actual() {
let sizes = vec![400_u64; 10]; // 100 tokens × 10 = 1000 actual
let probes: Vec<(u64, PretokInv003Verdict)> = vec![
(0, PretokInv003Verdict::Fail),
(999, PretokInv003Verdict::Fail),
(1000, PretokInv003Verdict::Pass), // exactly matches
(1001, PretokInv003Verdict::Fail),
(10_000, PretokInv003Verdict::Fail),
(u64::MAX, PretokInv003Verdict::Fail),
];
for (declared, expected) in probes {
assert_eq!(
verdict_from_manifest_vs_shards(declared, &sizes),
expected,
"declared={declared} (actual=1000) expected {expected:?}"
);
}
}
}