1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
// SHIP-TWO-001 MODEL-2 — `dataset-thestack-python-v1` (C-DATA-THESTACK-PYTHON)
// algorithm-level PARTIAL discharge for INV-DATA-005.
//
// Contract: `contracts/dataset-thestack-python-v1.yaml` v1.0.0 PROPOSED.
// Spec: `docs/specifications/aprender-train/ship-two-models-spec.md`
// MODEL-2 corpus pipeline (§26.2), AC-SHIP2-002.
//
// ## What INV-DATA-005 says
//
// description: corpus_sha256 declared in manifest matches the
// recomputed merkle-style sha256 over sorted shard
// sha256s. Anyone re-ingesting the same source.revision_sha
// with the same seed gets the same corpus_sha256.
// falsifier: On a second host, re-run ingest with the same
// revision_sha and same seed; compare corpus_sha256.
// Mismatch → FAIL.
//
// ## What this file proves NOW (`PARTIAL_ALGORITHM_LEVEL`)
//
// Decision rule: given two recomputed corpus_sha256 byte-arrays from
// two independent ingest hosts using the same source.revision_sha
// and same seed, Pass iff:
//
// host_a == host_b (byte-identical, all 32 bytes)
//
// AND both digests are well-formed (32 bytes each — SHA-256 output
// length). Composes byte-equality with provenance pinning the
// expected SHA-256 digest length. The contract falsifier ("Mismatch
// → FAIL") admits no near-equality band; reproducibility is binary.
/// Number of bytes in a SHA-256 digest.
///
/// SHA-256 (FIPS 180-4 / RFC 6234) produces a 256-bit output, which is
/// 32 bytes; the value is spelled as `256 / 8` so the derivation is
/// visible at the definition site.
///
/// The verdict function rejects any digest whose length differs from
/// this constant, which surfaces truncated or padded digests and
/// digests from hash functions with a different output width.
///
/// NOTE(review): a length pin alone cannot distinguish SHA-256 from
/// other 256-bit hashes (SHA3-256 and default-length BLAKE3 also emit
/// 32 bytes), so an algorithm swap of that kind must be caught by the
/// contract/manifest, not by this constant.
pub const AC_DATA_INV_005_SHA256_BYTES: usize = 256 / 8;
/// Two-valued outcome of the `INV-DATA-005` reproducibility check.
///
/// There is deliberately no "almost equal" variant: the contract
/// falsifier treats any mismatch as a failure.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DataInv005Verdict {
    /// Both digests have the SHA-256 length (32 bytes) and every byte
    /// matches.
    Pass,
    /// Anything else, i.e. at least one of:
    /// - a digest whose length is not 32 bytes (caller error such as
    ///   a wrong hash function, truncation, or a padding bug);
    /// - two digests that differ in at least one byte
    ///   (reproducibility violation).
    Fail,
}
/// Decides `INV-DATA-005` for a pair of recomputed `corpus_sha256` digests.
///
/// The function is pure: it only inspects the two byte slices it is
/// given and performs no I/O.
///
/// # Arguments
///
/// - `host_a`: `corpus_sha256` bytes produced by the first ingest host.
/// - `host_b`: `corpus_sha256` bytes produced by the second ingest host
///   (expected to be a re-run with the same `source.revision_sha` and
///   the same seed).
///
/// # Returns
///
/// [`DataInv005Verdict::Pass`] only when all three conditions hold:
///
/// 1. `host_a.len() == AC_DATA_INV_005_SHA256_BYTES` (32),
/// 2. `host_b.len() == AC_DATA_INV_005_SHA256_BYTES` (32),
/// 3. `host_a == host_b` (every byte matches).
///
/// In every other case the result is [`DataInv005Verdict::Fail`]. In
/// particular, two equal slices of the wrong length still fail.
///
/// # Examples
///
/// Two independent hosts produce identical 32-byte digest — `Pass`:
/// ```
/// use aprender::format::data_inv_005::{
///     verdict_from_corpus_sha256_pair, DataInv005Verdict,
/// };
/// let host_a = [0xab_u8; 32];
/// let host_b = [0xab_u8; 32];
/// let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
/// assert_eq!(v, DataInv005Verdict::Pass);
/// ```
///
/// Single-byte mismatch — `Fail`:
/// ```
/// use aprender::format::data_inv_005::{
///     verdict_from_corpus_sha256_pair, DataInv005Verdict,
/// };
/// let host_a = [0xab_u8; 32];
/// let mut host_b = [0xab_u8; 32];
/// host_b[0] = 0xac;
/// let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
/// assert_eq!(v, DataInv005Verdict::Fail);
/// ```
#[must_use]
pub fn verdict_from_corpus_sha256_pair(host_a: &[u8], host_b: &[u8]) -> DataInv005Verdict {
    // Length is validated for both inputs before content is compared, so
    // equal-but-wrong-length inputs can never reach the Pass arm.
    let both_well_formed = [host_a, host_b]
        .iter()
        .all(|digest| digest.len() == AC_DATA_INV_005_SHA256_BYTES);

    if both_well_formed && host_a == host_b {
        DataInv005Verdict::Pass
    } else {
        DataInv005Verdict::Fail
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the `INV-DATA-005` verdict function.
    //!
    //! The literal `32` is used on purpose throughout instead of
    //! `AC_DATA_INV_005_SHA256_BYTES`: the tests pin the SHA-256 width
    //! independently of the constant, so changing the constant makes
    //! them fail rather than silently follow.

    use super::*;

    // -------------------------------------------------------------------------
    // Section 1: Provenance pin — SHA-256 is exactly 32 bytes.
    // -------------------------------------------------------------------------

    #[test]
    fn provenance_sha256_byte_length_is_32() {
        assert_eq!(AC_DATA_INV_005_SHA256_BYTES, 32);
    }

    // -------------------------------------------------------------------------
    // Section 2: Pass band — identical digests.
    // -------------------------------------------------------------------------

    #[test]
    fn pass_two_identical_digests_all_zeros() {
        let host_a = [0u8; 32];
        let host_b = [0u8; 32];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Pass);
    }

    #[test]
    fn pass_two_identical_digests_all_ones() {
        let host_a = [0xff_u8; 32];
        let host_b = [0xff_u8; 32];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Pass);
    }

    #[test]
    fn pass_realistic_sha256_pattern() {
        // A non-trivial digest with no repeated-byte structure:
        // SHA-256("abc") =
        // ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
        // (the FIPS 180-4 one-block example). Deliberately different
        // from the empty-string digest used in Section 7.
        let host_a = [
            0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea, 0x41, 0x41, 0x40, 0xde, 0x5d, 0xae,
            0x22, 0x23, 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c, 0xb4, 0x10, 0xff, 0x61,
            0xf2, 0x00, 0x15, 0xad,
        ];
        // Separate storage for the second host, so the comparison is
        // by content rather than between a buffer and itself.
        let host_b = host_a;
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Pass);
    }

    // -------------------------------------------------------------------------
    // Section 3: Fail band — single-byte mismatch (reproducibility violation).
    // -------------------------------------------------------------------------

    #[test]
    fn fail_first_byte_differs() {
        let host_a = [0xab_u8; 32];
        let mut host_b = [0xab_u8; 32];
        host_b[0] = 0xac;
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(
            v,
            DataInv005Verdict::Fail,
            "single-byte mismatch must Fail"
        );
    }

    #[test]
    fn fail_last_byte_differs() {
        let host_a = [0xab_u8; 32];
        let mut host_b = [0xab_u8; 32];
        host_b[31] = 0xac;
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_middle_byte_differs() {
        let host_a = [0xab_u8; 32];
        let mut host_b = [0xab_u8; 32];
        host_b[15] = 0xac;
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_one_bit_differs() {
        // Smallest possible mismatch: one bit flipped at byte 0.
        let host_a = [0u8; 32];
        let mut host_b = [0u8; 32];
        host_b[0] = 0x01;
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_completely_different() {
        let host_a = [0x00_u8; 32];
        let host_b = [0xff_u8; 32];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 4: Fail band — caller errors (wrong digest length).
    // -------------------------------------------------------------------------

    #[test]
    fn fail_host_a_too_short() {
        // Off-by-one boundary: one byte short of a SHA-256 digest.
        let host_a = [0u8; 31];
        let host_b = [0u8; 32];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_host_a_sha1_length() {
        let host_a = [0u8; 20]; // SHA-1 length
        let host_b = [0u8; 32];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_host_b_too_short() {
        let host_a = [0u8; 32];
        let host_b = [0u8; 16]; // MD5 length
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_both_zero_length() {
        let v = verdict_from_corpus_sha256_pair(&[], &[]);
        assert_eq!(
            v,
            DataInv005Verdict::Fail,
            "empty digests must Fail (caller error)"
        );
    }

    #[test]
    fn fail_host_a_too_long() {
        let host_a = [0u8; 64]; // SHA-512 length
        let host_b = [0u8; 32];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_host_b_too_long() {
        // Mirror of `fail_host_a_too_long`: the length guard must be
        // applied to the second argument as well, not only the first.
        let host_a = [0u8; 32];
        let host_b = [0u8; 64]; // SHA-512 length
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(v, DataInv005Verdict::Fail);
    }

    #[test]
    fn fail_both_wrong_length_but_equal() {
        // Two 16-byte arrays that match each other — must still
        // Fail because they aren't SHA-256 length. Catches a
        // regression that would silently accept matching MD5-length
        // (16-byte) digests on the strength of equality alone.
        let host_a = [0xab_u8; 16];
        let host_b = [0xab_u8; 16];
        let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        assert_eq!(
            v,
            DataInv005Verdict::Fail,
            "matching but wrong-length digests must Fail"
        );
    }

    // -------------------------------------------------------------------------
    // Section 5: Boundary sweep — every byte position differing.
    // -------------------------------------------------------------------------

    #[test]
    fn fail_at_every_byte_position() {
        // For each of the 32 byte positions, flipping one byte must
        // Fail. Catches a regression that compares only a prefix or
        // a hash of the digest.
        for pos in 0..32 {
            let host_a = [0xab_u8; 32];
            let mut host_b = [0xab_u8; 32];
            host_b[pos] ^= 0x01;
            let v = verdict_from_corpus_sha256_pair(&host_a, &host_b);
            assert_eq!(
                v,
                DataInv005Verdict::Fail,
                "byte position {pos} flip must Fail"
            );
        }
    }

    // -------------------------------------------------------------------------
    // Section 6: Symmetry — verdict is symmetric in (a, b).
    // -------------------------------------------------------------------------

    #[test]
    fn verdict_is_symmetric() {
        let host_a = [0u8; 32];
        let mut host_b = [0u8; 32];
        host_b[7] = 0xff;
        let ab = verdict_from_corpus_sha256_pair(&host_a, &host_b);
        let ba = verdict_from_corpus_sha256_pair(&host_b, &host_a);
        assert_eq!(ab, ba, "verdict must be symmetric in (a, b)");
        assert_eq!(ab, DataInv005Verdict::Fail);
    }

    // -------------------------------------------------------------------------
    // Section 7: Realistic — well-known SHA-256 of empty string.
    // -------------------------------------------------------------------------

    #[test]
    fn pass_well_known_sha256_empty_string() {
        // SHA-256("") = e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
        // Two independent ingest hosts both producing this digest →
        // Pass.
        let empty_sha256 = [
            0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f,
            0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b,
            0x78, 0x52, 0xb8, 0x55,
        ];
        let v = verdict_from_corpus_sha256_pair(&empty_sha256, &empty_sha256);
        assert_eq!(v, DataInv005Verdict::Pass);
    }
}