1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
//! Layer 1 (design spec §3): exhaustive soundness audit for the
//! `quick_check` safe-lead byte set. For every byte in the proposed
//! safe set, enumerate all 3-byte UTF-8 sequences that start with it
//! (4 096 code points per lead) and assert the CCC+QC packed lookup
//! returns `(0, 0)` under every form-shift that claims the byte.
use simd_normalizer::tables_ext::{
CCC_QC_NFC_SHIFT, CCC_QC_NFD_SHIFT, CCC_QC_NFKC_SHIFT, CCC_QC_NFKD_SHIFT, lookup_ccc_qc,
};
// Shift list for lead bytes audited under all four shift constants.
// The order here is the order in which the test tries the shifts.
const ALL_FOUR_SHIFTS: &[u32] = &[
    CCC_QC_NFC_SHIFT,
    CCC_QC_NFD_SHIFT,
    CCC_QC_NFKC_SHIFT,
    CCC_QC_NFKD_SHIFT,
];
// Shift list for lead bytes audited under the NFC and NFKC shifts only.
const NFC_NFKC_SHIFTS: &[u32] = &[CCC_QC_NFC_SHIFT, CCC_QC_NFKC_SHIFT];
// Audit table: each entry is (UTF-8 lead byte, shifts under which that byte
// is claimed safe). 0xEA is not listed, and 0xEB / 0xEC are listed only for
// the NFC and NFKC shifts.
const SAFE: &[(u8, &[u32])] = &[
    (0xE4, ALL_FOUR_SHIFTS),
    (0xE5, ALL_FOUR_SHIFTS),
    (0xE6, ALL_FOUR_SHIFTS),
    (0xE7, ALL_FOUR_SHIFTS),
    (0xE8, ALL_FOUR_SHIFTS),
    (0xE9, ALL_FOUR_SHIFTS),
    (0xEB, NFC_NFKC_SHIFTS),
    (0xEC, NFC_NFKC_SHIFTS),
];
/// Checks every entry of `SAFE` against the packed CCC+QC lookup.
///
/// For each lead byte, its low nibble becomes bits 12..=15 of the code point
/// and the remaining 12 bits are enumerated exhaustively. Values that are not
/// valid `char`s are skipped. The test panics on the first
/// `(code point, shift)` pair whose lookup result is not `(0, 0)`.
#[test]
fn safe_bytes_are_truly_safe() {
    for (lead, shifts) in SAFE.iter().copied() {
        // A 3-byte sequence 1110xxxx 10yyyyyy 10zzzzzz carries xxxx as the top
        // nibble of a 16-bit code point, so one lead byte spans 0x1000 values.
        let first = u32::from(lead & 0x0F) << 12;
        let scalars = (first..first + 0x1000)
            .filter_map(|cp| char::from_u32(cp).map(|c| (cp, c)));
        for (cp, c) in scalars {
            for shift in shifts.iter().copied() {
                assert_eq!(
                    lookup_ccc_qc(c, shift),
                    (0, 0),
                    "lead=0x{:02X} cp=U+{:04X} shift={} not safe",
                    lead,
                    cp,
                    shift
                );
            }
        }
    }
}