lua_vm/ctype.rs
1//! Lua ctype — character-classification table and predicates.
2//!
3//! Ported from `reference/lua-5.4.7/src/lctype.c` and `lctype.h`.
4//!
5//! Lua ships its own ctype replacements, optimised for its specific needs.
6//! These do **not** match the standard C `<ctype.h>` semantics exactly; in
7//! particular `lislalpha` / `lislalnum` treat `'_'` as alphabetic, and the
8//! table is seeded for ASCII byte ranges only (with high bytes left at 0x00
9//! unless `LUA_UCID` is enabled — see PORT NOTE below).
10//!
11//! On ASCII targets (`LUA_USE_CTYPE=0`, the default) the implementation is a
12//! 257-entry byte lookup table. Each entry is a bitfield:
13//!
14//! | bit | name | meaning |
15//! |-----|-----------|----------------------------------------------|
16//! | 0 | ALPHABIT | Lua-alphabetic: ASCII letters plus `_` |
17//! | 1 | DIGITBIT | decimal digit `0`-`9` |
18//! | 2 | PRINTBIT | printable (graph + space) |
19//! | 3 | SPACEBIT | whitespace (ASCII space, TAB, LF, VT, FF, CR)|
20//! | 4 | XDIGITBIT | hex digit `0`-`9`, `A`-`F`, `a`-`f` |
21//!
22//! `test_prop(c, mask)` indexes the table as `CTYPE_TABLE[(c + 1) as usize]`,
23//! which allows `c = -1` (the `EOZ` end-of-stream sentinel) without underflow.
24//!
//! PORT NOTE: The C code supports a compile-time `LUA_UCID` flag that marks
//! non-ASCII bytes as Lua-alphabetic (table value `NONA = 0x01`, i.e. the
//! `ALPHABIT` mask) so that Unicode identifiers are recognised. Bytes that can
//! never occur in valid UTF-8 (0xC0, 0xC1, 0xF5-0xFF) stay 0x00 even then.
//! That path is not translated here; only the default `NONA = 0x00` path is
//! ported. Enable it in Phase B by introducing a Cargo feature flag.
30
// Bit positions inside one `CTYPE_TABLE` entry. These are shift amounts, not
// masks: the mask for a property is `1u8 << BIT` (the C `MASK(B)` macro).
//
// C: #define ALPHABIT	0
// C: #define DIGITBIT	1
// C: #define PRINTBIT	2
// C: #define SPACEBIT	3
// C: #define XDIGITBIT	4

// Mask 0x01 — Lua-alphabetic: ASCII letters plus '_'.
const ALPHABIT: u32 = 0;
// Mask 0x02 — decimal digit '0'-'9'.
const DIGITBIT: u32 = 1;
// Mask 0x04 — printable (graphic characters plus the ASCII space).
const PRINTBIT: u32 = 2;
// Mask 0x08 — whitespace (space, TAB, LF, VT, FF, CR).
const SPACEBIT: u32 = 3;
// Mask 0x10 — hexadecimal digit '0'-'9', 'A'-'F', 'a'-'f'.
const XDIGITBIT: u32 = 4;
45
46// C: #define MASK(B) (1 << (B))
47// Inlined at each call site below as `1u8 << BIT`.
48
49// C: #define NONA 0x00 /* non-ASCII bytes are not alphabetic by default */
50// LUA_UCID disabled — all non-ASCII bytes remain 0x00.
51
// C: LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = { ... };
//
// Layout: 1 + 256 = 257 entries (UCHAR_MAX + 2).
//   index 0        → the EOZ sentinel (c = -1, looked up at c + 1 = 0)
//   index 1..=256  → bytes 0x00..=0xFF, each stored at `byte + 1`
//
// Values that occur in the table, decoded into property bits:
//   0x00  none                             (EOZ, control chars, DEL, bytes >= 0x80)
//   0x04  PRINTBIT                         (punctuation and symbols)
//   0x05  ALPHABIT | PRINTBIT              (letters outside A-F / a-f, and '_')
//   0x08  SPACEBIT                         (TAB, LF, VT, FF, CR)
//   0x0c  SPACEBIT | PRINTBIT              (ASCII space)
//   0x15  ALPHABIT | PRINTBIT | XDIGITBIT  (A-F, a-f)
//   0x16  DIGITBIT | PRINTBIT | XDIGITBIT  (0-9)
// 0x06 (DIGITBIT | PRINTBIT without XDIGITBIT) never occurs, because every
// decimal digit is also a hex digit.
//
// The C source lists eight values per line; here each row holds one full high
// nibble (16 bytes), i.e. two consecutive C lines. With LUA_UCID disabled,
// NONA is 0x00, so every `NONA` entry of the C table is written as 0x00.
pub(crate) static CTYPE_TABLE: [u8; 257] = [
    // EOZ sentinel (index 0).
    0x00,
    // 0x00-0x0F: control characters; TAB, LF, VT, FF, CR (0x09-0x0D) are whitespace.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00,
    // 0x10-0x1F: control characters, no properties.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x20-0x2F: SPC ! " # $ % & ' ( ) * + , - . /
    0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
    // 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
    0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
    // 0x40-0x4F: @ A B C D E F G H I J K L M N O
    0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
    // 0x60-0x6F: ` a b c d e f g h i j k l m n o
    0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x70-0x7F: p q r s t u v w x y z { | } ~ DEL
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
    // 0x80-0x8F: NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x90-0x9F: NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xA0-0xAF: NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xB0-0xBF: NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xC0-0xCF: 0xC0 and 0xC1 are invalid UTF-8 leading bytes (literal 0x00 in C);
    // 0xC2-0xCF are NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xD0-0xDF: NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xE0-0xEF: NONA in C.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xF0-0xFF: 0xF0-0xF4 are valid UTF-8 four-byte starters (NONA in C);
    // 0xF5-0xFF are invalid UTF-8 (literal 0x00 in C).
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
153
154// C: #define testprop(c,p) (luai_ctype_[(c)+1] & (p))
155//
156// `c` is an `i32` in Lua's internal representation: it is either a byte value
157// 0-255, or -1 for EOZ. Adding 1 shifts the range to 0-256, all valid indices
158// into the 257-element table.
159#[inline]
160fn test_prop(c: i32, mask: u8) -> bool {
161 debug_assert!(
162 c >= -1 && c <= 255,
163 "test_prop: c out of range: {}",
164 c
165 );
166 CTYPE_TABLE[(c + 1) as usize] & mask != 0
167}
168
169// C: #define lislalpha(c) testprop(c, MASK(ALPHABIT))
170//
171// True for ASCII letters A-Z, a-z, and the underscore '_'.
172// Includes non-ASCII bytes if LUA_UCID is enabled (not translated here).
173#[inline]
174pub(crate) fn lislalpha(c: i32) -> bool {
175 test_prop(c, 1u8 << ALPHABIT)
176}
177
178// C: #define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
179//
180// True for ASCII letters, digits, and '_'.
181#[inline]
182pub(crate) fn lislalnum(c: i32) -> bool {
183 test_prop(c, (1u8 << ALPHABIT) | (1u8 << DIGITBIT))
184}
185
186// C: #define lisdigit(c) testprop(c, MASK(DIGITBIT))
187//
188// True for ASCII decimal digits '0'-'9'.
189#[inline]
190pub(crate) fn lisdigit(c: i32) -> bool {
191 test_prop(c, 1u8 << DIGITBIT)
192}
193
194// C: #define lisspace(c) testprop(c, MASK(SPACEBIT))
195//
196// True for ASCII whitespace: space (0x20), TAB (0x09), LF (0x0A),
197// VT (0x0B), FF (0x0C), CR (0x0D).
198#[inline]
199pub(crate) fn lisspace(c: i32) -> bool {
200 test_prop(c, 1u8 << SPACEBIT)
201}
202
203// C: #define lisprint(c) testprop(c, MASK(PRINTBIT))
204//
205// True for printable characters: ASCII space through '~' (0x20-0x7E).
206#[inline]
207pub(crate) fn lisprint(c: i32) -> bool {
208 test_prop(c, 1u8 << PRINTBIT)
209}
210
211// C: #define lisxdigit(c) testprop(c, MASK(XDIGITBIT))
212//
213// True for hexadecimal digits: '0'-'9', 'A'-'F', 'a'-'f'.
214#[inline]
215pub(crate) fn lisxdigit(c: i32) -> bool {
216 test_prop(c, 1u8 << XDIGITBIT)
217}
218
// C: #define ltolower(c)  \
//      check_exp(('A' <= (c) && (c) <= 'Z') || (c) == ((c) | ('A' ^ 'a')),  \
//                (c) | ('A' ^ 'a'))
//
// Lower-cases an ASCII letter by forcing bit 5 on: `'A' ^ 'a'` is
// 65 ^ 97 = 0x20, the single bit that separates the two cases.
//
// This is not a general-purpose `tolower`. The result is only meaningful when
// `c` is an uppercase letter A-Z, or when bit 5 is already set so the
// operation is a no-op (lowercase letters, '.', digits, ...). Debug builds
// assert that precondition, mirroring the C `check_exp`
// (`check_exp(c, e)` → `{ debug_assert!(c); e }` per macros.tsv).
#[inline]
pub(crate) fn ltolower(c: i32) -> i32 {
    let case_bit = i32::from(b'A' ^ b'a');
    let lowered = c | case_bit;
    debug_assert!(
        (i32::from(b'A')..=i32::from(b'Z')).contains(&c) || c == lowered,
        "ltolower: argument must be an uppercase letter or already lowercase/'.'"
    );
    lowered
}
237
238// ──────────────────────────────────────────────────────────────────────────
239// PORT STATUS
240// source: src/lctype.c (64 lines, 0 functions — only a table + header macros)
241// target_crate: lua-vm
242// confidence: high
243// todos: 0
244// port_notes: 1
245// unsafe_blocks: 0 (must be 0 outside explicit unsafe-budget crates)
246// notes: Straightforward table + inline predicates; LUA_UCID path
247// omitted (PORT NOTE in module doc). Phase B: add Cargo
248// feature `lua-ucid` that substitutes NONA=0x01 for the
249// non-ASCII rows.
250// ──────────────────────────────────────────────────────────────────────────