Skip to main content

lua_vm/
ctype.rs

1//! Lua ctype — character-classification table and predicates.
2//!
3//! Ported from `reference/lua-5.4.7/src/lctype.c` and `lctype.h`.
4//!
5//! Lua ships its own ctype replacements, optimised for its specific needs.
6//! These do **not** match the standard C `<ctype.h>` semantics exactly; in
7//! particular `lislalpha` / `lislalnum` treat `'_'` as alphabetic, and the
8//! table is seeded for ASCII byte ranges only (with high bytes left at 0x00
9//! unless `LUA_UCID` is enabled — see PORT NOTE below).
10//!
11//! On ASCII targets (`LUA_USE_CTYPE=0`, the default) the implementation is a
12//! 257-entry byte lookup table.  Each entry is a bitfield:
13//!
14//! | bit | name      | meaning                                      |
15//! |-----|-----------|----------------------------------------------|
16//! |  0  | ALPHABIT  | Lua-alphabetic: ASCII letters plus `_`       |
17//! |  1  | DIGITBIT  | decimal digit `0`-`9`                        |
18//! |  2  | PRINTBIT  | printable (graph + space)                    |
19//! |  3  | SPACEBIT  | whitespace (ASCII space, TAB, LF, VT, FF, CR)|
20//! |  4  | XDIGITBIT | hex digit `0`-`9`, `A`-`F`, `a`-`f`         |
21//!
22//! `test_prop(c, mask)` indexes the table as `CTYPE_TABLE[(c + 1) as usize]`,
23//! which allows `c = -1` (the `EOZ` end-of-stream sentinel) without underflow.
24//!
25//! PORT NOTE: The C code supports a compile-time `LUA_UCID` flag that sets all
26//! non-ASCII bytes (0x80-0xFF, minus invalid UTF-8 sequences) to `ALPHABIT`
27//! so that Unicode identifiers are recognised.  That path (`NONA = 0x01`) is
28//! not translated here; only the default `NONA = 0x00` path is ported.
29//! Enable it in Phase B by introducing a Cargo feature flag.
30
/// Bit index of the Lua-alphabetic flag (ASCII letters plus `_`).
///
/// C: `#define ALPHABIT  0`
const ALPHABIT: u32 = 0;

/// Bit index of the decimal-digit flag (`0`-`9`).
///
/// C: `#define DIGITBIT  1`
const DIGITBIT: u32 = 1;

/// Bit index of the printable flag (graphic characters plus ASCII space).
///
/// C: `#define PRINTBIT  2`
const PRINTBIT: u32 = 2;

/// Bit index of the whitespace flag (space, TAB, LF, VT, FF, CR).
///
/// C: `#define SPACEBIT  3`
const SPACEBIT: u32 = 3;

/// Bit index of the hexadecimal-digit flag (`0`-`9`, `A`-`F`, `a`-`f`).
///
/// C: `#define XDIGITBIT 4`
const XDIGITBIT: u32 = 4;
45
46// C: #define MASK(B) (1 << (B))
47// Inlined at each call site below as `1u8 << BIT`.
48
49// C: #define NONA 0x00   /* non-ASCII bytes are not alphabetic by default */
50// LUA_UCID disabled — all non-ASCII bytes remain 0x00.
51
// C: LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = { ... };
//
// Layout: 257 slots (UCHAR_MAX + 2).
//   slot 0        -> EOZ sentinel (c = -1, stored at index c + 1 = 0)
//   slots 1..=256 -> bytes 0x00..=0xFF, each stored at index byte + 1
//
// Instead of spelling out the 257 literals from lctype.c, the table is
// computed at compile time from the ASCII classes those literals encode.
// The resulting values are identical to the C initialiser with NONA = 0x00
// (LUA_UCID disabled):
//
//   0x00  no property                       EOZ, NUL, other control chars,
//                                           DEL, every byte >= 0x80
//   0x08  SPACEBIT                          TAB, LF, VT, FF, CR (0x09-0x0D)
//   0x0c  SPACEBIT | PRINTBIT               ASCII space (0x20)
//   0x04  PRINTBIT                          punctuation and symbols
//   0x16  DIGITBIT | PRINTBIT | XDIGITBIT   '0'-'9'
//   0x15  ALPHABIT | PRINTBIT | XDIGITBIT   'A'-'F', 'a'-'f'
//   0x05  ALPHABIT | PRINTBIT               'G'-'Z', 'g'-'z', '_'
//
// 0x06 (DIGITBIT | PRINTBIT without XDIGITBIT) never occurs: every decimal
// digit is also a hex digit.
pub(crate) static CTYPE_TABLE: [u8; 257] = {
    // Flag values, i.e. `1 << bit` for bit indices 0 through 4.
    const ALPHA: u8 = 0x01;
    const DIGIT: u8 = 0x02;
    const PRINT: u8 = 0x04;
    const SPACE: u8 = 0x08;
    const XDIGIT: u8 = 0x10;

    // Every slot starts at 0x00; slot 0 (EOZ) is never written again.
    let mut table = [0x00u8; 257];

    let mut byte: usize = 0;
    while byte <= 0xFF {
        // Arms are ordered from most to least specific: the final printable
        // range only catches what the earlier arms did not claim.
        table[byte + 1] = match byte as u8 {
            b'\t'..=b'\r' => SPACE,
            b' ' => SPACE | PRINT,
            b'0'..=b'9' => DIGIT | PRINT | XDIGIT,
            b'A'..=b'F' | b'a'..=b'f' => ALPHA | PRINT | XDIGIT,
            b'G'..=b'Z' | b'g'..=b'z' | b'_' => ALPHA | PRINT,
            b'!'..=b'~' => PRINT,
            // Control characters, DEL, and all non-ASCII bytes (NONA = 0x00).
            _ => 0x00,
        };
        byte += 1;
    }

    table
};
153
154// C: #define testprop(c,p) (luai_ctype_[(c)+1] & (p))
155//
156// `c` is an `i32` in Lua's internal representation: it is either a byte value
157// 0-255, or -1 for EOZ.  Adding 1 shifts the range to 0-256, all valid indices
158// into the 257-element table.
159#[inline]
160fn test_prop(c: i32, mask: u8) -> bool {
161    debug_assert!(
162        c >= -1 && c <= 255,
163        "test_prop: c out of range: {}",
164        c
165    );
166    CTYPE_TABLE[(c + 1) as usize] & mask != 0
167}
168
169// C: #define lislalpha(c) testprop(c, MASK(ALPHABIT))
170//
171// True for ASCII letters A-Z, a-z, and the underscore '_'.
172// Includes non-ASCII bytes if LUA_UCID is enabled (not translated here).
173#[inline]
174pub(crate) fn lislalpha(c: i32) -> bool {
175    test_prop(c, 1u8 << ALPHABIT)
176}
177
178// C: #define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
179//
180// True for ASCII letters, digits, and '_'.
181#[inline]
182pub(crate) fn lislalnum(c: i32) -> bool {
183    test_prop(c, (1u8 << ALPHABIT) | (1u8 << DIGITBIT))
184}
185
186// C: #define lisdigit(c) testprop(c, MASK(DIGITBIT))
187//
188// True for ASCII decimal digits '0'-'9'.
189#[inline]
190pub(crate) fn lisdigit(c: i32) -> bool {
191    test_prop(c, 1u8 << DIGITBIT)
192}
193
194// C: #define lisspace(c) testprop(c, MASK(SPACEBIT))
195//
196// True for ASCII whitespace: space (0x20), TAB (0x09), LF (0x0A),
197// VT (0x0B), FF (0x0C), CR (0x0D).
198#[inline]
199pub(crate) fn lisspace(c: i32) -> bool {
200    test_prop(c, 1u8 << SPACEBIT)
201}
202
203// C: #define lisprint(c) testprop(c, MASK(PRINTBIT))
204//
205// True for printable characters: ASCII space through '~' (0x20-0x7E).
206#[inline]
207pub(crate) fn lisprint(c: i32) -> bool {
208    test_prop(c, 1u8 << PRINTBIT)
209}
210
211// C: #define lisxdigit(c) testprop(c, MASK(XDIGITBIT))
212//
213// True for hexadecimal digits: '0'-'9', 'A'-'F', 'a'-'f'.
214#[inline]
215pub(crate) fn lisxdigit(c: i32) -> bool {
216    test_prop(c, 1u8 << XDIGITBIT)
217}
218
// C: #define ltolower(c)  \
//      check_exp(('A' <= (c) && (c) <= 'Z') || (c) == ((c) | ('A' ^ 'a')), \
//                (c) | ('A' ^ 'a'))
//
/// Lowercases an ASCII uppercase letter by setting bit 5 (0x20).
///
/// This is not a general-purpose `tolower`: the result is simply
/// `c | 0x20`. It is only meaningful for `'A'`-`'Z'`, or for values that
/// already have bit 5 set (lowercase letters, `'.'`, ...), which come back
/// unchanged. Debug builds assert that precondition, mirroring the C
/// `check_exp` (`check_exp(c, e)` maps to `{ debug_assert!(c); e }`).
#[inline]
pub(crate) fn ltolower(c: i32) -> i32 {
    // 'A' ^ 'a' = 65 ^ 97 = 32 = 0x20: the single bit separating the cases.
    const CASE_BIT: i32 = ('A' as i32) ^ ('a' as i32);
    const UPPER_FIRST: i32 = 'A' as i32;
    const UPPER_LAST: i32 = 'Z' as i32;

    let lowered = c | CASE_BIT;
    debug_assert!(
        (UPPER_FIRST..=UPPER_LAST).contains(&c) || c == lowered,
        "ltolower: argument must be an uppercase letter or already lowercase/'.'"
    );
    lowered
}
237
238// ──────────────────────────────────────────────────────────────────────────
239// PORT STATUS
240//   source:        src/lctype.c  (64 lines, 0 functions — only a table + header macros)
241//   target_crate:  lua-vm
242//   confidence:    high
243//   todos:         0
244//   port_notes:    1
245//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
246//   notes:         Straightforward table + inline predicates; LUA_UCID path
247//                  omitted (PORT NOTE in module doc). Phase B: add Cargo
248//                  feature `lua-ucid` that substitutes NONA=0x01 for the
249//                  non-ASCII rows.
250// ──────────────────────────────────────────────────────────────────────────