// lua_vm/ctype.rs

//! Lua ctype — character-classification table and predicates.
//!
//! Ported from `reference/lua-5.4.7/src/lctype.c` and `lctype.h`.
//!
//! Lua ships its own ctype replacements, optimised for its specific needs.
//! These do **not** match the standard C `<ctype.h>` semantics exactly; in
//! particular `lislalpha` / `lislalnum` treat `'_'` as alphabetic, and the
//! table is seeded for ASCII byte ranges only (with high bytes left at 0x00
//! unless `LUA_UCID` is enabled — see PORT NOTE below).
//!
//! On ASCII targets (`LUA_USE_CTYPE=0`, the default) the implementation is a
//! 257-entry byte lookup table.  Each entry is a bitfield:
//!
//! | bit | name      | meaning                                       |
//! |-----|-----------|-----------------------------------------------|
//! |  0  | ALPHABIT  | Lua-alphabetic: ASCII letters plus `_`        |
//! |  1  | DIGITBIT  | decimal digit `0`-`9`                         |
//! |  2  | PRINTBIT  | printable (graph + space)                     |
//! |  3  | SPACEBIT  | whitespace (ASCII space, TAB, LF, VT, FF, CR) |
//! |  4  | XDIGITBIT | hex digit `0`-`9`, `A`-`F`, `a`-`f`           |
//!
//! `test_prop(c, mask)` indexes the table as `CTYPE_TABLE[(c + 1) as usize]`,
//! which allows `c = -1` (the `EOZ` end-of-stream sentinel) without underflow.
//!
//! PORT NOTE: The C code supports a compile-time `LUA_UCID` flag that sets all
//! non-ASCII bytes (0x80-0xFF, minus invalid UTF-8 sequences) to `ALPHABIT`
//! so that Unicode identifiers are recognised.  That path (`NONA = 0x01`) is
//! not translated here; only the default `NONA = 0x00` path is ported.
//! Enable it in Phase B by introducing a Cargo feature flag.
30
/// Bit index of the "Lua-alphabetic" flag (ASCII letters plus `_`).
const ALPHABIT: u32 = 0;

/// Bit index of the decimal-digit flag (`0`-`9`).
const DIGITBIT: u32 = 1;

/// Bit index of the printable flag (graphic characters plus ASCII space).
const PRINTBIT: u32 = 2;

/// Bit index of the whitespace flag (ASCII space, TAB, LF, VT, FF, CR).
const SPACEBIT: u32 = 3;

/// Bit index of the hexadecimal-digit flag (`0`-`9`, `A`-`F`, `a`-`f`).
const XDIGITBIT: u32 = 4;
40
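// The C `MASK(B)` macro (`1 << (B)`) has no separate Rust counterpart; it is
// inlined at each call site below as `1u8 << BIT`.

// `NONA` (the table value for non-ASCII bytes) is 0x00 because LUA_UCID is
// disabled, so every non-ASCII byte keeps no properties.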
44
/// Character-property lookup table: one bitfield byte per possible input.
///
/// `UCHAR_MAX + 2 = 255 + 2 = 257` entries:
///
/// * entry 0        → EOZ sentinel (`c = -1`; index = `-1 + 1 = 0`)
/// * entries 1-256  → bytes 0x00-0xFF
///
/// Bit-flag legend (combined values seen in the table):
///
/// * `0x00` = no property (EOZ, NUL, control chars, DEL, high bytes)
/// * `0x04` = PRINTBIT only (punctuation, symbols)
/// * `0x05` = ALPHABIT | PRINTBIT (non-hex letters + `_`)
/// * `0x06` = DIGITBIT | PRINTBIT (never appears on its own: digits always
///   carry XDIGITBIT as well)
/// * `0x08` = SPACEBIT (TAB through CR)
/// * `0x0c` = SPACEBIT | PRINTBIT (ASCII space 0x20)
/// * `0x15` = ALPHABIT | PRINTBIT | XDIGITBIT (`A`-`F`, `a`-`f`)
/// * `0x16` = DIGITBIT | PRINTBIT | XDIGITBIT (`0`-`9`)
pub(crate) static CTYPE_TABLE: [u8; 257] = [
    // EOZ (c = -1)
    0x00,
    // 0x00-0x07
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    //    BS    TAB   LF    VT    FF    CR
    0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00,
    // 0x10-0x1F: control characters
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    //    SPC   !     "     #     $     %     &     '
    0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
    //    (     )     *     +     ,     -     .     /
    0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
    //    0     1     2     3     4     5     6     7
    0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16,
    //    8     9     :     ;     <     =     >     ?
    0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
    //    @     A     B     C     D     E     F     G
    0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05,
    //    H     I     J     K     L     M     N     O
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    //    P     Q     R     S     T     U     V     W
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    //    X     Y     Z     [     \     ]     ^     _
    0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
    //    `     a     b     c     d     e     f     g
    0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05,
    //    h     i     j     k     l     m     n     o
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    //    p     q     r     s     t     u     v     w
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    //    x     y     z     {     |     }     ~     DEL
    0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
    // 0x80-0xBF
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    //    0xC0 and 0xC1 are invalid UTF-8 leading bytes → 0x00
    //    0xC2-0xC7 are valid UTF-8 two-byte sequence starters → NONA (0x00 here)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0xC8-0xEF
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    //    0xF0-0xF4 are valid UTF-8 four-byte starters → NONA (0x00 here)
    //    0xF5-0xF7 are invalid UTF-8 → 0x00
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    //    0xF8-0xFF: all invalid UTF-8 sequences → 0x00
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
112
113//
114// `c` is an `i32` in Lua's internal representation: it is either a byte value
115// 0-255, or -1 for EOZ.  Adding 1 shifts the range to 0-256, all valid indices
116// into the 257-element table.
117#[inline]
118fn test_prop(c: i32, mask: u8) -> bool {
119    debug_assert!(
120        c >= -1 && c <= 255,
121        "test_prop: c out of range: {}",
122        c
123    );
124    CTYPE_TABLE[(c + 1) as usize] & mask != 0
125}
126
127//
128// True for ASCII letters A-Z, a-z, and the underscore '_'.
129// Includes non-ASCII bytes if LUA_UCID is enabled (not translated here).
130#[inline]
131pub(crate) fn lislalpha(c: i32) -> bool {
132    test_prop(c, 1u8 << ALPHABIT)
133}
134
135//
136// True for ASCII letters, digits, and '_'.
137#[inline]
138pub(crate) fn lislalnum(c: i32) -> bool {
139    test_prop(c, (1u8 << ALPHABIT) | (1u8 << DIGITBIT))
140}
141
142//
143// True for ASCII decimal digits '0'-'9'.
144#[inline]
145pub(crate) fn lisdigit(c: i32) -> bool {
146    test_prop(c, 1u8 << DIGITBIT)
147}
148
149//
150// True for ASCII whitespace: space (0x20), TAB (0x09), LF (0x0A),
151// VT (0x0B), FF (0x0C), CR (0x0D).
152#[inline]
153pub(crate) fn lisspace(c: i32) -> bool {
154    test_prop(c, 1u8 << SPACEBIT)
155}
156
157//
158// True for printable characters: ASCII space through '~' (0x20-0x7E).
159#[inline]
160pub(crate) fn lisprint(c: i32) -> bool {
161    test_prop(c, 1u8 << PRINTBIT)
162}
163
164//
165// True for hexadecimal digits: '0'-'9', 'A'-'F', 'a'-'f'.
166#[inline]
167pub(crate) fn lisxdigit(c: i32) -> bool {
168    test_prop(c, 1u8 << XDIGITBIT)
169}
170
/// Converts an uppercase ASCII letter to its lowercase equivalent by setting
/// bit 5 (0x20).
///
/// Port of the C macro
///
/// ```c
/// #define ltolower(c) \
///     check_exp(('A' <= (c) && (c) <= 'Z') || (c) == ((c) | ('A' ^ 'a')), \
///               (c) | ('A' ^ 'a'))
/// ```
///
/// From macros.tsv: `check_exp(c, e)` → `{ debug_assert!(c); e }`.
///
/// Only safe to call on uppercase letters `A`-`Z`, or on characters that
/// already have bit 5 set (lowercase letters, `'.'`, etc.); those are returned
/// unchanged.
///
/// # Panics
///
/// Debug builds assert the precondition above.  Release builds skip the check
/// and simply return `c` with bit 5 set.
#[inline]
pub(crate) fn ltolower(c: i32) -> i32 {
    // `'A' ^ 'a'` = 65 ^ 97 = 32 = 0x20: the one bit separating ASCII cases.
    const CASE_BIT: i32 = (b'A' ^ b'a') as i32;

    debug_assert!(
        (i32::from(b'A')..=i32::from(b'Z')).contains(&c) || c == (c | CASE_BIT),
        "ltolower: argument must be an uppercase letter or already lowercase/'.'"
    );
    c | CASE_BIT
}
188
// ──────────────────────────────────────────────────────────────────────────
// PORT STATUS
//   source:        src/lctype.c  (64 lines, 0 functions — only a table + header macros)
//   target_crate:  lua-vm
//   confidence:    high
//   todos:         0
//   port_notes:    1
//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
//   notes:         Straightforward table + inline predicates; LUA_UCID path
//                  omitted (PORT NOTE in module doc). Phase B: add Cargo
//                  feature `lua-ucid` that substitutes NONA=0x01 for the
//                  non-ASCII rows.
// ──────────────────────────────────────────────────────────────────────────