Skip to main content

lua_lex/
lib.rs

1//! Lexical analyzer — port of `llex.c` + `llex.h`.
2//!
3//! Provides the Lua 5.4 lexer: character-by-character scanning of a [`ZIO`]
4//! input stream into [`Token`] values, with one-token lookahead.  The
5//! `llex.h` header is merged here per PORTING.md §1.
6//!
7//! # C source files
8//! - `reference/lua-5.4.7/src/llex.c`  (581 lines, 24 functions)
9//! - `reference/lua-5.4.7/src/llex.h`  (91 lines; merged here)
10//!
11//! # Design notes
12//! - `LexState.L` (back-pointer to `lua_State`) is removed.  All functions
13//!   that need `LuaState` receive it as `state: &mut LuaState`.
//! - C's `Token.token` is ported as `Token.kind`, an `i32` in Phase A (matching
//!   the C `int token` field).  Single-byte tokens are their ASCII values;
//!   reserved-word tokens start at `FIRST_RESERVED` (257 in this port).  A
//!   proper `TokenKind` enum is deferred to Phase B.
17//! - `save` / `save_and_next` are now fallible (`Result<(), LuaError>`); the
18//!   `?` operator replaces the C noreturn `lexerror` call on buffer overflow.
19//! - The `goto read_save / only_save / no_save` pattern in `read_string` is
20//!   translated via the local `EscapeResult` enum.
21
22// TODO(port): resolve remaining cross-crate calls (intern_str, table anchor,
23// number parsing, utf8 encoding) in Phase B.  Canonical cross-crate type
24// imports are now in place per harness/type-vocabulary.tsv (see below).
25
26use std::rc::Rc;
27use std::io::Write as IoWrite;
28
29// PORT NOTE: GcRef<T> = Rc<T> in Phases A–C; replaced by real GC pointer in Phase D.
30// TODO(port): move GcRef to lua-types once the GC crate is defined (Phase D).
31use lua_types::gc::GcRef;
32
33// Canonical cross-crate types: imported from owner crates per
34// harness/type-vocabulary.tsv.  See PORTING.md §7.
35pub use lua_types::LuaError;
36pub use lua_types::LuaString;
37pub use lua_vm::state::LuaState;
38pub use lua_vm::table::LuaTable;
39
40/// Placeholder for `LexBuffer` from `lua_vm::zio`.
41/// TODO(port): replace with `use lua_vm::zio::LexBuffer` in Phase B.
42/// C: `Mbuffer` — growable byte buffer for token text.
43/// types.tsv: Mbuffer → LexBuffer
44pub struct LexBuffer {
45    buffer: Vec<u8>,
46}
47
48impl LexBuffer {
49    /// C: `luaZ_initbuffer` — construct an empty buffer.
50    pub fn new() -> Self {
51        LexBuffer { buffer: Vec::new() }
52    }
53
54    /// C: `#define luaZ_bufflen(b) ((b)->n)` — live byte count.
55    /// macros.tsv: luaZ_bufflen → buf.len()
56    pub fn len(&self) -> usize {
57        self.buffer.len()
58    }
59
60    /// C: `#define luaZ_sizebuffer(b) ((b)->buffsize)` — allocated capacity.
61    /// macros.tsv: luaZ_sizebuffer → buf.capacity()
62    pub fn capacity(&self) -> usize {
63        self.buffer.capacity()
64    }
65
66    /// C: `#define luaZ_buffer(b) ((b)->buffer)` — raw byte slice.
67    /// macros.tsv: luaZ_buffer → buf.as_mut_slice()
68    pub fn as_slice(&self) -> &[u8] {
69        &self.buffer
70    }
71
72    /// C: `#define luaZ_resetbuffer(b) ((b)->n = 0)` — reset to zero length.
73    /// macros.tsv: luaZ_resetbuffer → buf.clear()
74    pub fn clear(&mut self) {
75        self.buffer.clear();
76    }
77
78    /// C: `#define luaZ_buffremove(b, i) ((b)->n -= (i))`.
79    /// macros.tsv: luaZ_buffremove → buf.truncate_by(i)
80    pub fn truncate_by(&mut self, i: usize) {
81        let new_len = self.buffer.len().saturating_sub(i);
82        self.buffer.truncate(new_len);
83    }
84
85    /// C: `luaZ_resizebuffer(L, b, newsize)` — grow/shrink the buffer's
86    /// allocated capacity. In C this changes `buffsize`, not the live byte
87    /// count `n`. The Rust analogue therefore manipulates `Vec::capacity`,
88    /// never `Vec::len` (otherwise `push_byte` would write past the live
89    /// content and leave embedded zero padding inside the token text).
90    pub fn resize(&mut self, _state: &mut LuaState, size: usize) -> Result<(), LuaError> {
91        if size < self.buffer.len() {
92            self.buffer.truncate(size);
93        }
94        if size > self.buffer.capacity() {
95            let extra = size - self.buffer.capacity();
96            self.buffer.reserve_exact(extra);
97        }
98        Ok(())
99    }
100
101    /// Append one byte to the live contents.  Panics if capacity exceeded
102    /// (callers must pre-check via `save`).
103    fn push_byte(&mut self, c: u8) {
104        self.buffer.push(c);
105    }
106}
107
108impl Default for LexBuffer {
109    fn default() -> Self {
110        Self::new()
111    }
112}
113
114/// Placeholder for `ZIO` from `lua_vm::zio`.
115/// TODO(port): replace with `use lua_vm::zio::ZIO` in Phase B.
116/// C: `ZIO` — buffered input stream.
117/// types.tsv: Zio → ZIO
118pub struct ZIO {
119    // TODO(port): full ZIO implementation lives in lua_vm::zio; this is a stub.
120    reader: Box<dyn FnMut() -> Option<Vec<u8>>>,
121    n: usize,
122    p: usize,
123    current_chunk: Vec<u8>,
124}
125
126impl ZIO {
127    /// Construct a ZIO from a reader callback that yields successive chunks.
128    pub fn new(reader: Box<dyn FnMut() -> Option<Vec<u8>>>) -> Self {
129        ZIO { reader, n: 0, p: 0, current_chunk: Vec::new() }
130    }
131
132    /// Construct a ZIO that yields the supplied bytes once and then EOZ.
133    pub fn from_bytes(bytes: Vec<u8>) -> Self {
134        let mut once = Some(bytes);
135        ZIO::new(Box::new(move || once.take()))
136    }
137
138    /// C: `#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z))`
139    /// macros.tsv: zgetc → z.getc()
140    pub fn getc(&mut self) -> i32 {
141        if self.n > 0 {
142            self.n -= 1;
143            let b = self.current_chunk[self.p] as u8;
144            self.p += 1;
145            b as i32
146        } else {
147            self.fill()
148        }
149    }
150
151    fn fill(&mut self) -> i32 {
152        match (self.reader)() {
153            None => EOZ,
154            Some(chunk) if chunk.is_empty() => EOZ,
155            Some(chunk) => {
156                self.n = chunk.len() - 1;
157                self.current_chunk = chunk;
158                self.p = 0;
159                let b = self.current_chunk[self.p] as u8;
160                self.p += 1;
161                b as i32
162            }
163        }
164    }
165}
166
167// ── Constants ─────────────────────────────────────────────────────────────────
168
// C: #define FIRST_RESERVED  (UCHAR_MAX + 1)
// macros.tsv: FIRST_RESERVED → const FIRST_RESERVED: i32 = 257
/// First token kind value that is not a single-byte character.
/// Single-byte tokens are represented by their ASCII value (0-255).
///
/// NOTE(review): with an 8-bit `unsigned char`, the C expression
/// `UCHAR_MAX + 1` evaluates to 256, whereas this port uses 257.  The
/// numbering is self-consistent (`TK_AND` is also 257 and `LUAX_TOKENS` is
/// indexed by `token - FIRST_RESERVED`), so the value 256 is simply unused.
/// Confirm whether the difference from the C header is intentional before
/// changing it.
pub const FIRST_RESERVED: i32 = 257;

// C: #define LUA_ENV  "_ENV"
// macros.tsv: LUA_ENV → const LUA_ENV: &[u8] = b"_ENV"
/// Name of the global environment upvalue.
pub const LUA_ENV: &[u8] = b"_ENV";

// C: #define NUM_RESERVED  (cast_int(TK_WHILE - FIRST_RESERVED + 1))
// macros.tsv: NUM_RESERVED → const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize
/// Number of reserved words (keywords).
///
/// Evaluates to 22: the keywords occupy the first 22 entries of
/// `LUAX_TOKENS` (`and` through `while`).
pub const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize;

// C: #define EOZ  (-1)   (from lzio.h)
// macros.tsv: EOZ → const EOZ: i32 = -1
/// End-of-stream sentinel returned by ZIO::getc.
///
/// Negative, so it can never collide with a real byte value (0-255).
pub const EOZ: i32 = -1;

// C: MAX_SIZE (llimits.h)
// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = ...
/// Upper bound used by `save` to reject over-long lexical elements.
///
/// `usize::MAX` when `usize` is narrower than `i64`; otherwise `i64::MAX`
/// converted to `usize`.
const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
    usize::MAX
} else {
    i64::MAX as usize
};

// C: #define LUA_MINBUFFER  32   (llimits.h)
// macros.tsv: LUA_MIN_BUFFER → const LUA_MIN_BUFFER: usize = 32
/// Initial token-buffer capacity requested by `set_input`.
const LUA_MIN_BUFFER: usize = 32;
201
202// ── Token kind constants (ORDER RESERVED — matches C enum RESERVED) ───────────
203//
204// In C these are enum values.  In Rust we use i32 constants for Phase A
205// (faithful to `Token.token: int` in C) with a TODO for a proper enum in Phase B.
206//
207// C: enum RESERVED { TK_AND = FIRST_RESERVED, TK_BREAK, ... }
208
// Each constant is defined as "previous + 1", mirroring how the C enum
// assigns consecutive values.  Inserting or reordering entries here requires
// the same change in `LUAX_TOKENS` (ORDER RESERVED).

/// `and`  (first reserved word; value 257)
pub const TK_AND: i32 = 257;
/// `break`
pub const TK_BREAK: i32 = TK_AND + 1;
/// `do`
pub const TK_DO: i32 = TK_BREAK + 1;
/// `else`
pub const TK_ELSE: i32 = TK_DO + 1;
/// `elseif`
pub const TK_ELSEIF: i32 = TK_ELSE + 1;
/// `end`
pub const TK_END: i32 = TK_ELSEIF + 1;
/// `false`
pub const TK_FALSE: i32 = TK_END + 1;
/// `for`
pub const TK_FOR: i32 = TK_FALSE + 1;
/// `function`
pub const TK_FUNCTION: i32 = TK_FOR + 1;
/// `goto`
pub const TK_GOTO: i32 = TK_FUNCTION + 1;
/// `if`
pub const TK_IF: i32 = TK_GOTO + 1;
/// `in`
pub const TK_IN: i32 = TK_IF + 1;
/// `local`
pub const TK_LOCAL: i32 = TK_IN + 1;
/// `nil`
pub const TK_NIL: i32 = TK_LOCAL + 1;
/// `not`
pub const TK_NOT: i32 = TK_NIL + 1;
/// `or`
pub const TK_OR: i32 = TK_NOT + 1;
/// `repeat`
pub const TK_REPEAT: i32 = TK_OR + 1;
/// `return`
pub const TK_RETURN: i32 = TK_REPEAT + 1;
/// `then`
pub const TK_THEN: i32 = TK_RETURN + 1;
/// `true`
pub const TK_TRUE: i32 = TK_THEN + 1;
/// `until`
pub const TK_UNTIL: i32 = TK_TRUE + 1;
/// `while`  (last keyword; NUM_RESERVED = TK_WHILE - FIRST_RESERVED + 1 = 22)
pub const TK_WHILE: i32 = TK_UNTIL + 1;
/// `//`  (floor division; first non-keyword token, value 279)
pub const TK_IDIV: i32 = TK_WHILE + 1;
/// `..`  (concatenation)
pub const TK_CONCAT: i32 = TK_IDIV + 1;
/// `...` (vararg)
pub const TK_DOTS: i32 = TK_CONCAT + 1;
/// `==`
pub const TK_EQ: i32 = TK_DOTS + 1;
/// `>=`
pub const TK_GE: i32 = TK_EQ + 1;
/// `<=`
pub const TK_LE: i32 = TK_GE + 1;
/// `~=`
pub const TK_NE: i32 = TK_LE + 1;
/// `<<`
pub const TK_SHL: i32 = TK_NE + 1;
/// `>>`
pub const TK_SHR: i32 = TK_SHL + 1;
/// `::`
pub const TK_DBCOLON: i32 = TK_SHR + 1;
/// `<eof>`  (value 289)
pub const TK_EOS: i32 = TK_DBCOLON + 1;
/// `<number>`  (float literal)
pub const TK_FLT: i32 = TK_EOS + 1;
/// `<integer>` (integer literal)
pub const TK_INT: i32 = TK_FLT + 1;
/// `<name>`    (identifier)
pub const TK_NAME: i32 = TK_INT + 1;
/// `<string>`  (string literal; value 293)
pub const TK_STRING: i32 = TK_NAME + 1;
283
// C: static const char *const luaX_tokens [] = { ... };
// ORDER RESERVED — entry 0 corresponds to TK_AND, and so on in enum order.
/// Display strings for tokens, indexed by `token - FIRST_RESERVED`.
///
/// The table has 37 entries: the 22 keywords followed by the 15 other
/// terminal symbols, in exactly the order of the `TK_*` constants.
pub static LUAX_TOKENS: &[&[u8]] = &[
    // keywords (indices 0-21)
    b"and",
    b"break",
    b"do",
    b"else",
    b"elseif",
    b"end",
    b"false",
    b"for",
    b"function",
    b"goto",
    b"if",
    b"in",
    b"local",
    b"nil",
    b"not",
    b"or",
    b"repeat",
    b"return",
    b"then",
    b"true",
    b"until",
    b"while",
    // multi-character operators (indices 22-31)
    b"//",
    b"..",
    b"...",
    b"==",
    b">=",
    b"<=",
    b"~=",
    b"<<",
    b">>",
    b"::",
    // end-of-stream marker and literal classes (indices 32-36)
    b"<eof>",
    b"<number>",
    b"<integer>",
    b"<name>",
    b"<string>",
];
298
299// ── SemInfo / TokenValue ───────────────────────────────────────────────────────
300
// C: typedef union { lua_Number r; lua_Integer i; TString *ts; } SemInfo;
// types.tsv: SemInfo → TokenValue
/// Semantic payload carried by a token.
///
/// Corresponds to `SemInfo` (a C union) in `llex.h`.  In Rust this is a
/// discriminated union (enum), so the active member is recorded in the value
/// itself rather than implied by the token kind.
///
/// # C mapping
/// ```text
/// SemInfo.r   → TokenValue::Float(f64)      (lua_Number)
/// SemInfo.i   → TokenValue::Int(i64)        (lua_Integer)
/// SemInfo.ts  → TokenValue::Str(GcRef<LuaString>)
/// (no C field) → TokenValue::None           (default / unset)
/// ```
#[derive(Clone)]
pub enum TokenValue {
    /// No semantic value (default; used for single-byte and most multi-char tokens).
    None,
    /// Float literal payload.  C: `seminfo.r` (`lua_Number`).
    Float(f64),
    /// Integer literal payload.  C: `seminfo.i` (`lua_Integer`).
    Int(i64),
    /// String/name payload.  C: `seminfo.ts` (`TString *`).
    Str(GcRef<LuaString>),
}
326
327// ── Token ─────────────────────────────────────────────────────────────────────
328
329// C: typedef struct Token { int token; SemInfo seminfo; } Token;
330// types.tsv: Token → Token;  Token.token → i32 (Phase A; TODO: TokenKind enum Phase B)
331/// A single lexed token with its semantic payload.
332///
333/// `kind` is an `i32` whose value is either an ASCII byte code (for single-byte
334/// tokens like `+`, `-`, `[`) or one of the `TK_*` constants (for reserved
335/// words, multi-char symbols, and literals).
336///
337/// TODO(port): Phase B — replace `kind: i32` with a proper `TokenKind` enum
338/// covering both single-byte and named tokens (e.g. `TokenKind::Char(u8)` +
339/// named variants).
340#[derive(Clone)]
341pub struct Token {
342    // C: int token;
343    pub kind: i32,
344    // C: SemInfo seminfo;
345    pub value: TokenValue,
346}
347
348impl Token {
349    /// Construct a token with no semantic value.
350    pub fn new(kind: i32) -> Self {
351        Token { kind, value: TokenValue::None }
352    }
353
354    /// The end-of-stream sentinel token.
355    pub fn eos() -> Self {
356        Token::new(TK_EOS)
357    }
358}
359
360// ── LexState ──────────────────────────────────────────────────────────────────
361
// C: typedef struct LexState { ... } LexState;
// types.tsv: LexState → LexState;  LexState.L removed (thread via &mut LuaState)
/// Per-chunk lexer (and shared parser) state.
///
/// Corresponds to `LexState` in `llex.h`.  Owns the input stream, token
/// buffer, and current/lookahead tokens.
///
/// # C mapping (types.tsv)
/// The left column is the C field; the right column is the field as it is
/// declared in this struct today.  Entries marked "placeholder" stand in for
/// a type that has not been ported yet (see the TODOs on the fields).
/// ```text
/// LexState.current    → current: i32        (charint; -1 = EOZ)
/// LexState.linenumber → linenumber: i32
/// LexState.lastline   → lastline: i32
/// LexState.t          → t: Token            (current token)
/// LexState.lookahead  → lookahead: Token    (one-token lookahead)
/// LexState.fs         → fs: Option<()>      (placeholder; target Option<Box<FuncState>>)
/// LexState.L          → (removed; callers pass &mut LuaState)
/// LexState.z          → z: ZIO              (owned input stream)
/// LexState.buff       → buff: LexBuffer     (owned token-text buffer)
/// LexState.h          → h: Option<GcRef<LuaTable>>  (string-anchor table)
/// (no C field)        → long_str_anchor: HashMap<Vec<u8>, GcRef<LuaString>>
/// LexState.dyd        → dyd: Option<()>     (placeholder; target DynData)
/// LexState.source     → source: GcRef<LuaString>
/// LexState.envn       → envn: GcRef<LuaString>
/// ```
pub struct LexState {
    // C: int current;  /* current character (charint) */
    pub current: i32,
    // C: int linenumber;  /* input line counter */
    pub linenumber: i32,
    // C: int lastline;  /* line of last token 'consumed' */
    pub lastline: i32,
    // C: Token t;  /* current token */
    pub t: Token,
    // C: Token lookahead;  /* look ahead token */
    pub lookahead: Token,
    // C: struct FuncState *fs;  /* current function (parser) */
    // TODO(port): Box<FuncState> once FuncState lands in lua-parse (Phase B)
    pub fs: Option<()>,
    // C: ZIO *z;  /* input stream */
    // PORT NOTE: C held a pointer; Rust owns the ZIO directly per types.tsv.
    pub z: ZIO,
    // C: Mbuffer *buff;  /* buffer for tokens */
    // PORT NOTE: C held a pointer; Rust owns the LexBuffer directly per types.tsv.
    pub buff: LexBuffer,
    // C: Table *h;  /* to avoid collection/reuse strings */
    // TODO(port): make this a non-optional GcRef<LuaTable> in Phase B
    pub h: Option<GcRef<LuaTable>>,
    /// Per-parse-session anchor for long strings. C-Lua's `ls->h` is a Lua
    /// table that deduplicates all literal strings within a chunk (both short
    /// and long), so e.g. `local s1 <const>="..."` and `local s2 <const>="..."`
    /// with identical 50-byte payloads share one `TString` object — which is
    /// what makes `string.format("%p", s1) == string.format("%p", s2)` hold.
    /// Short strings already share identity via the global `interned_lt` pool,
    /// but long strings (>LUAI_MAXSHORTLEN = 40) are not globally interned and
    /// need this session-level map. Keyed by the string bytes; populated lazily
    /// by `new_string`.
    pub long_str_anchor: std::collections::HashMap<Vec<u8>, GcRef<LuaString>>,
    // C: struct Dyndata *dyd;  /* dynamic structures used by the parser */
    // TODO(port): DynData once parser types land in Phase B
    pub dyd: Option<()>,
    // C: TString *source;  /* current source name */
    pub source: GcRef<LuaString>,
    // C: TString *envn;  /* environment variable name */
    pub envn: GcRef<LuaString>,
}
426
427// ── Character-classification helpers ─────────────────────────────────────────
428//
429// C: `lctype.h` — Lua's own ctype table.
430// These are simplified ASCII implementations for Phase A.
431// TODO(port): import from lua_vm::ctype in Phase B; the full table handles
432// the LUA_UCID (Unicode identifiers) flag and matches the C bit-table exactly.
433//
434// PORT NOTE: the C macros take `int` (not `char`) so they handle EOZ (-1) safely.
435// These Rust fns match that contract: EOZ returns false for all predicates.
436
// C: #define lisdigit(c)   (testprop(c, DIGITBIT))
/// True when `c` is an ASCII decimal digit (`'0'..='9'`).
/// `EOZ` and any other out-of-range value yield `false`.
#[inline]
fn is_digit(c: i32) -> bool {
    (i32::from(b'0')..=i32::from(b'9')).contains(&c)
}
442
// C: #define lisxdigit(c)  (testprop(c, XDIGITBIT))
/// True when `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
/// `EOZ` and any other out-of-range value yield `false`.
#[inline]
fn is_xdigit(c: i32) -> bool {
    let decimal = i32::from(b'0')..=i32::from(b'9');
    let lower = i32::from(b'a')..=i32::from(b'f');
    let upper = i32::from(b'A')..=i32::from(b'F');
    decimal.contains(&c) || lower.contains(&c) || upper.contains(&c)
}
450
// C: #define lislalpha(c)  (testprop(c, ALPHABIT))
// ALPHABIT: ASCII letters + '_'
/// True when `c` may start a Lua identifier: an ASCII letter or `_`.
/// `EOZ` and any other out-of-range value yield `false`.
#[inline]
fn is_lalpha(c: i32) -> bool {
    if c == i32::from(b'_') {
        return true;
    }
    (i32::from(b'a')..=i32::from(b'z')).contains(&c)
        || (i32::from(b'A')..=i32::from(b'Z')).contains(&c)
}
459
460// C: #define lislalnum(c)  (testprop(c, ALPHABIT|DIGITBIT))
461#[inline]
462fn is_lalnum(c: i32) -> bool {
463    is_lalpha(c) || is_digit(c)
464}
465
// C: #define lisspace(c)   (testprop(c, SPACEBIT))
/// True when `c` is ASCII whitespace: space, or one of the control
/// characters `\t \n \v \f \r` (codes 9 through 13).
#[inline]
fn is_space(c: i32) -> bool {
    c == i32::from(b' ') || (9..=13).contains(&c)
}
471
// C: #define lisprint(c)   (testprop(c, PRINTBIT))
// PRINTBIT: printable ASCII (graph + space), i.e. 0x20-0x7E
/// True when `c` is a printable ASCII character, space included.
#[inline]
fn is_print(c: i32) -> bool {
    (0x20..=0x7E).contains(&c)
}
478
479// C: #define currIsNewline(ls)  (ls->current == '\n' || ls->current == '\r')
480#[inline]
481fn curr_is_newline(ls: &LexState) -> bool {
482    ls.current == b'\n' as i32 || ls.current == b'\r' as i32
483}
484
485// ── Low-level stream helpers ───────────────────────────────────────────────────
486
487// C: #define next(ls)  (ls->current = zgetc(ls->z))
488/// Advance the lexer by one character.
489///
490/// Corresponds to the `next(ls)` macro.  Named `advance` to avoid collision
491/// with Rust's iterator method.
492#[inline]
493fn advance(ls: &mut LexState) {
494    // C: ls->current = zgetc(ls->z)
495    // macros.tsv: zgetc → z.getc()
496    ls.current = ls.z.getc();
497}
498
499// C: static void save (LexState *ls, int c) { ... }
500/// Append character `c` to the token buffer, growing it if necessary.
501///
502/// On overflow calls [`lex_error`] which becomes `Err(LuaError::Syntax(...))`.
503///
504/// # C source
505/// ```c
506/// // C: static void save (LexState *ls, int c) {
507/// //   Mbuffer *b = ls->buff;
508/// //   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
509/// //     size_t newsize;
510/// //     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
511/// //       lexerror(ls, "lexical element too long", 0);
512/// //     newsize = luaZ_sizebuffer(b) * 2;
513/// //     luaZ_resizebuffer(ls->L, b, newsize);
514/// //   }
515/// //   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
516/// // }
517/// ```
518fn save(ls: &mut LexState, state: &mut LuaState, c: i32) -> Result<(), LuaError> {
519    // C: if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b))
520    // macros.tsv: luaZ_bufflen → buf.len(); luaZ_sizebuffer → buf.capacity()
521    if ls.buff.len() + 1 > ls.buff.capacity() {
522        // C: if (luaZ_sizebuffer(b) >= MAX_SIZE/2) lexerror(...)
523        if ls.buff.capacity() >= MAX_SIZE / 2 {
524            return Err(lex_error(ls, b"lexical element too long", 0));
525        }
526        // C: newsize = luaZ_sizebuffer(b) * 2;
527        //    luaZ_resizebuffer(ls->L, b, newsize);
528        // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
529        let newsize = ls.buff.capacity() * 2;
530        ls.buff.resize(state, newsize)?;
531    }
532    // C: b->buffer[luaZ_bufflen(b)++] = cast_char(c);
533    // macros.tsv: cast_char → x as i8  (C char is signed; Lua bytes stored as-is)
534    // PORT NOTE: we store the byte value directly; the i8 cast in C is for the
535    // C char type but the data is read back as unsigned via cast_uchar everywhere.
536    ls.buff.push_byte(c as u8);
537    Ok(())
538}
539
540// C: #define save_and_next(ls) (save(ls, ls->current), next(ls))
541/// Save the current character into the token buffer, then advance the stream.
542///
543/// Corresponds to the `save_and_next(ls)` macro.  Fallible because `save`
544/// may need to grow the buffer.
545#[inline]
546fn save_and_next(ls: &mut LexState, state: &mut LuaState) -> Result<(), LuaError> {
547    // C: save(ls, ls->current)
548    let c = ls.current;
549    save(ls, state, c)?;
550    // C: next(ls)
551    advance(ls);
552    Ok(())
553}
554
555// ── Error helpers ─────────────────────────────────────────────────────────────
556
557// C: static l_noret lexerror (LexState *ls, const char *msg, int token)
558// l_noret → -> !  but in Rust we return LuaError (callers wrap in Err(...))
559// error_sites.tsv: luaX_lexerror → return Err(LuaError::syntax_at(ls, "msg", token))
560/// Build a syntax error, optionally annotated with the offending token text.
561///
562/// Corresponds to the static `lexerror` function in `llex.c`.  In C this is
563/// `l_noret` (diverges via `luaD_throw`); in Rust it returns a `LuaError`
564/// value that callers wrap in `Err(...)`.
565///
566/// # C source
567/// ```c
568/// // C: static l_noret lexerror (LexState *ls, const char *msg, int token) {
569/// //   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
570/// //   if (token)
571/// //     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
572/// //   luaD_throw(ls->L, LUA_ERRSYNTAX);
573/// // }
574/// ```
575pub fn lex_error(ls: &mut LexState, msg: &[u8], token: i32) -> LuaError {
576    // C: msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
577    const LUA_IDSIZE: usize = 60;
578    let mut buff = [0u8; LUA_IDSIZE];
579    let n = lua_vm::object::chunk_id(&mut buff[..], ls.source.as_bytes());
580    let src_part = &buff[..n];
581
582    let mut full_msg: Vec<u8> = Vec::new();
583    full_msg.extend_from_slice(src_part);
584    let _ = write!(full_msg, ":{}: ", ls.linenumber);
585    full_msg.extend_from_slice(msg);
586
587    // C: if (token) luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
588    if token != 0 {
589        let tok_text = txt_token(ls, token);
590        full_msg.extend_from_slice(b" near ");
591        full_msg.extend_from_slice(&tok_text);
592    }
593
594    LuaError::syntax_raw(&full_msg)
595}
596
597// C: l_noret luaX_syntaxerror (LexState *ls, const char *msg)
598// LUAI_FUNC → pub(crate)
599// error_sites.tsv: luaX_syntaxerror → return Err(LuaError::syntax(format_args!("msg")))
600/// Report a syntax error at the current token.
601///
602/// # C source
603/// ```c
604/// // C: l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
605/// //   lexerror(ls, msg, ls->t.token);
606/// // }
607/// ```
608pub fn syntax_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
609    // C: lexerror(ls, msg, ls->t.token);
610    let token = ls.t.kind;
611    lex_error(ls, msg, token)
612}
613
614// C: static const char *txtToken (LexState *ls, int token)
615/// Produce a human-readable representation of `token` for error messages.
616///
617/// For `TK_NAME`, `TK_STRING`, `TK_FLT`, `TK_INT`: formats the current
618/// token buffer contents as `'<text>'`.  For everything else, delegates to
619/// [`token2str`].
620///
621/// # C source
622/// ```c
623/// // C: static const char *txtToken (LexState *ls, int token) {
624/// //   switch (token) {
625/// //     case TK_NAME: case TK_STRING:
626/// //     case TK_FLT: case TK_INT:
627/// //       save(ls, '\0');
628/// //       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
629/// //     default:
630/// //       return luaX_token2str(ls, token);
631/// //   }
632/// // }
633/// ```
634///
635/// PORT NOTE: C calls `luaO_pushfstring` which pushes the string onto the
636/// Lua stack (stack-anchored temporary).  Rust returns `Vec<u8>` directly
637/// since there is no stack-based string lifecycle for error formatting.
638fn txt_token(ls: &mut LexState, token: i32) -> Vec<u8> {
639    match token {
640        // C: case TK_NAME: case TK_STRING: case TK_FLT: case TK_INT:
641        t if t == TK_NAME || t == TK_STRING || t == TK_FLT || t == TK_INT => {
642            let mut v: Vec<u8> = Vec::new();
643            v.push(b'\'');
644            let buff = ls.buff.as_slice();
645            let trimmed = if buff.last() == Some(&0) { &buff[..buff.len() - 1] } else { buff };
646            v.extend_from_slice(trimmed);
647            v.push(b'\'');
648            v
649        }
650        // C: default: return luaX_token2str(ls, token);
651        _ => token2str_raw(token),
652    }
653}
654
// C: const char *luaX_token2str (LexState *ls, int token)
// LUAI_FUNC → pub(crate)
/// Produce a human-readable token description (for error messages and the parser).
///
/// * Tokens below `FIRST_RESERVED` that are printable ASCII are formatted as
///   `'X'`; other tokens below `FIRST_RESERVED` as `'<\N>'`, where `N` is the
///   decimal token value.
/// * Reserved words and multi-char symbols (tokens below `TK_EOS`) are
///   formatted as `'kw'`.
/// * `TK_EOS` and the literal classes return the bare label (`<eof>`,
///   `<number>`, `<integer>`, `<name>`, `<string>`).
///
/// # C source
/// ```c
/// // C: const char *luaX_token2str (LexState *ls, int token) {
/// //   if (token < FIRST_RESERVED) {
/// //     if (lisprint(token))
/// //       return luaO_pushfstring(ls->L, "'%c'", token);
/// //     else
/// //       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
/// //   }
/// //   else {
/// //     const char *s = luaX_tokens[token - FIRST_RESERVED];
/// //     if (token < TK_EOS)
/// //       return luaO_pushfstring(ls->L, "'%s'", s);
/// //     else
/// //       return s;
/// //   }
/// // }
/// ```
///
/// PORT NOTE: The `LexState` parameter is retained in the signature for API
/// parity with the C export, but is unused in Rust because we don't push onto
/// the Lua stack.  The real formatting is in [`token2str_raw`].
pub fn token2str(_ls: &LexState, token: i32) -> Vec<u8> {
    token2str_raw(token)
}
688
689/// Inner implementation of [`token2str`] that does not need `LexState`.
690fn token2str_raw(token: i32) -> Vec<u8> {
691    if token < FIRST_RESERVED {
692        // C: if (lisprint(token)) return "'%c'"; else return "'<\\%d>'"
693        if is_print(token) {
694            // C: luaO_pushfstring(ls->L, "'%c'", token)
695            vec![b'\'', token as u8, b'\'']
696        } else {
697            // C: luaO_pushfstring(ls->L, "'<\\%d>'", token)
698            // PORT NOTE: uses write! to Vec<u8> to avoid String allocation for Lua data.
699            let mut v: Vec<u8> = Vec::new();
700            v.extend_from_slice(b"'<\\");
701            let _ = write!(&mut v, "{}", token);
702            v.extend_from_slice(b">'");
703            v
704        }
705    } else {
706        let idx = (token - FIRST_RESERVED) as usize;
707        let s = LUAX_TOKENS[idx];
708        if token < TK_EOS {
709            // C: luaO_pushfstring(ls->L, "'%s'", s)  — wrap in single quotes
710            let mut v: Vec<u8> = Vec::with_capacity(s.len() + 2);
711            v.push(b'\'');
712            v.extend_from_slice(s);
713            v.push(b'\'');
714            v
715        } else {
716            // C: return s  — bare label like "<name>", "<eof>"
717            s.to_vec()
718        }
719    }
720}
721
722// ── Public init / setup ───────────────────────────────────────────────────────
723
724// C: void luaX_init (lua_State *L)
725// LUAI_FUNC → pub(crate)
726/// Initialise the lexer subsystem: intern all reserved words and fix them
727/// in the GC so they are never collected.
728///
729/// Must be called exactly once during VM startup via `luaX_init`.
730///
731/// # C source
732/// ```c
733/// // C: void luaX_init (lua_State *L) {
734/// //   int i;
735/// //   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
736/// //   luaC_fix(L, obj2gco(e));  /* never collect this name */
737/// //   for (i=0; i<NUM_RESERVED; i++) {
738/// //     TString *ts = luaS_new(L, luaX_tokens[i]);
739/// //     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
740/// //     ts->extra = cast_byte(i+1);  /* reserved word */
741/// //   }
742/// // }
743/// ```
744pub fn init(state: &mut LuaState) -> Result<(), LuaError> {
745    // C: TString *e = luaS_newliteral(L, LUA_ENV);
746    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
747    // TODO(port): call state.intern_str(LUA_ENV) once LuaState has that method (Phase B)
748    let _e = intern_str_stub(state, LUA_ENV)?;
749
750    // C: luaC_fix(L, obj2gco(e));  /* never collect this name */
751    // macros.tsv: luaC_objbarrier / luaC_fix — GC fix; no-op in Phases A-C
752    // TODO(port): state.gc().fix(e) in Phase D
753
754    for i in 0..NUM_RESERVED {
755        // C: TString *ts = luaS_new(L, luaX_tokens[i]);
756        // macros.tsv: luaS_new → state.intern_str(...)
757        // TODO(port): call state.intern_str(LUAX_TOKENS[i]) in Phase B
758        let ts = intern_str_stub(state, LUAX_TOKENS[i])?;
759
760        // C: luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
761        // TODO(port): state.gc().fix(ts.clone()) in Phase D
762
763        // C: ts->extra = cast_byte(i+1);  /* reserved word */
764        // macros.tsv: cast_byte → x as u8
765        // PORT NOTE: LuaString.extra uses Cell<u8> interior mutability.
766        // TODO(port): ts.set_extra((i + 1) as u8) — needs pub accessor on LuaString
767        let _ = ts; // suppress unused warning until Phase B
768    }
769
770    Ok(())
771}
772
773// C: void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, int firstchar)
774// LUAI_FUNC → pub(crate)
775/// Initialise `ls` for lexing a new chunk from stream `z`.
776///
777/// # C source
778/// ```c
779/// // C: void luaX_setinput (lua_State *L, LexState *ls, ZIO *z,
780/// //                         TString *source, int firstchar) {
781/// //   ls->t.token = 0;
782/// //   ls->L = L;
783/// //   ls->current = firstchar;
784/// //   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
785/// //   ls->z = z;
786/// //   ls->fs = NULL;
787/// //   ls->linenumber = 1;
788/// //   ls->lastline = 1;
789/// //   ls->source = source;
790/// //   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
791/// //   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);
792/// // }
793/// ```
794pub fn set_input(
795    state: &mut LuaState,
796    ls: &mut LexState,
797    z: ZIO,
798    source: GcRef<LuaString>,
799    firstchar: i32,
800) -> Result<(), LuaError> {
801    // C: ls->t.token = 0;
802    ls.t = Token::new(0);
803    // C: ls->L = L;  — removed; state is threaded via fn params
804    // C: ls->current = firstchar;
805    ls.current = firstchar;
806    // C: ls->lookahead.token = TK_EOS;
807    ls.lookahead = Token::eos();
808    // C: ls->z = z;
809    ls.z = z;
810    // C: ls->fs = NULL;
811    ls.fs = None;
812    // C: ls->linenumber = 1;
813    ls.linenumber = 1;
814    // C: ls->lastline = 1;
815    ls.lastline = 1;
816    // C: ls->source = source;
817    ls.source = source;
818    // C: ls->envn = luaS_newliteral(L, LUA_ENV);
819    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
820    // TODO(port): state.intern_str(LUA_ENV) in Phase B
821    ls.envn = intern_str_stub(state, LUA_ENV)?;
822    // C: luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);
823    // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
824    ls.buff.resize(state, LUA_MIN_BUFFER)?;
825    Ok(())
826}
827
828// C: TString *luaX_newstring (LexState *ls, const char *str, size_t l)
829// LUAI_FUNC → pub(crate)
830/// Create (or retrieve) a Lua string and anchor it in the parser's GC-protection
831/// table `ls.h` so it cannot be collected before the end of compilation.
832///
833/// Also internalises long strings so that each unique content has exactly one
834/// copy in memory.  The table `ls.h` is used as a set: the string is both the
835/// key and the value.
836///
837/// # C source
838/// ```c
839/// // C: TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
840/// //   lua_State *L = ls->L;
841/// //   TString *ts = luaS_newlstr(L, str, l);
842/// //   const TValue *o = luaH_getstr(ls->h, ts);
843/// //   if (!ttisnil(o))  /* string already present? */
844/// //     ts = keystrval(nodefromval(o));  /* get saved copy */
845/// //   else {
846/// //     TValue *stv = s2v(L->top.p++);  /* reserve stack space */
847/// //     setsvalue(L, stv, ts);           /* anchor the string */
848/// //     luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
849/// //     luaC_checkGC(L);
850/// //     L->top.p--;                       /* remove string from stack */
851/// //   }
852/// //   return ts;
853/// // }
854/// ```
855pub(crate) fn new_string(
856    state: &mut LuaState,
857    ls: &mut LexState,
858    bytes: &[u8],
859) -> Result<GcRef<LuaString>, LuaError> {
860    // C: const TValue *o = luaH_getstr(ls->h, ts); if (!ttisnil(o)) ts = ...
861    // PORT NOTE: in C, the anchor table ls->h is a Lua table mapping the string
862    // to itself so a second occurrence of the same literal in the chunk returns
863    // the originally-created TString. We use a plain HashMap on LexState
864    // (`long_str_anchor`) for the equivalent dedup — sufficient because Phase
865    // A-C `GcRef<T>` is `Rc<T>` and identity is determined by the `Rc`
866    // allocation. Short strings already share identity via the global pool;
867    // long strings (>LUAI_MAXSHORTLEN) need this session-level map.
868    if let Some(existing) = ls.long_str_anchor.get(bytes) {
869        return Ok(existing.clone());
870    }
871    // C: TString *ts = luaS_newlstr(L, str, l);
872    let ts = intern_str_stub(state, bytes)?;
873    ls.long_str_anchor.insert(bytes.to_vec(), ts.clone());
874    Ok(ts)
875}
876
877// ── Public advance / lookahead ─────────────────────────────────────────────────
878
879// C: void luaX_next (LexState *ls)
880// LUAI_FUNC → pub(crate)
881/// Consume the current token; load the next one from the stream.
882///
883/// If a lookahead token was set, it becomes the current token without re-reading
884/// from the stream.
885///
886/// # C source
887/// ```c
888/// // C: void luaX_next (LexState *ls) {
889/// //   ls->lastline = ls->linenumber;
890/// //   if (ls->lookahead.token != TK_EOS) {
891/// //     ls->t = ls->lookahead;
892/// //     ls->lookahead.token = TK_EOS;
893/// //   }
894/// //   else
895/// //     ls->t.token = llex(ls, &ls->t.seminfo);
896/// // }
897/// ```
898pub fn next(
899    state: &mut LuaState,
900    ls: &mut LexState,
901) -> Result<(), LuaError> {
902    // C: ls->lastline = ls->linenumber;
903    ls.lastline = ls.linenumber;
904
905    // C: if (ls->lookahead.token != TK_EOS)
906    if ls.lookahead.kind != TK_EOS {
907        // C: ls->t = ls->lookahead;
908        // Clone to avoid borrow conflict; LuaString inside TokenValue is GcRef (Rc).
909        ls.t = ls.lookahead.clone();
910        // C: ls->lookahead.token = TK_EOS;
911        ls.lookahead = Token::eos();
912    } else {
913        // C: ls->t.token = llex(ls, &ls->t.seminfo);
914        let mut val = TokenValue::None;
915        let kind = llex(state, ls, &mut val)?;
916        ls.t = Token { kind, value: val };
917    }
918    Ok(())
919}
920
921// C: int luaX_lookahead (LexState *ls)
922// LUAI_FUNC → pub(crate)
923/// Peek at the next token without consuming the current one.
924///
925/// The lookahead token is cached in `ls.lookahead` and returned.  Only one
926/// token of lookahead is supported; calling this twice without an intervening
927/// [`next`] is a logic error (asserted in debug builds).
928///
929/// # C source
930/// ```c
931/// // C: int luaX_lookahead (LexState *ls) {
932/// //   lua_assert(ls->lookahead.token == TK_EOS);
933/// //   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
934/// //   return ls->lookahead.token;
935/// // }
936/// ```
937pub fn lookahead(
938    state: &mut LuaState,
939    ls: &mut LexState,
940) -> Result<i32, LuaError> {
941    // C: lua_assert(ls->lookahead.token == TK_EOS);
942    // macros.tsv: lua_assert → debug_assert!
943    debug_assert!(
944        ls.lookahead.kind == TK_EOS,
945        "luaX_lookahead: lookahead already set"
946    );
947
948    // C: ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
949    let mut val = TokenValue::None;
950    let kind = llex(state, ls, &mut val)?;
951    ls.lookahead = Token { kind, value: val };
952
953    // C: return ls->lookahead.token;
954    Ok(ls.lookahead.kind)
955}
956
957// ── Private lexer helpers ──────────────────────────────────────────────────────
958
959// C: static int check_next1 (LexState *ls, int c)
960/// If the current character equals `c`, advance and return `true`.
961///
962/// # C source
963/// ```c
964/// // C: static int check_next1 (LexState *ls, int c) {
965/// //   if (ls->current == c) { next(ls); return 1; }
966/// //   else return 0;
967/// // }
968/// ```
969fn check_next1(ls: &mut LexState, c: i32) -> bool {
970    if ls.current == c {
971        // C: next(ls)
972        advance(ls);
973        true
974    } else {
975        false
976    }
977}
978
979// C: static int check_next2 (LexState *ls, const char *set)
980/// If the current character is either of the two bytes in `set`, save-and-advance
981/// and return `true`.
982///
983/// # C source
984/// ```c
985/// // C: static int check_next2 (LexState *ls, const char *set) {
986/// //   lua_assert(set[2] == '\0');
987/// //   if (ls->current == set[0] || ls->current == set[1]) {
988/// //     save_and_next(ls);
989/// //     return 1;
990/// //   }
991/// //   else return 0;
992/// // }
993/// ```
994fn check_next2(
995    ls: &mut LexState,
996    state: &mut LuaState,
997    set: &[u8; 2],
998) -> Result<bool, LuaError> {
999    // C: lua_assert(set[2] == '\0');  — guaranteed by [u8;2] type
1000    if ls.current == set[0] as i32 || ls.current == set[1] as i32 {
1001        // C: save_and_next(ls)
1002        save_and_next(ls, state)?;
1003        Ok(true)
1004    } else {
1005        Ok(false)
1006    }
1007}
1008
1009// C: static void inclinenumber (LexState *ls)
1010/// Increment the line counter and consume the newline sequence.
1011///
1012/// Handles `\n`, `\r`, `\n\r`, and `\r\n`.
1013///
1014/// # C source
1015/// ```c
1016/// // C: static void inclinenumber (LexState *ls) {
1017/// //   int old = ls->current;
1018/// //   lua_assert(currIsNewline(ls));
1019/// //   next(ls);  /* skip '\n' or '\r' */
1020/// //   if (currIsNewline(ls) && ls->current != old)
1021/// //     next(ls);  /* skip '\n\r' or '\r\n' */
1022/// //   if (++ls->linenumber >= MAX_INT)
1023/// //     lexerror(ls, "chunk has too many lines", 0);
1024/// // }
1025/// ```
1026fn inc_line_number(ls: &mut LexState, _state: &mut LuaState) -> Result<(), LuaError> {
1027    // C: lua_assert(currIsNewline(ls))
1028    // macros.tsv: lua_assert → debug_assert!
1029    debug_assert!(curr_is_newline(ls), "inc_line_number: not at a newline");
1030
1031    let old = ls.current;
1032    // C: next(ls)  — skip '\n' or '\r'
1033    advance(ls);
1034
1035    // C: if (currIsNewline(ls) && ls->current != old) next(ls)
1036    if curr_is_newline(ls) && ls.current != old {
1037        advance(ls);
1038    }
1039
1040    // C: if (++ls->linenumber >= MAX_INT) lexerror(...)
1041    // macros.tsv: MAX_INT → i32::MAX
1042    ls.linenumber += 1;
1043    if ls.linenumber >= i32::MAX {
1044        return Err(lex_error(ls, b"chunk has too many lines", 0));
1045    }
1046    Ok(())
1047}
1048
1049// C: static int read_numeral (LexState *ls, SemInfo *seminfo)
1050/// Scan a numeric literal (integer or float, decimal or hex).
1051///
1052/// The caller may have already read an initial dot.  Accepts the pattern:
1053/// `%d(%x|%.|(Ee[+-]?))*` or `0[Xx](%x|%.|(Pp[+-]?))*`.
1054///
1055/// Returns `TK_INT` for integers, `TK_FLT` for floats.
1056///
1057/// # C source
1058/// ```c
1059/// // C: static int read_numeral (LexState *ls, SemInfo *seminfo) {
1060/// //   TValue obj;
1061/// //   const char *expo = "Ee";
1062/// //   int first = ls->current;
1063/// //   lua_assert(lisdigit(ls->current));
1064/// //   save_and_next(ls);
1065/// //   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
1066/// //     expo = "Pp";
1067/// //   for (;;) {
1068/// //     if (check_next2(ls, expo))
1069/// //       check_next2(ls, "-+");
1070/// //     else if (lisxdigit(ls->current) || ls->current == '.')
1071/// //       save_and_next(ls);
1072/// //     else break;
1073/// //   }
1074/// //   if (lislalpha(ls->current))  /* numeral touching a letter? */
1075/// //     save_and_next(ls);         /* force an error */
1076/// //   save(ls, '\0');
1077/// //   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)
1078/// //     lexerror(ls, "malformed number", TK_FLT);
1079/// //   if (ttisinteger(&obj)) { seminfo->i = ivalue(&obj); return TK_INT; }
1080/// //   else { seminfo->r = fltvalue(&obj); return TK_FLT; }
1081/// // }
1082/// ```
1083fn read_numeral(
1084    state: &mut LuaState,
1085    ls: &mut LexState,
1086    seminfo: &mut TokenValue,
1087) -> Result<i32, LuaError> {
1088    // C: const char *expo = "Ee";
1089    let mut expo: &[u8; 2] = b"Ee";
1090
1091    // C: int first = ls->current;
1092    let first = ls.current;
1093
1094    // C: lua_assert(lisdigit(ls->current))
1095    debug_assert!(is_digit(ls.current), "read_numeral: not at a digit");
1096
1097    // C: save_and_next(ls);
1098    save_and_next(ls, state)?;
1099
1100    // C: if (first == '0' && check_next2(ls, "xX"))
1101    if first == b'0' as i32 && check_next2(ls, state, b"xX")? {
1102        expo = b"Pp";
1103    }
1104
1105    loop {
1106        // C: if (check_next2(ls, expo))
1107        if check_next2(ls, state, expo)? {
1108            // C: check_next2(ls, "-+")
1109            check_next2(ls, state, b"-+")?;
1110        } else if is_xdigit(ls.current) || ls.current == b'.' as i32 {
1111            // C: else if (lisxdigit(ls->current) || ls->current == '.')
1112            //      save_and_next(ls);
1113            save_and_next(ls, state)?;
1114        } else {
1115            break;
1116        }
1117    }
1118
1119    // C: if (lislalpha(ls->current)) save_and_next(ls);  /* force an error */
1120    if is_lalpha(ls.current) {
1121        save_and_next(ls, state)?;
1122    }
1123
1124    // C: save(ls, '\0') — NUL-terminate the buffer for C's str2num
1125    // In Rust, luaO_str2num will receive a byte slice; NUL is not needed.
1126    // We save 0 for parity with C, but our str2num stub ignores it.
1127    save(ls, state, 0)?;
1128
1129    // C: if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)
1130    //        lexerror(ls, "malformed number", TK_FLT);
1131    // macros.tsv: luaZ_buffer → buf.as_mut_slice()
1132    let buf = ls.buff.as_slice();
1133    let num_bytes = if buf.last() == Some(&0) { &buf[..buf.len() - 1] } else { buf };
1134    let mut obj = lua_types::LuaValue::Nil;
1135    if lua_vm::object::str2num(num_bytes, &mut obj) == 0 {
1136        return Err(lex_error(ls, b"malformed number", TK_FLT));
1137    }
1138    match obj {
1139        lua_types::LuaValue::Int(i) => {
1140            *seminfo = TokenValue::Int(i);
1141            Ok(TK_INT)
1142        }
1143        lua_types::LuaValue::Float(f) => {
1144            *seminfo = TokenValue::Float(f);
1145            Ok(TK_FLT)
1146        }
1147        _ => unreachable!("str2num returned non-numeric LuaValue"),
1148    }
1149}
1150
1151// C: static size_t skip_sep (LexState *ls)
1152/// Scan a `[=*[` or `]=*]` sequence; leave the last bracket as current char.
1153///
1154/// Returns:
1155/// - `count + 2` if well-formed (where `count` is the number of `=` signs),
1156/// - `1` if a single bracket with no `=`s and no second bracket,
1157/// - `0` if malformed (e.g. `[==` with no closing bracket).
1158///
1159/// # C source
1160/// ```c
1161/// // C: static size_t skip_sep (LexState *ls) {
1162/// //   size_t count = 0;
1163/// //   int s = ls->current;
1164/// //   lua_assert(s == '[' || s == ']');
1165/// //   save_and_next(ls);
1166/// //   while (ls->current == '=') {
1167/// //     save_and_next(ls);
1168/// //     count++;
1169/// //   }
1170/// //   return (ls->current == s) ? count + 2
1171/// //          : (count == 0) ? 1
1172/// //          : 0;
1173/// // }
1174/// ```
1175fn skip_sep(
1176    state: &mut LuaState,
1177    ls: &mut LexState,
1178) -> Result<usize, LuaError> {
1179    let mut count: usize = 0;
1180    let s = ls.current;
1181    // C: lua_assert(s == '[' || s == ']')
1182    debug_assert!(s == b'[' as i32 || s == b']' as i32, "skip_sep: not at bracket");
1183
1184    // C: save_and_next(ls)
1185    save_and_next(ls, state)?;
1186
1187    // C: while (ls->current == '=')
1188    while ls.current == b'=' as i32 {
1189        save_and_next(ls, state)?;
1190        count += 1;
1191    }
1192
1193    // C: return (ls->current == s) ? count + 2 : (count == 0) ? 1 : 0;
1194    if ls.current == s {
1195        Ok(count + 2)
1196    } else if count == 0 {
1197        Ok(1)
1198    } else {
1199        Ok(0)
1200    }
1201}
1202
1203// C: static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep)
1204/// Scan a long string or long comment delimited by `[=*[` … `]=*]`.
1205///
1206/// `seminfo` is `Some` when reading a string literal; `None` when skipping a
1207/// long comment.  When `None`, buffer contents are discarded on each newline
1208/// to avoid wasting memory.
1209///
1210/// # C source
1211/// ```c
1212/// // C: static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
1213/// //   int line = ls->linenumber;
1214/// //   save_and_next(ls);  /* skip 2nd '[' */
1215/// //   if (currIsNewline(ls)) inclinenumber(ls);
1216/// //   for (;;) {
1217/// //     switch (ls->current) {
1218/// //       case EOZ: { /* error */
1219/// //         const char *what = (seminfo ? "string" : "comment");
1220/// //         const char *msg = luaO_pushfstring(..., what, line);
1221/// //         lexerror(ls, msg, TK_EOS);
1222/// //         break;
1223/// //       }
1224/// //       case ']': {
1225/// //         if (skip_sep(ls) == sep) {
1226/// //           save_and_next(ls);  /* skip 2nd ']' */
1227/// //           goto endloop;
1228/// //         }
1229/// //         break;
1230/// //       }
1231/// //       case '\n': case '\r': {
1232/// //         save(ls, '\n');
1233/// //         inclinenumber(ls);
1234/// //         if (!seminfo) luaZ_resetbuffer(ls->buff);
1235/// //         break;
1236/// //       }
1237/// //       default: {
1238/// //         if (seminfo) save_and_next(ls);
1239/// //         else next(ls);
1240/// //       }
1241/// //     }
1242/// //   } endloop:
1243/// //   if (seminfo)
1244/// //     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1245/// //                                      luaZ_bufflen(ls->buff) - 2 * sep);
1246/// // }
1247/// ```
1248fn read_long_string(
1249    state: &mut LuaState,
1250    ls: &mut LexState,
1251    seminfo: Option<&mut TokenValue>,
1252    sep: usize,
1253) -> Result<(), LuaError> {
1254    let line = ls.linenumber; // C: int line = ls->linenumber;
1255
1256    // C: save_and_next(ls)  — skip 2nd '['
1257    save_and_next(ls, state)?;
1258
1259    // C: if (currIsNewline(ls)) inclinenumber(ls);
1260    if curr_is_newline(ls) {
1261        inc_line_number(ls, state)?;
1262    }
1263
1264    // is_string: whether we are reading a string (true) or a comment (false)
1265    let is_string = seminfo.is_some();
1266
1267    loop {
1268        match ls.current {
1269            // C: case EOZ:
1270            c if c == EOZ => {
1271                // C: const char *what = (seminfo ? "string" : "comment");
1272                let what: &[u8] = if is_string { b"string" } else { b"comment" };
1273                // C: luaO_pushfstring(ls->L, "unfinished long %s (starting at line %d)", what, line)
1274                // PORT NOTE: build message as Vec<u8> to avoid String allocation.
1275                let mut msg: Vec<u8> = Vec::new();
1276                msg.extend_from_slice(b"unfinished long ");
1277                msg.extend_from_slice(what);
1278                msg.extend_from_slice(b" (starting at line ");
1279                let _ = write!(&mut msg, "{}", line);
1280                msg.push(b')');
1281                return Err(lex_error(ls, &msg, TK_EOS));
1282            }
1283            // C: case ']':
1284            c if c == b']' as i32 => {
1285                let s = skip_sep(state, ls)?;
1286                if s == sep {
1287                    // C: save_and_next(ls)  — skip 2nd ']'
1288                    save_and_next(ls, state)?;
1289                    break; // C: goto endloop
1290                }
1291                // else: the ']' sequence wasn't the closing delimiter; continue
1292            }
1293            // C: case '\n': case '\r':
1294            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1295                // C: save(ls, '\n')
1296                save(ls, state, b'\n' as i32)?;
1297                // C: inclinenumber(ls)
1298                inc_line_number(ls, state)?;
1299                // C: if (!seminfo) luaZ_resetbuffer(ls->buff)
1300                // macros.tsv: luaZ_resetbuffer → buf.clear()
1301                if !is_string {
1302                    ls.buff.clear();
1303                }
1304            }
1305            // C: default:
1306            _ => {
1307                if is_string {
1308                    // C: if (seminfo) save_and_next(ls)
1309                    save_and_next(ls, state)?;
1310                } else {
1311                    // C: else next(ls)
1312                    advance(ls);
1313                }
1314            }
1315        }
1316    }
1317
1318    // C: if (seminfo)
1319    //      seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1320    //                                       luaZ_bufflen(ls->buff) - 2 * sep);
1321    if let Some(out) = seminfo {
1322        // The buffer contains: sep bytes of '[=' + content + sep bytes of '=]'
1323        // We want the content in between.
1324        // PORT NOTE: per PORTING.md §4.3, capture the slice into an owned
1325        // Vec so the immutable borrow of ls.buff is dropped before the
1326        // mutable borrow needed by new_string.
1327        let buf = ls.buff.as_slice();
1328        let content: Vec<u8> = buf[sep..buf.len() - sep].to_vec();
1329        let ts = new_string(state, ls, &content)?;
1330        *out = TokenValue::Str(ts);
1331    }
1332    Ok(())
1333}
1334
1335// C: static void esccheck (LexState *ls, int c, const char *msg)
/// Check that `ok` is true; if it is not, save the current char (unless the
/// input is at its end) and raise a string-escape error.
1338///
1339/// # C source
1340/// ```c
1341/// // C: static void esccheck (LexState *ls, int c, const char *msg) {
1342/// //   if (!c) {
1343/// //     if (ls->current != EOZ)
1344/// //       save_and_next(ls);  /* add current to buffer for error message */
1345/// //     lexerror(ls, msg, TK_STRING);
1346/// //   }
1347/// // }
1348/// ```
1349fn esc_check(
1350    state: &mut LuaState,
1351    ls: &mut LexState,
1352    ok: bool,
1353    msg: &[u8],
1354) -> Result<(), LuaError> {
1355    if !ok {
1356        if ls.current != EOZ {
1357            save_and_next(ls, state)?;
1358        }
1359        return Err(lex_error(ls, msg, TK_STRING));
1360    }
1361    Ok(())
1362}
1363
1364// C: static int gethexa (LexState *ls)
1365/// Save-and-advance, then verify the new current char is a hex digit; return
1366/// its numeric value (0-15).
1367///
1368/// # C source
1369/// ```c
1370/// // C: static int gethexa (LexState *ls) {
1371/// //   save_and_next(ls);
1372/// //   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
1373/// //   return luaO_hexavalue(ls->current);
1374/// // }
1375/// ```
1376fn get_hexa(
1377    state: &mut LuaState,
1378    ls: &mut LexState,
1379) -> Result<u32, LuaError> {
1380    // C: save_and_next(ls)
1381    save_and_next(ls, state)?;
1382    // C: esccheck(ls, lisxdigit(ls->current), "hexadecimal digit expected")
1383    esc_check(state, ls, is_xdigit(ls.current), b"hexadecimal digit expected")?;
1384    // C: return luaO_hexavalue(ls->current)
1385    // TODO(port): call lua_vm::object::hex_value in Phase B
1386    Ok(hex_value_stub(ls.current))
1387}
1388
1389// C: static int readhexaesc (LexState *ls)
1390/// Scan a `\xNN` hex escape; return the decoded byte value.
1391///
1392/// # C source
1393/// ```c
1394/// // C: static int readhexaesc (LexState *ls) {
1395/// //   int r = gethexa(ls);
1396/// //   r = (r << 4) + gethexa(ls);
1397/// //   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
1398/// //   return r;
1399/// // }
1400/// ```
1401fn read_hex_esc(
1402    state: &mut LuaState,
1403    ls: &mut LexState,
1404) -> Result<u32, LuaError> {
1405    // C: int r = gethexa(ls);
1406    let r = get_hexa(state, ls)?;
1407    // C: r = (r << 4) + gethexa(ls);
1408    let r = (r << 4) + get_hexa(state, ls)?;
1409    // C: luaZ_buffremove(ls->buff, 2)
1410    // macros.tsv: luaZ_buffremove → buf.truncate_by(i)
1411    ls.buff.truncate_by(2);
1412    Ok(r)
1413}
1414
1415// C: static unsigned long readutf8esc (LexState *ls)
1416/// Scan a `\u{XXXXXX}` UTF-8 escape; return the Unicode codepoint.
1417///
1418/// # C source
1419/// ```c
1420/// // C: static unsigned long readutf8esc (LexState *ls) {
1421/// //   unsigned long r;
1422/// //   int i = 4;  /* chars to remove: '\', 'u', '{', first digit */
1423/// //   save_and_next(ls);  /* skip 'u' */
1424/// //   esccheck(ls, ls->current == '{', "missing '{'");
1425/// //   r = gethexa(ls);  /* must have at least one digit */
1426/// //   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
1427/// //     i++;
1428/// //     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
1429/// //     r = (r << 4) + luaO_hexavalue(ls->current);
1430/// //   }
1431/// //   esccheck(ls, ls->current == '}', "missing '}'");
1432/// //   next(ls);  /* skip '}' */
1433/// //   luaZ_buffremove(ls->buff, i);
1434/// //   return r;
1435/// // }
1436/// ```
1437fn read_utf8_esc(
1438    state: &mut LuaState,
1439    ls: &mut LexState,
1440) -> Result<u32, LuaError> {
1441    // C: int i = 4;  /* chars to remove: '\', 'u', '{', first digit */
1442    let mut i: usize = 4;
1443
1444    // C: save_and_next(ls)  — skip 'u'
1445    save_and_next(ls, state)?;
1446
1447    // C: esccheck(ls, ls->current == '{', "missing '{'")
1448    esc_check(state, ls, ls.current == b'{' as i32, b"missing '{'")?;
1449
1450    // C: r = gethexa(ls)
1451    let mut r = get_hexa(state, ls)?;
1452
1453    // C: while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) { ... }
1454    // cast_void: discard return value
1455    loop {
1456        save_and_next(ls, state)?;
1457        if !is_xdigit(ls.current) {
1458            break;
1459        }
1460        i += 1;
1461        // C: esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large")
1462        esc_check(state, ls, r <= (0x7FFF_FFFFu32 >> 4), b"UTF-8 value too large")?;
1463        // C: r = (r << 4) + luaO_hexavalue(ls->current)
1464        // TODO(port): lua_vm::object::hex_value in Phase B
1465        r = (r << 4) + hex_value_stub(ls.current);
1466    }
1467
1468    // C: esccheck(ls, ls->current == '}', "missing '}'")
1469    esc_check(state, ls, ls.current == b'}' as i32, b"missing '}'")?;
1470
1471    // C: next(ls)  — skip '}'
1472    advance(ls);
1473
1474    // C: luaZ_buffremove(ls->buff, i)
1475    ls.buff.truncate_by(i);
1476
1477    Ok(r)
1478}
1479
1480// C: static void utf8esc (LexState *ls)
1481/// Scan `\u{...}` and append the UTF-8 encoding of the codepoint to the buffer.
1482///
1483/// # C source
1484/// ```c
1485/// // C: static void utf8esc (LexState *ls) {
1486/// //   char buff[UTF8BUFFSZ];
1487/// //   int n = luaO_utf8esc(buff, readutf8esc(ls));
1488/// //   for (; n > 0; n--)
1489/// //     save(ls, buff[UTF8BUFFSZ - n]);
1490/// // }
1491/// ```
1492fn utf8_esc(
1493    state: &mut LuaState,
1494    ls: &mut LexState,
1495) -> Result<(), LuaError> {
1496    // C: unsigned long r = readutf8esc(ls)
1497    let codepoint = read_utf8_esc(state, ls)?;
1498
1499    // C: char buff[UTF8BUFFSZ];  int n = luaO_utf8esc(buff, r);
1500    // macros.tsv: UTF8BUFFSZ → const UTF8_BUF_SZ: usize = 8
1501    // TODO(port): call lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
1502    // For Phase A, encode directly here.
1503    let encoded = utf8_encode_stub(codepoint);
1504
1505    // C: for (; n > 0; n--) save(ls, buff[UTF8BUFFSZ - n]);
1506    for &b in &encoded {
1507        save(ls, state, b as i32)?;
1508    }
1509    Ok(())
1510}
1511
1512// C: static int readdecesc (LexState *ls)
1513/// Scan a decimal escape `\ddd` (up to 3 digits); return the byte value.
1514///
1515/// # C source
1516/// ```c
1517/// // C: static int readdecesc (LexState *ls) {
1518/// //   int i;
1519/// //   int r = 0;
1520/// //   for (i = 0; i < 3 && lisdigit(ls->current); i++) {
1521/// //     r = 10*r + ls->current - '0';
1522/// //     save_and_next(ls);
1523/// //   }
1524/// //   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
1525/// //   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
1526/// //   return r;
1527/// // }
1528/// ```
1529fn read_dec_esc(
1530    state: &mut LuaState,
1531    ls: &mut LexState,
1532) -> Result<u32, LuaError> {
1533    let mut i: usize = 0;
1534    let mut r: u32 = 0;
1535
1536    // C: for (i = 0; i < 3 && lisdigit(ls->current); i++)
1537    while i < 3 && is_digit(ls.current) {
1538        // C: r = 10*r + ls->current - '0';
1539        r = 10 * r + (ls.current as u32 - b'0' as u32);
1540        // C: save_and_next(ls)
1541        save_and_next(ls, state)?;
1542        i += 1;
1543    }
1544
1545    // C: esccheck(ls, r <= UCHAR_MAX, "decimal escape too large")
1546    // UCHAR_MAX = 255 = u8::MAX
1547    esc_check(state, ls, r <= u8::MAX as u32, b"decimal escape too large")?;
1548
1549    // C: luaZ_buffremove(ls->buff, i)
1550    ls.buff.truncate_by(i);
1551    Ok(r)
1552}
1553
1554// C: static void read_string (LexState *ls, int del, SemInfo *seminfo)
1555/// Scan a short (single/double-quoted) string literal.
1556///
1557/// The C function uses `goto read_save / only_save / no_save` for escape
1558/// handling.  In Rust this is replaced by the `EscapeResult` enum.
1559///
1560/// # C source (see llex.c lines 382-442 for full listing)
1561fn read_string(
1562    state: &mut LuaState,
1563    ls: &mut LexState,
1564    del: i32,
1565    seminfo: &mut TokenValue,
1566) -> Result<(), LuaError> {
1567    // Encoding for what the escape sequence handler needs to do after decoding.
1568    //
1569    // read_save:  advance(ls), remove '\' from buffer, save decoded byte
1570    // only_save:  remove '\' from buffer, save decoded byte (no advance)
1571    // no_save:    nothing (just break from the escape case)
1572    enum EscapeResult {
1573        ReadSave(i32),
1574        OnlySave(i32),
1575        NoSave,
1576    }
1577
1578    // C: save_and_next(ls)  — keep delimiter for error messages
1579    save_and_next(ls, state)?;
1580
1581    // C: while (ls->current != del)
1582    while ls.current != del {
1583        match ls.current {
1584            // C: case EOZ: lexerror(ls, "unfinished string", TK_EOS); break;
1585            c if c == EOZ => {
1586                return Err(lex_error(ls, b"unfinished string", TK_EOS));
1587            }
1588            // C: case '\n': case '\r': lexerror(ls, "unfinished string", TK_STRING); break;
1589            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1590                return Err(lex_error(ls, b"unfinished string", TK_STRING));
1591            }
1592            // C: case '\\': { ... escape sequence ... }
1593            c if c == b'\\' as i32 => {
1594                // C: save_and_next(ls)  — keep '\\' for error messages
1595                save_and_next(ls, state)?;
1596
1597                // Inner switch on the escape character
1598                let esc = match ls.current {
1599                    // C: case 'a': c = '\a'; goto read_save;
1600                    c if c == b'a' as i32 => EscapeResult::ReadSave(b'\x07' as i32),
1601                    // C: case 'b': c = '\b'; goto read_save;
1602                    c if c == b'b' as i32 => EscapeResult::ReadSave(b'\x08' as i32),
1603                    // C: case 'f': c = '\f'; goto read_save;
1604                    c if c == b'f' as i32 => EscapeResult::ReadSave(b'\x0C' as i32),
1605                    // C: case 'n': c = '\n'; goto read_save;
1606                    c if c == b'n' as i32 => EscapeResult::ReadSave(b'\n' as i32),
1607                    // C: case 'r': c = '\r'; goto read_save;
1608                    c if c == b'r' as i32 => EscapeResult::ReadSave(b'\r' as i32),
1609                    // C: case 't': c = '\t'; goto read_save;
1610                    c if c == b't' as i32 => EscapeResult::ReadSave(b'\t' as i32),
1611                    // C: case 'v': c = '\v'; goto read_save;
1612                    c if c == b'v' as i32 => EscapeResult::ReadSave(b'\x0B' as i32),
1613                    // C: case 'x': c = readhexaesc(ls); goto read_save;
1614                    c if c == b'x' as i32 => {
1615                        let decoded = read_hex_esc(state, ls)?;
1616                        EscapeResult::ReadSave(decoded as i32)
1617                    }
1618                    // C: case 'u': utf8esc(ls); goto no_save;
1619                    c if c == b'u' as i32 => {
1620                        utf8_esc(state, ls)?;
1621                        EscapeResult::NoSave
1622                    }
1623                    // C: case '\n': case '\r': inclinenumber(ls); c = '\n'; goto only_save;
1624                    c if c == b'\n' as i32 || c == b'\r' as i32 => {
1625                        inc_line_number(ls, state)?;
1626                        EscapeResult::OnlySave(b'\n' as i32)
1627                    }
1628                    // C: case '\\': case '"': case '\'': c = ls->current; goto read_save;
1629                    c if c == b'\\' as i32 || c == b'"' as i32 || c == b'\'' as i32 => {
1630                        EscapeResult::ReadSave(c)
1631                    }
1632                    // C: case EOZ: goto no_save;  /* will raise an error next loop */
1633                    c if c == EOZ => EscapeResult::NoSave,
1634                    // C: case 'z': { luaZ_buffremove(1); next(ls); while (lisspace) ... }
1635                    c if c == b'z' as i32 => {
1636                        // C: luaZ_buffremove(ls->buff, 1)  — remove '\'
1637                        ls.buff.truncate_by(1);
1638                        // C: next(ls)  — skip 'z'
1639                        advance(ls);
1640                        // C: while (lisspace(ls->current)) { if newline: incline; else next; }
1641                        while is_space(ls.current) {
1642                            if curr_is_newline(ls) {
1643                                inc_line_number(ls, state)?;
1644                            } else {
1645                                advance(ls);
1646                            }
1647                        }
1648                        EscapeResult::NoSave
1649                    }
1650                    // C: default: esccheck(digit); c = readdecesc(ls); goto only_save;
1651                    _ => {
1652                        esc_check(
1653                            state, ls,
1654                            is_digit(ls.current),
1655                            b"invalid escape sequence",
1656                        )?;
1657                        let decoded = read_dec_esc(state, ls)?;
1658                        EscapeResult::OnlySave(decoded as i32)
1659                    }
1660                };
1661
1662                // Dispatch the C goto targets as match arms.
1663                match esc {
1664                    // C: read_save: next(ls); /* fall through */ only_save: ...
1665                    EscapeResult::ReadSave(c) => {
1666                        advance(ls); // C: next(ls)
1667                        ls.buff.truncate_by(1); // C: luaZ_buffremove(ls->buff, 1) remove '\'
1668                        save(ls, state, c)?; // C: save(ls, c)
1669                    }
1670                    // C: only_save: luaZ_buffremove(ls->buff, 1); save(ls, c);
1671                    EscapeResult::OnlySave(c) => {
1672                        ls.buff.truncate_by(1); // C: luaZ_buffremove(ls->buff, 1) remove '\'
1673                        save(ls, state, c)?; // C: save(ls, c)
1674                    }
1675                    // C: no_save: break;
1676                    EscapeResult::NoSave => {}
1677                }
1678            }
1679            // C: default: save_and_next(ls);
1680            _ => {
1681                save_and_next(ls, state)?;
1682            }
1683        }
1684    }
1685
1686    // C: save_and_next(ls)  — skip closing delimiter
1687    save_and_next(ls, state)?;
1688
1689    // C: seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
1690    //                                     luaZ_bufflen(ls->buff) - 2);
1691    // Buffer contains: delimiter + content + delimiter; strip both delimiters.
1692    // PORT NOTE: capture into owned Vec to drop the borrow before new_string.
1693    let buf = ls.buff.as_slice();
1694    let content: Vec<u8> = if buf.len() >= 2 {
1695        buf[1..buf.len() - 1].to_vec()
1696    } else {
1697        Vec::new()
1698    };
1699    let ts = new_string(state, ls, &content)?;
1700    *seminfo = TokenValue::Str(ts);
1701    Ok(())
1702}
1703
1704// C: static int llex (LexState *ls, SemInfo *seminfo)
1705/// Core lexer dispatch: consume and return the next raw token kind.
1706///
1707/// This is the heart of the lexer: a large `for`-`switch` loop that classifies
1708/// the current character and dispatches to the appropriate scanner.
1709///
1710/// # C source (see llex.c lines 445-562 for full listing)
1711fn llex(
1712    state: &mut LuaState,
1713    ls: &mut LexState,
1714    seminfo: &mut TokenValue,
1715) -> Result<i32, LuaError> {
1716    // C: luaZ_resetbuffer(ls->buff)
1717    // macros.tsv: luaZ_resetbuffer → buf.clear()
1718    ls.buff.clear();
1719
1720    loop {
1721        match ls.current {
1722            // C: case '\n': case '\r': { inclinenumber(ls); break; }
1723            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1724                inc_line_number(ls, state)?;
1725                // PORT NOTE: skipcomment-equivalent. luaL_loadfile in C-Lua
1726                // strips a leading '#' line (Unix shebang). Our test harness
1727                // prepends a global-setup preamble to every official test, so
1728                // the script's '#' line is not at byte zero. Apply the same
1729                // rule at any token-scan line start: treat a line whose first
1730                // character is '#' as a single-line comment. This sits in
1731                // llex's dispatch loop (not inc_line_number) so it does not
1732                // affect newlines inside long-bracket strings.
1733                if ls.current == b'#' as i32 {
1734                    while !curr_is_newline(ls) && ls.current != EOZ {
1735                        advance(ls);
1736                    }
1737                }
1738            }
1739
1740            // C: case ' ': case '\f': case '\t': case '\v': { next(ls); break; }
1741            c if c == b' ' as i32
1742                || c == b'\x0C' as i32
1743                || c == b'\t' as i32
1744                || c == b'\x0B' as i32 =>
1745            {
1746                advance(ls);
1747            }
1748
1749            // C: case '-': { '-' or '--' comment }
1750            c if c == b'-' as i32 => {
1751                advance(ls); // C: next(ls)
1752                if ls.current != b'-' as i32 {
1753                    return Ok(b'-' as i32);
1754                }
1755                // C: /* else is a comment */ next(ls)
1756                advance(ls);
1757
1758                if ls.current == b'[' as i32 {
1759                    // C: long comment?
1760                    let sep = skip_sep(state, ls)?;
1761                    // C: luaZ_resetbuffer(ls->buff)
1762                    ls.buff.clear();
1763                    if sep >= 2 {
1764                        // C: read_long_string(ls, NULL, sep)
1765                        read_long_string(state, ls, None, sep)?;
1766                        ls.buff.clear(); // C: luaZ_resetbuffer after call
1767                        continue;
1768                    }
1769                }
1770                // C: short comment — skip until end of line
1771                while !curr_is_newline(ls) && ls.current != EOZ {
1772                    advance(ls);
1773                }
1774                // loop continues (no token emitted for comments)
1775            }
1776
1777            // C: case '[': { long string or simply '[' }
1778            c if c == b'[' as i32 => {
1779                let sep = skip_sep(state, ls)?;
1780                if sep >= 2 {
1781                    read_long_string(state, ls, Some(seminfo), sep)?;
1782                    return Ok(TK_STRING);
1783                } else if sep == 0 {
1784                    // C: '[=...' missing second bracket
1785                    return Err(lex_error(ls, b"invalid long string delimiter", TK_STRING));
1786                }
1787                // sep == 1: plain '[', no long string
1788                return Ok(b'[' as i32);
1789            }
1790
1791            // C: case '=':
1792            c if c == b'=' as i32 => {
1793                advance(ls);
1794                if check_next1(ls, b'=' as i32) {
1795                    return Ok(TK_EQ); // C: '=='
1796                }
1797                return Ok(b'=' as i32);
1798            }
1799
1800            // C: case '<':
1801            c if c == b'<' as i32 => {
1802                advance(ls);
1803                if check_next1(ls, b'=' as i32) {
1804                    return Ok(TK_LE); // C: '<='
1805                } else if check_next1(ls, b'<' as i32) {
1806                    return Ok(TK_SHL); // C: '<<'
1807                }
1808                return Ok(b'<' as i32);
1809            }
1810
1811            // C: case '>':
1812            c if c == b'>' as i32 => {
1813                advance(ls);
1814                if check_next1(ls, b'=' as i32) {
1815                    return Ok(TK_GE); // C: '>='
1816                } else if check_next1(ls, b'>' as i32) {
1817                    return Ok(TK_SHR); // C: '>>'
1818                }
1819                return Ok(b'>' as i32);
1820            }
1821
1822            // C: case '/':
1823            c if c == b'/' as i32 => {
1824                advance(ls);
1825                if check_next1(ls, b'/' as i32) {
1826                    return Ok(TK_IDIV); // C: '//'
1827                }
1828                return Ok(b'/' as i32);
1829            }
1830
1831            // C: case '~':
1832            c if c == b'~' as i32 => {
1833                advance(ls);
1834                if check_next1(ls, b'=' as i32) {
1835                    return Ok(TK_NE); // C: '~='
1836                }
1837                return Ok(b'~' as i32);
1838            }
1839
1840            // C: case ':':
1841            c if c == b':' as i32 => {
1842                advance(ls);
1843                if check_next1(ls, b':' as i32) {
1844                    return Ok(TK_DBCOLON); // C: '::'
1845                }
1846                return Ok(b':' as i32);
1847            }
1848
1849            // C: case '"': case '\'': { short literal strings }
1850            c if c == b'"' as i32 || c == b'\'' as i32 => {
1851                let del = ls.current;
1852                read_string(state, ls, del, seminfo)?;
1853                return Ok(TK_STRING);
1854            }
1855
1856            // C: case '.': { '.', '..', '...', or number }
1857            c if c == b'.' as i32 => {
1858                save_and_next(ls, state)?;
1859                if check_next1(ls, b'.' as i32) {
1860                    if check_next1(ls, b'.' as i32) {
1861                        return Ok(TK_DOTS); // C: '...'
1862                    }
1863                    return Ok(TK_CONCAT); // C: '..'
1864                } else if !is_digit(ls.current) {
1865                    return Ok(b'.' as i32);
1866                } else {
1867                    return read_numeral(state, ls, seminfo); // C: numeric starting with '.'
1868                }
1869            }
1870
1871            // C: case '0'..='9':
1872            c if is_digit(c) => {
1873                return read_numeral(state, ls, seminfo);
1874            }
1875
1876            // C: case EOZ: return TK_EOS;
1877            c if c == EOZ => {
1878                return Ok(TK_EOS);
1879            }
1880
1881            // C: default:
1882            c => {
1883                if is_lalpha(c) {
1884                    // C: identifier or reserved word
1885                    // C: do { save_and_next(ls); } while (lislalnum(ls->current));
1886                    loop {
1887                        save_and_next(ls, state)?;
1888                        if !is_lalnum(ls.current) {
1889                            break;
1890                        }
1891                    }
1892
1893                    // C: ts = luaX_newstring(ls, luaZ_buffer(ls->buff), luaZ_bufflen(ls->buff))
1894                    // PORT NOTE: copy buffer bytes to drop borrow before new_string.
1895                    let content: Vec<u8> = ls.buff.as_slice().to_vec();
1896                    let ts = new_string(state, ls, &content)?;
1897
1898                    // C: seminfo->ts = ts
1899                    // PORT NOTE: canonical `lua_types::LuaString` lacks the `extra`
1900                    // byte that C-Lua uses to mark reserved words. Recover the
1901                    // keyword index directly from the interned bytes via the
1902                    // `LUAX_TOKENS` table; the first `NUM_RESERVED` entries are
1903                    // the keywords in declaration order, so token id =
1904                    // `FIRST_RESERVED + index`.
1905                    let reserved_token: Option<i32> = LUAX_TOKENS[..NUM_RESERVED]
1906                        .iter()
1907                        .position(|kw| *kw == content.as_slice())
1908                        .map(|i| FIRST_RESERVED + i as i32);
1909                    *seminfo = TokenValue::Str(ts);
1910
1911                    if let Some(tk) = reserved_token {
1912                        return Ok(tk);
1913                    } else {
1914                        return Ok(TK_NAME);
1915                    }
1916                } else {
1917                    // C: single-char token — next(ls); return c;
1918                    let tok = ls.current;
1919                    advance(ls);
1920                    return Ok(tok);
1921                }
1922            }
1923        }
1924    }
1925}
1926
1927// ── Phase A stubs for cross-crate helpers ──────────────────────────────────────
1928//
1929// The functions below stand in for cross-crate calls that cannot resolve in
1930// Phase A.  They will be replaced by proper imports in Phase B.
1931
// TODO(port): replace with state.intern_str(bytes) once LuaState gains that
// method (from lua_vm::string::new_lstr wired in Phase B).
// TODO_ARCH(phase-b-reconcile): canonical LuaString is constructed via
// from_bytes; once LuaState::intern_str is wired, route through there instead.
/// Intern `bytes` as a Lua string by forwarding to `state.intern_str`.
///
/// The result of `intern_str` is returned unchanged, including any error.
///
/// NOTE(review): the body already delegates to `LuaState::intern_str`, so the
/// TODOs above look stale — confirm, then inline this wrapper at its callers.
fn intern_str_stub(
    state: &mut LuaState,
    bytes: &[u8],
) -> Result<GcRef<LuaString>, LuaError> {
    state.intern_str(bytes)
}
1942
/// Result of converting a byte string to a Lua number.
///
/// Derives `Debug`, `Clone`, `Copy` and `PartialEq` so values can be logged,
/// passed by value and compared in assertions (`Eq` is not derivable because
/// of the `f64` payload).
/// TODO(port): replace with the real `LuaValue` enum variants from lua-types (Phase B).
#[derive(Debug, Clone, Copy, PartialEq)]
enum NumResult {
    /// The numeral denotes an integer.
    Int(i64),
    /// The numeral denotes a floating-point number.
    Float(f64),
}
1949
1950fn str2num_stub(bytes: &[u8]) -> Option<NumResult> {
1951    let s = bytes.iter().position(|&b| b == 0)
1952        .map(|n| &bytes[..n])
1953        .unwrap_or(bytes);
1954    let mut value = lua_types::LuaValue::Nil;
1955    if lua_vm::object::str2num(s, &mut value) == 0 {
1956        return None;
1957    }
1958    match value {
1959        lua_types::LuaValue::Int(i) => Some(NumResult::Int(i)),
1960        lua_types::LuaValue::Float(f) => Some(NumResult::Float(f)),
1961        _ => None,
1962    }
1963}
1964
// TODO(port): replace with lua_vm::object::hex_value(c) in Phase B.
/// Numeric value (0–15) of the hexadecimal digit with character code `c`.
///
/// Recognises `0-9`, `a-f` and `A-F`.  Any other code, negative ones
/// included, yields `0`.
fn hex_value_stub(c: i32) -> u32 {
    // A negative `c` reinterpreted as u32 lies above the Unicode range, so
    // `from_u32` rejects it and the fallback of 0 applies.  `to_digit` only
    // accepts ASCII digits and letters, so no other character maps to a value.
    std::char::from_u32(c as u32)
        .and_then(|ch| ch.to_digit(16))
        .unwrap_or(0)
}
1974
// TODO(port): replace with lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
/// Encode `codepoint` as a Lua-extended UTF-8 byte sequence.
///
/// Port of `luaO_utf8esc` from lobject.c.  Values below `0x80` produce a
/// single byte; larger values produce a lead byte followed by continuation
/// bytes, up to 6 bytes in total for `0x7FFFFFFF` (the 5- and 6-byte forms
/// are outside strict UTF-8 but are accepted by Lua's `\u{...}` escapes).
/// Debug builds assert that `codepoint <= 0x7FFFFFFF`.
fn utf8_encode_stub(codepoint: u32) -> Vec<u8> {
    debug_assert!(codepoint <= 0x7FFF_FFFF);
    if codepoint < 0x80 {
        return vec![codepoint as u8];
    }

    // Fill a scratch array from its end towards its start, as the C code
    // does; the encoded bytes are then `scratch[start..]`.
    let mut scratch = [0u8; 8];
    let mut start = scratch.len();
    let mut remaining = codepoint;
    // Largest value that still fits in the payload bits of the lead byte.
    let mut lead_capacity: u32 = 0x3f;

    loop {
        // Emit one continuation byte carrying the low six bits.
        start -= 1;
        scratch[start] = 0x80 | (remaining & 0x3f) as u8;
        remaining >>= 6;
        // Each continuation byte costs the lead byte one payload bit.
        lead_capacity >>= 1;
        if remaining <= lead_capacity {
            break;
        }
    }

    // Lead byte: length-marker bits on top, leftover payload bits below.
    start -= 1;
    scratch[start] = ((!lead_capacity << 1) | remaining) as u8;
    scratch[start..].to_vec()
}
2001
2002// ──────────────────────────────────────────────────────────────────────────────
2003// PORT STATUS
2004//   source:        src/llex.c  (581 lines, 24 functions)
2005//                  src/llex.h  (91 lines; merged)
2006//   target_crate:  lua-lex
2007//   confidence:    medium
2008//   todos:         18
2009//   port_notes:    12
2010//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
2011//   notes:         Logic is faithful to the C.  The main structural differences:
2012//                  (1) LexState.L removed — state threaded via fn params;
2013//                  (2) save/save_and_next/inclinenumber/helpers are all fallible
2014//                  (Result<_, LuaError>) because lexerror is no longer noreturn;
2015//                  (3) goto read_save/only_save/no_save in read_string replaced
2016//                  by EscapeResult enum; (4) Cross-crate calls (intern_str,
2017//                  luaH_getstr/finishset, luaG_addinfo, luaO_str2num,
2018//                  luaO_hexavalue, luaO_utf8esc, luaC_fix, luaC_checkGC) are
2019//                  stubbed with TODO; (5) LuaError, LuaString, ZIO, LexBuffer,
2020//                  LuaState defined as local stubs — Phase B replaces with real
2021//                  imports once the crate graph is wired.  Key Phase B tasks:
2022//                  wire import paths; move LuaString.extra accessor to pub;
2023//                  implement luaX_newstring anchor-table logic.  Numeric
2024//                  literal parsing now delegates to lua_vm::object::str2num
2025//                  (handles hex integers with wrap-around and hex floats).
2026// ──────────────────────────────────────────────────────────────────────────────