Skip to main content

lua_lex/
lib.rs

1//! Lexical analyzer — port of `llex.c` + `llex.h`.
2//!
3//! Provides the Lua 5.4 lexer: character-by-character scanning of a [`ZIO`]
4//! input stream into [`Token`] values, with one-token lookahead.  The
5//! `llex.h` header is merged here per PORTING.md §1.
6//!
7//! # C source files
8//! - `reference/lua-5.4.7/src/llex.c`  (581 lines, 24 functions)
9//! - `reference/lua-5.4.7/src/llex.h`  (91 lines; merged here)
10//!
11//! # Design notes
12//! - `LexState.L` (back-pointer to `lua_State`) is removed.  All functions
13//!   that need `LuaState` receive it as `state: &mut LuaState`.
14//! - `Token.token` is `i32` in Phase A (matching the C `int token` field).
15//!   Single-byte tokens are their ASCII values; reserved-word tokens start at
16//!   `FIRST_RESERVED` (257).  A proper `TokenKind` enum is deferred to Phase B.
17//! - `save` / `save_and_next` are now fallible (`Result<(), LuaError>`); the
18//!   `?` operator replaces the C noreturn `lexerror` call on buffer overflow.
19//! - The `goto read_save / only_save / no_save` pattern in `read_string` is
20//!   translated via the local `EscapeResult` enum.
21
22// TODO(port): resolve remaining cross-crate calls (intern_str, table anchor,
23// number parsing, utf8 encoding) in Phase B.  Canonical cross-crate type
24// imports are now in place per harness/type-vocabulary.tsv (see below).
25
26use std::io::Write as IoWrite;
27
28// PORT NOTE: GcRef<T> = Rc<T> in Phases A–C; replaced by real GC pointer in Phase D.
29use lua_types::gc::GcRef;
30
31// Canonical cross-crate types: imported from owner crates per
32// harness/type-vocabulary.tsv.  See PORTING.md §7.
33pub use lua_types::LuaError;
34pub use lua_types::LuaString;
35pub use lua_vm::state::LuaState;
36pub use lua_vm::table::LuaTable;
37
38/// Placeholder for `LexBuffer` from `lua_vm::zio`.
39/// TODO(port): replace with `use lua_vm::zio::LexBuffer` in Phase B.
40/// types.tsv: Mbuffer → LexBuffer
41pub struct LexBuffer {
42    buffer: Vec<u8>,
43}
44
45impl LexBuffer {
46    pub fn new() -> Self {
47        LexBuffer { buffer: Vec::new() }
48    }
49
50    /// macros.tsv: luaZ_bufflen → buf.len()
51    pub fn len(&self) -> usize {
52        self.buffer.len()
53    }
54
55    /// macros.tsv: luaZ_sizebuffer → buf.capacity()
56    pub fn capacity(&self) -> usize {
57        self.buffer.capacity()
58    }
59
60    /// macros.tsv: luaZ_buffer → buf.as_mut_slice()
61    pub fn as_slice(&self) -> &[u8] {
62        &self.buffer
63    }
64
65    /// macros.tsv: luaZ_resetbuffer → buf.clear()
66    pub fn clear(&mut self) {
67        self.buffer.clear();
68    }
69
70    /// macros.tsv: luaZ_buffremove → buf.truncate_by(i)
71    pub fn truncate_by(&mut self, i: usize) {
72        let new_len = self.buffer.len().saturating_sub(i);
73        self.buffer.truncate(new_len);
74    }
75
76    /// allocated capacity. In C this changes `buffsize`, not the live byte
77    /// count `n`. The Rust analogue therefore manipulates `Vec::capacity`,
78    /// never `Vec::len` (otherwise `push_byte` would write past the live
79    /// content and leave embedded zero padding inside the token text).
80    pub fn resize(&mut self, _state: &mut LuaState, size: usize) -> Result<(), LuaError> {
81        if size < self.buffer.len() {
82            self.buffer.truncate(size);
83        }
84        if size > self.buffer.capacity() {
85            let extra = size - self.buffer.capacity();
86            self.buffer.reserve_exact(extra);
87        }
88        Ok(())
89    }
90
91    /// Append one byte to the live contents.  Panics if capacity exceeded
92    /// (callers must pre-check via `save`).
93    fn push_byte(&mut self, c: u8) {
94        self.buffer.push(c);
95    }
96}
97
98impl Default for LexBuffer {
99    fn default() -> Self {
100        Self::new()
101    }
102}
103
104/// Placeholder for `ZIO` from `lua_vm::zio`.
105/// TODO(port): replace with `use lua_vm::zio::ZIO` in Phase B.
106/// types.tsv: Zio → ZIO
107pub struct ZIO {
108    // TODO(port): full ZIO implementation lives in lua_vm::zio; this is a stub.
109    reader: Box<dyn FnMut() -> Option<Vec<u8>>>,
110    n: usize,
111    p: usize,
112    current_chunk: Vec<u8>,
113}
114
115impl ZIO {
116    /// Construct a ZIO from a reader callback that yields successive chunks.
117    pub fn new(reader: Box<dyn FnMut() -> Option<Vec<u8>>>) -> Self {
118        ZIO { reader, n: 0, p: 0, current_chunk: Vec::new() }
119    }
120
121    /// Construct a ZIO that yields the supplied bytes once and then EOZ.
122    pub fn from_bytes(bytes: Vec<u8>) -> Self {
123        let mut once = Some(bytes);
124        ZIO::new(Box::new(move || once.take()))
125    }
126
127    /// macros.tsv: zgetc → z.getc()
128    pub fn getc(&mut self) -> i32 {
129        if self.n > 0 {
130            self.n -= 1;
131            let b = self.current_chunk[self.p] as u8;
132            self.p += 1;
133            b as i32
134        } else {
135            self.fill()
136        }
137    }
138
139    fn fill(&mut self) -> i32 {
140        match (self.reader)() {
141            None => EOZ,
142            Some(chunk) if chunk.is_empty() => EOZ,
143            Some(chunk) => {
144                self.n = chunk.len() - 1;
145                self.current_chunk = chunk;
146                self.p = 0;
147                let b = self.current_chunk[self.p] as u8;
148                self.p += 1;
149                b as i32
150            }
151        }
152    }
153}
154
155// ── Constants ─────────────────────────────────────────────────────────────────
156
157// macros.tsv: FIRST_RESERVED → const FIRST_RESERVED: i32 = 257
158/// First token kind value that is not a single-byte character.
159/// Single-byte tokens are represented by their ASCII value (0-255).
160pub const FIRST_RESERVED: i32 = 257;
161
162// macros.tsv: LUA_ENV → const LUA_ENV: &[u8] = b"_ENV"
163/// Name of the global environment upvalue.
164pub const LUA_ENV: &[u8] = b"_ENV";
165
166// macros.tsv: NUM_RESERVED → const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize
167/// Number of reserved words (keywords).
168pub const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize;
169
170// macros.tsv: EOZ → const EOZ: i32 = -1
171/// End-of-stream sentinel returned by ZIO::getc.
172pub const EOZ: i32 = -1;
173
174// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = ...
175const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
176    usize::MAX
177} else {
178    i64::MAX as usize
179};
180
181// macros.tsv: LUA_MIN_BUFFER → const LUA_MIN_BUFFER: usize = 32
182const LUA_MIN_BUFFER: usize = 32;
183
184// ── Token kind constants (ORDER RESERVED — matches C enum RESERVED) ───────────
185//
186// In C these are enum values.  In Rust we use i32 constants for Phase A
187// (faithful to `Token.token: int` in C) with a TODO for a proper enum in Phase B.
188//
189
190/// `and`
191pub const TK_AND: i32 = 257;
192/// `break`
193pub const TK_BREAK: i32 = 258;
194/// `do`
195pub const TK_DO: i32 = 259;
196/// `else`
197pub const TK_ELSE: i32 = 260;
198/// `elseif`
199pub const TK_ELSEIF: i32 = 261;
200/// `end`
201pub const TK_END: i32 = 262;
202/// `false`
203pub const TK_FALSE: i32 = 263;
204/// `for`
205pub const TK_FOR: i32 = 264;
206/// `function`
207pub const TK_FUNCTION: i32 = 265;
208/// `goto`
209pub const TK_GOTO: i32 = 266;
210/// `if`
211pub const TK_IF: i32 = 267;
212/// `in`
213pub const TK_IN: i32 = 268;
214/// `local`
215pub const TK_LOCAL: i32 = 269;
216/// `nil`
217pub const TK_NIL: i32 = 270;
218/// `not`
219pub const TK_NOT: i32 = 271;
220/// `or`
221pub const TK_OR: i32 = 272;
222/// `repeat`
223pub const TK_REPEAT: i32 = 273;
224/// `return`
225pub const TK_RETURN: i32 = 274;
226/// `then`
227pub const TK_THEN: i32 = 275;
228/// `true`
229pub const TK_TRUE: i32 = 276;
230/// `until`
231pub const TK_UNTIL: i32 = 277;
232/// `while`  (last keyword; NUM_RESERVED = TK_WHILE - FIRST_RESERVED + 1 = 22)
233pub const TK_WHILE: i32 = 278;
234/// `//`  (floor division)
235pub const TK_IDIV: i32 = 279;
236/// `..`  (concatenation)
237pub const TK_CONCAT: i32 = 280;
238/// `...` (vararg)
239pub const TK_DOTS: i32 = 281;
240/// `==`
241pub const TK_EQ: i32 = 282;
242/// `>=`
243pub const TK_GE: i32 = 283;
244/// `<=`
245pub const TK_LE: i32 = 284;
246/// `~=`
247pub const TK_NE: i32 = 285;
248/// `<<`
249pub const TK_SHL: i32 = 286;
250/// `>>`
251pub const TK_SHR: i32 = 287;
252/// `::`
253pub const TK_DBCOLON: i32 = 288;
254/// `<eof>`
255pub const TK_EOS: i32 = 289;
256/// `<number>`  (float literal)
257pub const TK_FLT: i32 = 290;
258/// `<integer>` (integer literal)
259pub const TK_INT: i32 = 291;
260/// `<name>`    (identifier)
261pub const TK_NAME: i32 = 292;
262/// `<string>`  (string literal)
263pub const TK_STRING: i32 = 293;
264
265// Lua 5.5 `global`: with the upstream-default LUA_COMPAT_GLOBAL it is NOT a
266// reserved word — it always lexes as TK_NAME (so it stays a valid identifier on
267// every version), and the parser recognizes the `global` declaration statement
268// contextually (see `globalstat`/`statement` in lua-parse). There is therefore
269// no dedicated token id.
270
// ORDER RESERVED — index 0 = TK_AND - FIRST_RESERVED, etc.
/// Display strings for tokens, indexed by `token - FIRST_RESERVED`.
///
/// The table has 37 entries: 22 keywords followed by 15 other terminal
/// symbols, in exactly the order of the `TK_*` constants.
pub static LUAX_TOKENS: &[&[u8]] = &[
    // keywords (indices 0-21)
    b"and", b"break", b"do", b"else", b"elseif",
    b"end", b"false", b"for", b"function", b"goto", b"if",
    b"in", b"local", b"nil", b"not", b"or", b"repeat",
    b"return", b"then", b"true", b"until", b"while",
    // other terminal symbols (indices 22-36)
    b"//", b"..", b"...", b"==", b">=", b"<=", b"~=",
    b"<<", b">>", b"::", b"<eof>",
    b"<number>", b"<integer>", b"<name>", b"<string>",
];
284
285// ── SemInfo / TokenValue ───────────────────────────────────────────────────────
286
287// types.tsv: SemInfo → TokenValue
288/// Semantic payload carried by a token.
289///
290/// Corresponds to `SemInfo` (a C union) in `llex.h`.  In Rust this is a
291/// discriminated union (enum).
292///
293/// # C mapping
294/// ```text
295/// SemInfo.r   → TokenValue::Float(f64)      (lua_Number)
296/// SemInfo.i   → TokenValue::Int(i64)        (lua_Integer)
297/// SemInfo.ts  → TokenValue::Str(GcRef<LuaString>)
298/// (no C field) → TokenValue::None           (default / unset)
299/// ```
300#[derive(Clone)]
301pub enum TokenValue {
302    /// No semantic value (default; used for single-byte and most multi-char tokens).
303    None,
304    /// Float literal payload.  C: `seminfo.r` (`lua_Number`).
305    Float(f64),
306    /// Integer literal payload.  C: `seminfo.i` (`lua_Integer`).
307    Int(i64),
308    /// String/name payload.  C: `seminfo.ts` (`TString *`).
309    Str(GcRef<LuaString>),
310}
311
312// ── Token ─────────────────────────────────────────────────────────────────────
313
314// types.tsv: Token → Token;  Token.token → i32 (Phase A; TODO: TokenKind enum Phase B)
315/// A single lexed token with its semantic payload.
316///
317/// `kind` is an `i32` whose value is either an ASCII byte code (for single-byte
318/// tokens like `+`, `-`, `[`) or one of the `TK_*` constants (for reserved
319/// words, multi-char symbols, and literals).
320///
321/// TODO(port): Phase B — replace `kind: i32` with a proper `TokenKind` enum
322/// covering both single-byte and named tokens (e.g. `TokenKind::Char(u8)` +
323/// named variants).
324#[derive(Clone)]
325pub struct Token {
326    pub kind: i32,
327    pub value: TokenValue,
328}
329
330impl Token {
331    /// Construct a token with no semantic value.
332    pub fn new(kind: i32) -> Self {
333        Token { kind, value: TokenValue::None }
334    }
335
336    /// The end-of-stream sentinel token.
337    pub fn eos() -> Self {
338        Token::new(TK_EOS)
339    }
340}
341
342// ── LexState ──────────────────────────────────────────────────────────────────
343
344// types.tsv: LexState → LexState;  LexState.L removed (thread via &mut LuaState)
345/// Per-chunk lexer (and shared parser) state.
346///
347/// Corresponds to `LexState` in `llex.h`.  Owns the input stream, token
348/// buffer, and current/lookahead tokens.
349///
350/// # C mapping (types.tsv)
351/// ```text
352/// LexState.current    → current: i32        (charint; -1 = EOZ)
353/// LexState.linenumber → linenumber: i32
354/// LexState.lastline   → lastline: i32
355/// LexState.t          → t: Token            (current token)
356/// LexState.lookahead  → lookahead: Token    (one-token lookahead)
357/// LexState.fs         → fs: Option<Box<FuncState>>   (parser state)
358/// LexState.L          → (removed; callers pass &mut LuaState)
359/// LexState.z          → z: ZIO              (owned input stream)
360/// LexState.buff       → buff: LexBuffer     (owned token-text buffer)
361/// LexState.h          → h: GcRef<LuaTable>  (string-anchor table)
362/// LexState.dyd        → dyd: DynData        (parser dynamic data)
363/// LexState.source     → source: GcRef<LuaString>
364/// LexState.envn       → envn: GcRef<LuaString>
365/// ```
366pub struct LexState {
367    pub current: i32,
368    pub linenumber: i32,
369    pub lastline: i32,
370    pub t: Token,
371    pub lookahead: Token,
372    // TODO(port): Box<FuncState> once FuncState lands in lua-parse (Phase B)
373    pub fs: Option<()>,
374    // PORT NOTE: C held a pointer; Rust owns the ZIO directly per types.tsv.
375    pub z: ZIO,
376    // PORT NOTE: C held a pointer; Rust owns the LexBuffer directly per types.tsv.
377    pub buff: LexBuffer,
378    // TODO(port): GcRef<LuaTable> once LuaTable is defined in Phase B
379    pub h: Option<GcRef<LuaTable>>,
380    /// Per-parse-session anchor for long strings. C-Lua's `ls->h` is a Lua
381    /// table that deduplicates all literal strings within a chunk (both short
382    /// and long), so e.g. `local s1 <const>="..."` and `local s2 <const>="..."`
383    /// with identical 50-byte payloads share one `TString` object — which is
384    /// what makes `string.format("%p", s1) == string.format("%p", s2)` hold.
385    /// Short strings already share identity via the global `interned_lt` pool,
386    /// but long strings (>LUAI_MAXSHORTLEN = 40) are not globally interned and
387    /// need this session-level map. Keyed by the string bytes; populated lazily
388    /// by `new_string`.
389    pub long_str_anchor: std::collections::HashMap<Vec<u8>, GcRef<LuaString>>,
390    // TODO(port): DynData once parser types land in Phase B
391    pub dyd: Option<()>,
392    pub source: GcRef<LuaString>,
393    pub envn: GcRef<LuaString>,
394    /// The active Lua version, snapshotted at lexer setup from
395    /// `state.global().lua_version` (fixed for the lifetime of a parse). The
396    /// error formatters (`lex_error`/`token2str`) take only `&LexState`, so they
397    /// read the version here rather than threading a `&LuaState` through every
398    /// syntax-error callsite. Lua 5.1 quotes the special multi-char token labels
399    /// (`<eof>`, `<name>`, …) in error messages where 5.2+ leaves them bare.
400    pub version: lua_types::LuaVersion,
401}
402
403// ── Character-classification helpers ─────────────────────────────────────────
404//
405// These are simplified ASCII implementations for Phase A.
406// TODO(port): import from lua_vm::ctype in Phase B; the full table handles
407// the LUA_UCID (Unicode identifiers) flag and matches the C bit-table exactly.
408//
409// PORT NOTE: the C macros take `int` (not `char`) so they handle EOZ (-1) safely.
410// These Rust fns match that contract: EOZ returns false for all predicates.
411
/// ASCII decimal digit `0`-`9`.
#[inline]
fn is_digit(c: i32) -> bool {
    (0..=0x7F).contains(&c) && (c as u8).is_ascii_digit()
}

/// ASCII hexadecimal digit: `0`-`9`, `a`-`f`, `A`-`F`.
#[inline]
fn is_xdigit(c: i32) -> bool {
    (0..=0x7F).contains(&c) && (c as u8).is_ascii_hexdigit()
}

// ALPHABIT: ASCII letters + '_'
/// ASCII letter or underscore (a valid first character of a name).
#[inline]
fn is_lalpha(c: i32) -> bool {
    if !(0..=0x7F).contains(&c) {
        return false;
    }
    let byte = c as u8;
    byte == b'_' || byte.is_ascii_alphabetic()
}

/// ASCII letter, underscore, or decimal digit.
#[inline]
fn is_lalnum(c: i32) -> bool {
    is_digit(c) || is_lalpha(c)
}

/// White space: `\t \n \v \f \r` (codes 9 through 13) or the space character.
#[inline]
fn is_space(c: i32) -> bool {
    c == 32 || (9..=13).contains(&c)
}

// PRINTBIT: printable ASCII (graph + space), i.e. 0x20-0x7E
/// Printable ASCII character, space included.
#[inline]
fn is_print(c: i32) -> bool {
    (0x20..=0x7E).contains(&c)
}
447
448#[inline]
449fn curr_is_newline(ls: &LexState) -> bool {
450    ls.current == b'\n' as i32 || ls.current == b'\r' as i32
451}
452
453// ── Low-level stream helpers ───────────────────────────────────────────────────
454
455/// Advance the lexer by one character.
456///
457/// Corresponds to the `next(ls)` macro.  Named `advance` to avoid collision
458/// with Rust's iterator method.
459#[inline]
460fn advance(ls: &mut LexState) {
461    // macros.tsv: zgetc → z.getc()
462    ls.current = ls.z.getc();
463}
464
465/// Append character `c` to the token buffer, growing it if necessary.
466///
467/// On overflow calls [`lex_error`] which becomes `Err(LuaError::Syntax(...))`.
468///
469/// # C source
470/// ```c
471///
472/// //   Mbuffer *b = ls->buff;
473/// //   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
474/// //     size_t newsize;
475/// //     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
476/// //       lexerror(ls, "lexical element too long", 0);
477/// //     newsize = luaZ_sizebuffer(b) * 2;
478/// //     luaZ_resizebuffer(ls->L, b, newsize);
479/// //   }
480/// //   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
481/// // }
482/// ```
483fn save(ls: &mut LexState, state: &mut LuaState, c: i32) -> Result<(), LuaError> {
484    // macros.tsv: luaZ_bufflen → buf.len(); luaZ_sizebuffer → buf.capacity()
485    if ls.buff.len() + 1 > ls.buff.capacity() {
486        if ls.buff.capacity() >= MAX_SIZE / 2 {
487            return Err(lex_error(ls, b"lexical element too long", 0));
488        }
489        //    luaZ_resizebuffer(ls->L, b, newsize);
490        // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
491        let newsize = ls.buff.capacity() * 2;
492        ls.buff.resize(state, newsize)?;
493    }
494    // macros.tsv: cast_char → x as i8  (C char is signed; Lua bytes stored as-is)
495    // PORT NOTE: we store the byte value directly; the i8 cast in C is for the
496    // C char type but the data is read back as unsigned via cast_uchar everywhere.
497    ls.buff.push_byte(c as u8);
498    Ok(())
499}
500
501/// Save the current character into the token buffer, then advance the stream.
502///
503/// Corresponds to the `save_and_next(ls)` macro.  Fallible because `save`
504/// may need to grow the buffer.
505#[inline]
506fn save_and_next(ls: &mut LexState, state: &mut LuaState) -> Result<(), LuaError> {
507    let c = ls.current;
508    save(ls, state, c)?;
509    advance(ls);
510    Ok(())
511}
512
513// ── Error helpers ─────────────────────────────────────────────────────────────
514
515// l_noret → -> !  but in Rust we return LuaError (callers wrap in Err(...))
516// error_sites.tsv: luaX_lexerror → return Err(LuaError::syntax_at(ls, "msg", token))
517/// Build a syntax error, optionally annotated with the offending token text.
518///
519/// Corresponds to the static `lexerror` function in `llex.c`.  In C this is
520/// `l_noret` (diverges via `luaD_throw`); in Rust it returns a `LuaError`
521/// value that callers wrap in `Err(...)`.
522///
523/// # C source
524/// ```c
525///
526/// //   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
527/// //   if (token)
528/// //     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
529/// //   luaD_throw(ls->L, LUA_ERRSYNTAX);
530/// // }
531/// ```
532pub fn lex_error(ls: &mut LexState, msg: &[u8], token: i32) -> LuaError {
533    const LUA_IDSIZE: usize = 60;
534    let mut buff = [0u8; LUA_IDSIZE];
535    let n = lua_vm::object::chunk_id(&mut buff[..], ls.source.as_bytes());
536    let src_part = &buff[..n];
537
538    let mut full_msg: Vec<u8> = Vec::new();
539    full_msg.extend_from_slice(src_part);
540    let _ = write!(full_msg, ":{}: ", ls.linenumber);
541    full_msg.extend_from_slice(msg);
542
543    if token != 0 {
544        let tok_text = txt_token(ls, token);
545        full_msg.extend_from_slice(b" near ");
546        full_msg.extend_from_slice(&tok_text);
547    }
548
549    LuaError::syntax_raw(&full_msg)
550}
551
552// LUAI_FUNC → pub(crate)
553// error_sites.tsv: luaX_syntaxerror → return Err(LuaError::syntax(format_args!("msg")))
554/// Report a syntax error at the current token.
555///
556/// # C source
557/// ```c
558///
559/// //   lexerror(ls, msg, ls->t.token);
560/// // }
561/// ```
562pub fn syntax_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
563    let token = ls.t.kind;
564    lex_error(ls, msg, token)
565}
566
567/// Report a semantic error at the current line WITHOUT the `near <token>`
568/// suffix.
569///
570/// Mirrors upstream `luaK_semerror` (`lcode.c`), which sets
571/// `ls->t.token = 0` before calling `luaX_syntaxerror` so the `near` clause is
572/// suppressed. Used for attribute errors (`unknown attribute '<name>'`,
573/// `global variables cannot be to-be-closed`) where the offending construct is
574/// the attribute itself, not the current lookahead token.
575pub fn sem_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
576    lex_error(ls, msg, 0)
577}
578
579/// Produce a human-readable representation of `token` for error messages.
580///
581/// For `TK_NAME`, `TK_STRING`, `TK_FLT`, `TK_INT`: formats the current
582/// token buffer contents as `'<text>'`.  For everything else, delegates to
583/// [`token2str`].
584///
585/// # C source
586/// ```c
587///
588/// //   switch (token) {
589/// //     case TK_NAME: case TK_STRING:
590/// //     case TK_FLT: case TK_INT:
591/// //       save(ls, '\0');
592/// //       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
593/// //     default:
594/// //       return luaX_token2str(ls, token);
595/// //   }
596/// // }
597/// ```
598///
599/// PORT NOTE: C calls `luaO_pushfstring` which pushes the string onto the
600/// Lua stack (stack-anchored temporary).  Rust returns `Vec<u8>` directly
601/// since there is no stack-based string lifecycle for error formatting.
602fn txt_token(ls: &mut LexState, token: i32) -> Vec<u8> {
603    match token {
604        t if t == TK_NAME || t == TK_STRING || t == TK_FLT || t == TK_INT => {
605            let mut v: Vec<u8> = Vec::new();
606            v.push(b'\'');
607            let buff = ls.buff.as_slice();
608            let trimmed = if buff.last() == Some(&0) { &buff[..buff.len() - 1] } else { buff };
609            v.extend_from_slice(trimmed);
610            v.push(b'\'');
611            v
612        }
613        _ => token2str_raw(token, ls.version),
614    }
615}
616
617// LUAI_FUNC → pub(crate)
618/// Produce a human-readable token description (for error messages and the parser).
619///
620/// Single-byte printable tokens are formatted as `'X'`; non-printable as
621/// `'<\N>'`.  Reserved words and multi-char symbols are formatted as `'kw'`.
622/// Literal tokens (`<name>`, `<string>`, etc.) return the bare label.
623///
624/// # C source
625/// ```c
626///
627/// //   if (token < FIRST_RESERVED) {
628/// //     if (lisprint(token))
629/// //       return luaO_pushfstring(ls->L, "'%c'", token);
630/// //     else
631/// //       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
632/// //   }
633/// //   else {
634/// //     const char *s = luaX_tokens[token - FIRST_RESERVED];
635/// //     if (token < TK_EOS)
636/// //       return luaO_pushfstring(ls->L, "'%s'", s);
637/// //     else
638/// //       return s;
639/// //   }
640/// // }
641/// ```
642///
643/// PORT NOTE: The `LexState` parameter is retained in the signature for API
644/// parity with the C export, but is unused in Rust because we don't push onto
645/// the Lua stack.  The real formatting is in [`token2str_raw`].
646pub fn token2str(ls: &LexState, token: i32) -> Vec<u8> {
647    token2str_raw(token, ls.version)
648}
649
650/// Inner implementation of [`token2str`] that does not need `LexState`.
651///
652/// PORT NOTE: `version` gates the 5.1 special-token quoting. Upstream 5.1's
653/// `luaX_lexerror`/`error_expected` wrap the whole near/expected token in
654/// `LUA_QS` ('%s'), so the bare multi-char labels (`<eof>`, `<name>`, …) that
655/// `luaX_token2str` returns for `token >= TK_EOS` end up quoted. 5.2 rewrote
656/// `txtToken` to leave those bare and quote only symbols/reserved/literals, so
657/// for 5.2+ the `>= TK_EOS` arm stays unquoted. (Issue #105.)
658fn token2str_raw(token: i32, version: lua_types::LuaVersion) -> Vec<u8> {
659    if token < FIRST_RESERVED {
660        if is_print(token) {
661            vec![b'\'', token as u8, b'\'']
662        } else {
663            // PORT NOTE: uses write! to Vec<u8> to avoid String allocation for Lua data.
664            let mut v: Vec<u8> = Vec::new();
665            v.extend_from_slice(b"'<\\");
666            let _ = write!(&mut v, "{}", token);
667            v.extend_from_slice(b">'");
668            v
669        }
670    } else {
671        let idx = (token - FIRST_RESERVED) as usize;
672        let s = LUAX_TOKENS[idx];
673        if token < TK_EOS || version == lua_types::LuaVersion::V51 {
674            let mut v: Vec<u8> = Vec::with_capacity(s.len() + 2);
675            v.push(b'\'');
676            v.extend_from_slice(s);
677            v.push(b'\'');
678            v
679        } else {
680            s.to_vec()
681        }
682    }
683}
684
685// ── Public init / setup ───────────────────────────────────────────────────────
686
687// LUAI_FUNC → pub(crate)
688/// Initialise the lexer subsystem: intern all reserved words and fix them
689/// in the GC so they are never collected.
690///
691/// Must be called exactly once during VM startup via `luaX_init`.
692///
693/// # C source
694/// ```c
695///
696/// //   int i;
697/// //   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
698/// //   luaC_fix(L, obj2gco(e));  /* never collect this name */
699/// //   for (i=0; i<NUM_RESERVED; i++) {
700/// //     TString *ts = luaS_new(L, luaX_tokens[i]);
701/// //     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
702/// //     ts->extra = cast_byte(i+1);  /* reserved word */
703/// //   }
704/// // }
705/// ```
706pub fn init(state: &mut LuaState) -> Result<(), LuaError> {
707    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
708    // TODO(port): call state.intern_str(LUA_ENV) once LuaState has that method (Phase B)
709    let _e = intern_str_stub(state, LUA_ENV)?;
710
711    // macros.tsv: luaC_objbarrier / luaC_fix — GC fix; no-op in Phases A-C
712    // TODO(port): state.gc().fix(e) in Phase D
713
714    for i in 0..NUM_RESERVED {
715        // macros.tsv: luaS_new → state.intern_str(...)
716        // TODO(port): call state.intern_str(LUAX_TOKENS[i]) in Phase B
717        let ts = intern_str_stub(state, LUAX_TOKENS[i])?;
718
719        // TODO(port): state.gc().fix(ts.clone()) in Phase D
720
721        // macros.tsv: cast_byte → x as u8
722        // PORT NOTE: LuaString.extra uses Cell<u8> interior mutability.
723        // TODO(port): ts.set_extra((i + 1) as u8) — needs pub accessor on LuaString
724        let _ = ts; // suppress unused warning until Phase B
725    }
726
727    Ok(())
728}
729
730// LUAI_FUNC → pub(crate)
731/// Initialise `ls` for lexing a new chunk from stream `z`.
732///
733/// # C source
734/// ```c
735///
736/// //                         TString *source, int firstchar) {
737/// //   ls->t.token = 0;
738/// //   ls->L = L;
739/// //   ls->current = firstchar;
740/// //   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
741/// //   ls->z = z;
742/// //   ls->fs = NULL;
743/// //   ls->linenumber = 1;
744/// //   ls->lastline = 1;
745/// //   ls->source = source;
746/// //   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
747/// //   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);
748/// // }
749/// ```
750pub fn set_input(
751    state: &mut LuaState,
752    ls: &mut LexState,
753    z: ZIO,
754    source: GcRef<LuaString>,
755    firstchar: i32,
756) -> Result<(), LuaError> {
757    ls.t = Token::new(0);
758    ls.current = firstchar;
759    ls.lookahead = Token::eos();
760    ls.z = z;
761    ls.fs = None;
762    ls.linenumber = 1;
763    ls.lastline = 1;
764    ls.source = source;
765    ls.version = state.global().lua_version;
766    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
767    // TODO(port): state.intern_str(LUA_ENV) in Phase B
768    ls.envn = intern_str_stub(state, LUA_ENV)?;
769    // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
770    ls.buff.resize(state, LUA_MIN_BUFFER)?;
771    Ok(())
772}
773
774// LUAI_FUNC → pub(crate)
775/// Create (or retrieve) a Lua string and anchor it in the parser's GC-protection
776/// table `ls.h` so it cannot be collected before the end of compilation.
777///
778/// Also internalises long strings so that each unique content has exactly one
779/// copy in memory.  The table `ls.h` is used as a set: the string is both the
780/// key and the value.
781///
782/// # C source
783/// ```c
784///
785/// //   lua_State *L = ls->L;
786/// //   TString *ts = luaS_newlstr(L, str, l);
787/// //   const TValue *o = luaH_getstr(ls->h, ts);
788/// //   if (!ttisnil(o))  /* string already present? */
789/// //     ts = keystrval(nodefromval(o));  /* get saved copy */
790/// //   else {
791/// //     TValue *stv = s2v(L->top.p++);  /* reserve stack space */
792/// //     setsvalue(L, stv, ts);           /* anchor the string */
793/// //     luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
794/// //     luaC_checkGC(L);
795/// //     L->top.p--;                       /* remove string from stack */
796/// //   }
797/// //   return ts;
798/// // }
799/// ```
800pub(crate) fn new_string(
801    state: &mut LuaState,
802    ls: &mut LexState,
803    bytes: &[u8],
804) -> Result<GcRef<LuaString>, LuaError> {
805    // PORT NOTE: in C, the anchor table ls->h is a Lua table mapping the string
806    // to itself so a second occurrence of the same literal in the chunk returns
807    // the originally-created TString. We use a plain HashMap on LexState
808    // (`long_str_anchor`) for the equivalent dedup — sufficient because Phase
809    // A-C `GcRef<T>` is `Rc<T>` and identity is determined by the `Rc`
810    // allocation. Short strings already share identity via the global pool;
811    // long strings (>LUAI_MAXSHORTLEN) need this session-level map.
812    if let Some(existing) = ls.long_str_anchor.get(bytes) {
813        return Ok(existing.clone());
814    }
815    let ts = intern_str_stub(state, bytes)?;
816    ls.long_str_anchor.insert(bytes.to_vec(), ts.clone());
817    Ok(ts)
818}
819
820// ── Public advance / lookahead ─────────────────────────────────────────────────
821
822// LUAI_FUNC → pub(crate)
823/// Consume the current token; load the next one from the stream.
824///
825/// If a lookahead token was set, it becomes the current token without re-reading
826/// from the stream.
827///
828/// # C source
829/// ```c
830///
831/// //   ls->lastline = ls->linenumber;
832/// //   if (ls->lookahead.token != TK_EOS) {
833/// //     ls->t = ls->lookahead;
834/// //     ls->lookahead.token = TK_EOS;
835/// //   }
836/// //   else
837/// //     ls->t.token = llex(ls, &ls->t.seminfo);
838/// // }
839/// ```
840pub fn next(
841    state: &mut LuaState,
842    ls: &mut LexState,
843) -> Result<(), LuaError> {
844    ls.lastline = ls.linenumber;
845
846    if ls.lookahead.kind != TK_EOS {
847        // Clone to avoid borrow conflict; LuaString inside TokenValue is GcRef (Rc).
848        ls.t = ls.lookahead.clone();
849        ls.lookahead = Token::eos();
850    } else {
851        let mut val = TokenValue::None;
852        let kind = llex(state, ls, &mut val)?;
853        ls.t = Token { kind, value: val };
854    }
855    Ok(())
856}
857
858// LUAI_FUNC → pub(crate)
859/// Peek at the next token without consuming the current one.
860///
861/// The lookahead token is cached in `ls.lookahead` and returned.  Only one
862/// token of lookahead is supported; calling this twice without an intervening
863/// [`next`] is a logic error (asserted in debug builds).
864///
865/// # C source
866/// ```c
867///
868/// //   lua_assert(ls->lookahead.token == TK_EOS);
869/// //   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
870/// //   return ls->lookahead.token;
871/// // }
872/// ```
873pub fn lookahead(
874    state: &mut LuaState,
875    ls: &mut LexState,
876) -> Result<i32, LuaError> {
877    // macros.tsv: lua_assert → debug_assert!
878    debug_assert!(
879        ls.lookahead.kind == TK_EOS,
880        "luaX_lookahead: lookahead already set"
881    );
882
883    let mut val = TokenValue::None;
884    let kind = llex(state, ls, &mut val)?;
885    ls.lookahead = Token { kind, value: val };
886
887    Ok(ls.lookahead.kind)
888}
889
890// ── Private lexer helpers ──────────────────────────────────────────────────────
891
892/// If the current character equals `c`, advance and return `true`.
893///
894/// # C source
895/// ```c
896///
897/// //   if (ls->current == c) { next(ls); return 1; }
898/// //   else return 0;
899/// // }
900/// ```
901fn check_next1(ls: &mut LexState, c: i32) -> bool {
902    if ls.current == c {
903        advance(ls);
904        true
905    } else {
906        false
907    }
908}
909
910/// If the current character is either of the two bytes in `set`, save-and-advance
911/// and return `true`.
912///
913/// # C source
914/// ```c
915///
916/// //   lua_assert(set[2] == '\0');
917/// //   if (ls->current == set[0] || ls->current == set[1]) {
918/// //     save_and_next(ls);
919/// //     return 1;
920/// //   }
921/// //   else return 0;
922/// // }
923/// ```
924fn check_next2(
925    ls: &mut LexState,
926    state: &mut LuaState,
927    set: &[u8; 2],
928) -> Result<bool, LuaError> {
929    if ls.current == set[0] as i32 || ls.current == set[1] as i32 {
930        save_and_next(ls, state)?;
931        Ok(true)
932    } else {
933        Ok(false)
934    }
935}
936
937/// Increment the line counter and consume the newline sequence.
938///
939/// Handles `\n`, `\r`, `\n\r`, and `\r\n`.
940///
941/// # C source
942/// ```c
943///
944/// //   int old = ls->current;
945/// //   lua_assert(currIsNewline(ls));
946/// //   next(ls);  /* skip '\n' or '\r' */
947/// //   if (currIsNewline(ls) && ls->current != old)
948/// //     next(ls);  /* skip '\n\r' or '\r\n' */
949/// //   if (++ls->linenumber >= MAX_INT)
950/// //     lexerror(ls, "chunk has too many lines", 0);
951/// // }
952/// ```
953fn inc_line_number(ls: &mut LexState, _state: &mut LuaState) -> Result<(), LuaError> {
954    // macros.tsv: lua_assert → debug_assert!
955    debug_assert!(curr_is_newline(ls), "inc_line_number: not at a newline");
956
957    let old = ls.current;
958    advance(ls);
959
960    if curr_is_newline(ls) && ls.current != old {
961        advance(ls);
962    }
963
964    // macros.tsv: MAX_INT → i32::MAX
965    ls.linenumber += 1;
966    if ls.linenumber >= i32::MAX {
967        return Err(lex_error(ls, b"chunk has too many lines", 0));
968    }
969    Ok(())
970}
971
972/// Scan a numeric literal (integer or float, decimal or hex).
973///
974/// The caller may have already read an initial dot.  Accepts the pattern:
975/// `%d(%x|%.|(Ee[+-]?))*` or `0[Xx](%x|%.|(Pp[+-]?))*`.
976///
977/// Returns `TK_INT` for integers, `TK_FLT` for floats.
978///
979/// # C source
980/// ```c
981///
982/// //   TValue obj;
983/// //   const char *expo = "Ee";
984/// //   int first = ls->current;
985/// //   lua_assert(lisdigit(ls->current));
986/// //   save_and_next(ls);
987/// //   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
988/// //     expo = "Pp";
989/// //   for (;;) {
990/// //     if (check_next2(ls, expo))
991/// //       check_next2(ls, "-+");
992/// //     else if (lisxdigit(ls->current) || ls->current == '.')
993/// //       save_and_next(ls);
994/// //     else break;
995/// //   }
996/// //   if (lislalpha(ls->current))  /* numeral touching a letter? */
997/// //     save_and_next(ls);         /* force an error */
998/// //   save(ls, '\0');
999/// //   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)
1000/// //     lexerror(ls, "malformed number", TK_FLT);
1001/// //   if (ttisinteger(&obj)) { seminfo->i = ivalue(&obj); return TK_INT; }
1002/// //   else { seminfo->r = fltvalue(&obj); return TK_FLT; }
1003/// // }
1004/// ```
1005fn read_numeral(
1006    state: &mut LuaState,
1007    ls: &mut LexState,
1008    seminfo: &mut TokenValue,
1009) -> Result<i32, LuaError> {
1010    let mut expo: &[u8; 2] = b"Ee";
1011
1012    let first = ls.current;
1013
1014    debug_assert!(is_digit(ls.current), "read_numeral: not at a digit");
1015
1016    save_and_next(ls, state)?;
1017
1018    if first == b'0' as i32 && check_next2(ls, state, b"xX")? {
1019        expo = b"Pp";
1020    }
1021
1022    loop {
1023        if check_next2(ls, state, expo)? {
1024            check_next2(ls, state, b"-+")?;
1025        } else if is_xdigit(ls.current) || ls.current == b'.' as i32 {
1026            //      save_and_next(ls);
1027            save_and_next(ls, state)?;
1028        } else {
1029            break;
1030        }
1031    }
1032
1033    if is_lalpha(ls.current) {
1034        save_and_next(ls, state)?;
1035    }
1036
1037    // In Rust, luaO_str2num will receive a byte slice; NUL is not needed.
1038    // We save 0 for parity with C, but our str2num stub ignores it.
1039    save(ls, state, 0)?;
1040
1041    //        lexerror(ls, "malformed number", TK_FLT);
1042    // macros.tsv: luaZ_buffer → buf.as_mut_slice()
1043    let buf = ls.buff.as_slice();
1044    let num_bytes = if buf.last() == Some(&0) { &buf[..buf.len() - 1] } else { buf };
1045    let mut obj = lua_types::LuaValue::Nil;
1046    if lua_vm::object::str2num(num_bytes, &mut obj) == 0 {
1047        return Err(lex_error(ls, b"malformed number", TK_FLT));
1048    }
1049    match obj {
1050        lua_types::LuaValue::Int(i) => {
1051            // Lua 5.1/5.2 are float-only: `lua_Number` is the only numeric type,
1052            // so every numeric literal is parsed as a float (`lua_str2number`),
1053            // including ones written without a decimal point. A literal like
1054            // 9007199254740993 therefore loses precision exactly as in lua5.2.4
1055            // (prints `9.007199254741e+15`), rather than surviving as an i64.
1056            if is_float_only(state) {
1057                *seminfo = TokenValue::Float(i as f64);
1058                Ok(TK_FLT)
1059            } else {
1060                *seminfo = TokenValue::Int(i);
1061                Ok(TK_INT)
1062            }
1063        }
1064        lua_types::LuaValue::Float(f) => {
1065            *seminfo = TokenValue::Float(f);
1066            Ok(TK_FLT)
1067        }
1068        _ => unreachable!("str2num returned non-numeric LuaValue"),
1069    }
1070}
1071
1072/// Scan a `[=*[` or `]=*]` sequence; leave the last bracket as current char.
1073///
1074/// Returns:
1075/// - `count + 2` if well-formed (where `count` is the number of `=` signs),
1076/// - `1` if a single bracket with no `=`s and no second bracket,
1077/// - `0` if malformed (e.g. `[==` with no closing bracket).
1078///
1079/// # C source
1080/// ```c
1081///
1082/// //   size_t count = 0;
1083/// //   int s = ls->current;
1084/// //   lua_assert(s == '[' || s == ']');
1085/// //   save_and_next(ls);
1086/// //   while (ls->current == '=') {
1087/// //     save_and_next(ls);
1088/// //     count++;
1089/// //   }
1090/// //   return (ls->current == s) ? count + 2
1091/// //          : (count == 0) ? 1
1092/// //          : 0;
1093/// // }
1094/// ```
1095fn skip_sep(
1096    state: &mut LuaState,
1097    ls: &mut LexState,
1098) -> Result<usize, LuaError> {
1099    let mut count: usize = 0;
1100    let s = ls.current;
1101    debug_assert!(s == b'[' as i32 || s == b']' as i32, "skip_sep: not at bracket");
1102
1103    save_and_next(ls, state)?;
1104
1105    while ls.current == b'=' as i32 {
1106        save_and_next(ls, state)?;
1107        count += 1;
1108    }
1109
1110    if ls.current == s {
1111        Ok(count + 2)
1112    } else if count == 0 {
1113        Ok(1)
1114    } else {
1115        Ok(0)
1116    }
1117}
1118
1119/// Scan a long string or long comment delimited by `[=*[` … `]=*]`.
1120///
1121/// `seminfo` is `Some` when reading a string literal; `None` when skipping a
1122/// long comment.  When `None`, buffer contents are discarded on each newline
1123/// to avoid wasting memory.
1124///
1125/// # C source
1126/// ```c
1127///
1128/// //   int line = ls->linenumber;
1129/// //   save_and_next(ls);  /* skip 2nd '[' */
1130/// //   if (currIsNewline(ls)) inclinenumber(ls);
1131/// //   for (;;) {
1132/// //     switch (ls->current) {
1133/// //       case EOZ: { /* error */
1134/// //         const char *what = (seminfo ? "string" : "comment");
1135/// //         const char *msg = luaO_pushfstring(..., what, line);
1136/// //         lexerror(ls, msg, TK_EOS);
1137/// //         break;
1138/// //       }
1139/// //       case ']': {
1140/// //         if (skip_sep(ls) == sep) {
1141/// //           save_and_next(ls);  /* skip 2nd ']' */
1142/// //           goto endloop;
1143/// //         }
1144/// //         break;
1145/// //       }
1146/// //       case '\n': case '\r': {
1147/// //         save(ls, '\n');
1148/// //         inclinenumber(ls);
1149/// //         if (!seminfo) luaZ_resetbuffer(ls->buff);
1150/// //         break;
1151/// //       }
1152/// //       default: {
1153/// //         if (seminfo) save_and_next(ls);
1154/// //         else next(ls);
1155/// //       }
1156/// //     }
1157/// //   } endloop:
1158/// //   if (seminfo)
1159/// //     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1160/// //                                      luaZ_bufflen(ls->buff) - 2 * sep);
1161/// // }
1162/// ```
1163fn read_long_string(
1164    state: &mut LuaState,
1165    ls: &mut LexState,
1166    seminfo: Option<&mut TokenValue>,
1167    sep: usize,
1168) -> Result<(), LuaError> {
1169    let line = ls.linenumber;
1170
1171    save_and_next(ls, state)?;
1172
1173    if curr_is_newline(ls) {
1174        inc_line_number(ls, state)?;
1175    }
1176
1177    // is_string: whether we are reading a string (true) or a comment (false)
1178    let is_string = seminfo.is_some();
1179
1180    loop {
1181        match ls.current {
1182            c if c == EOZ => {
1183                let what: &[u8] = if is_string { b"string" } else { b"comment" };
1184                // PORT NOTE: build message as Vec<u8> to avoid String allocation.
1185                let mut msg: Vec<u8> = Vec::new();
1186                msg.extend_from_slice(b"unfinished long ");
1187                msg.extend_from_slice(what);
1188                msg.extend_from_slice(b" (starting at line ");
1189                let _ = write!(&mut msg, "{}", line);
1190                msg.push(b')');
1191                return Err(lex_error(ls, &msg, TK_EOS));
1192            }
1193            c if c == b']' as i32 => {
1194                let s = skip_sep(state, ls)?;
1195                if s == sep {
1196                    save_and_next(ls, state)?;
1197                    break;
1198                }
1199                // else: the ']' sequence wasn't the closing delimiter; continue
1200            }
1201            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1202                save(ls, state, b'\n' as i32)?;
1203                inc_line_number(ls, state)?;
1204                // macros.tsv: luaZ_resetbuffer → buf.clear()
1205                if !is_string {
1206                    ls.buff.clear();
1207                }
1208            }
1209            _ => {
1210                if is_string {
1211                    save_and_next(ls, state)?;
1212                } else {
1213                    advance(ls);
1214                }
1215            }
1216        }
1217    }
1218
1219    //      seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1220    //                                       luaZ_bufflen(ls->buff) - 2 * sep);
1221    if let Some(out) = seminfo {
1222        // The buffer contains: sep bytes of '[=' + content + sep bytes of '=]'
1223        // We want the content in between.
1224        // PORT NOTE: per PORTING.md §4.3, capture the slice into an owned
1225        // Vec so the immutable borrow of ls.buff is dropped before the
1226        // mutable borrow needed by new_string.
1227        let buf = ls.buff.as_slice();
1228        let content: Vec<u8> = buf[sep..buf.len() - sep].to_vec();
1229        let ts = new_string(state, ls, &content)?;
1230        *out = TokenValue::Str(ts);
1231    }
1232    Ok(())
1233}
1234
1235/// Check `c` is non-zero (truthy); if not, save the current char and raise a
1236/// string-escape error.
1237///
1238/// # C source
1239/// ```c
1240///
1241/// //   if (!c) {
1242/// //     if (ls->current != EOZ)
1243/// //       save_and_next(ls);  /* add current to buffer for error message */
1244/// //     lexerror(ls, msg, TK_STRING);
1245/// //   }
1246/// // }
1247/// ```
1248fn esc_check(
1249    state: &mut LuaState,
1250    ls: &mut LexState,
1251    ok: bool,
1252    msg: &[u8],
1253) -> Result<(), LuaError> {
1254    if !ok {
1255        if ls.current != EOZ {
1256            save_and_next(ls, state)?;
1257        }
1258        return Err(lex_error(ls, msg, TK_STRING));
1259    }
1260    Ok(())
1261}
1262
1263/// Save-and-advance, then verify the new current char is a hex digit; return
1264/// its numeric value (0-15).
1265///
1266/// # C source
1267/// ```c
1268///
1269/// //   save_and_next(ls);
1270/// //   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
1271/// //   return luaO_hexavalue(ls->current);
1272/// // }
1273/// ```
1274fn get_hexa(
1275    state: &mut LuaState,
1276    ls: &mut LexState,
1277) -> Result<u32, LuaError> {
1278    save_and_next(ls, state)?;
1279    esc_check(state, ls, is_xdigit(ls.current), b"hexadecimal digit expected")?;
1280    // TODO(port): call lua_vm::object::hex_value in Phase B
1281    Ok(hex_value_stub(ls.current))
1282}
1283
1284/// Scan a `\xNN` hex escape; return the decoded byte value.
1285///
1286/// # C source
1287/// ```c
1288///
1289/// //   int r = gethexa(ls);
1290/// //   r = (r << 4) + gethexa(ls);
1291/// //   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
1292/// //   return r;
1293/// // }
1294/// ```
1295fn read_hex_esc(
1296    state: &mut LuaState,
1297    ls: &mut LexState,
1298) -> Result<u32, LuaError> {
1299    let r = get_hexa(state, ls)?;
1300    let r = (r << 4) + get_hexa(state, ls)?;
1301    // macros.tsv: luaZ_buffremove → buf.truncate_by(i)
1302    ls.buff.truncate_by(2);
1303    Ok(r)
1304}
1305
1306/// Scan a `\u{XXXXXX}` UTF-8 escape; return the Unicode codepoint.
1307///
1308/// # C source
1309/// ```c
1310///
1311/// //   unsigned long r;
1312/// //   int i = 4;  /* chars to remove: '\', 'u', '{', first digit */
1313/// //   save_and_next(ls);  /* skip 'u' */
1314/// //   esccheck(ls, ls->current == '{', "missing '{'");
1315/// //   r = gethexa(ls);  /* must have at least one digit */
1316/// //   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
1317/// //     i++;
1318/// //     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
1319/// //     r = (r << 4) + luaO_hexavalue(ls->current);
1320/// //   }
1321/// //   esccheck(ls, ls->current == '}', "missing '}'");
1322/// //   next(ls);  /* skip '}' */
1323/// //   luaZ_buffremove(ls->buff, i);
1324/// //   return r;
1325/// // }
1326/// ```
1327fn read_utf8_esc(
1328    state: &mut LuaState,
1329    ls: &mut LexState,
1330) -> Result<u32, LuaError> {
1331    let mut i: usize = 4;
1332
1333    save_and_next(ls, state)?;
1334
1335    esc_check(state, ls, ls.current == b'{' as i32, b"missing '{'")?;
1336
1337    let mut r = get_hexa(state, ls)?;
1338
1339    // The codepoint upper bound is version-gated and the C control flow differs
1340    // between families (`llex.c readutf8esc`):
1341    //   * 5.3 (L336-340): `r = (r<<4)+digit; esccheck(r <= 0x10FFFF, ...)` —
1342    //     accumulate the digit FIRST, then bound the running value at 0x10FFFF.
1343    //   * 5.4 (L351) / 5.5 (L373): `esccheck(r <= (0x7FFFFFFFu >> 4), ...);
1344    //     r = (r<<4)+digit` — bound BEFORE the shift, allowing up to 0x7FFFFFFF.
1345    // The order (check-before-shift vs shift-before-check) is reproduced exactly
1346    // because it also determines how many digits land in the `near '...'` buffer
1347    // snippet of the error message.
1348    let is_v53 = matches!(state.global().lua_version, lua_types::LuaVersion::V53);
1349
1350    // cast_void: discard return value
1351    loop {
1352        save_and_next(ls, state)?;
1353        if !is_xdigit(ls.current) {
1354            break;
1355        }
1356        i += 1;
1357        if is_v53 {
1358            // TODO(port): lua_vm::object::hex_value in Phase B
1359            r = (r << 4) + hex_value_stub(ls.current);
1360            esc_check(state, ls, r <= 0x10_FFFF, b"UTF-8 value too large")?;
1361        } else {
1362            esc_check(state, ls, r <= (0x7FFF_FFFFu32 >> 4), b"UTF-8 value too large")?;
1363            // TODO(port): lua_vm::object::hex_value in Phase B
1364            r = (r << 4) + hex_value_stub(ls.current);
1365        }
1366    }
1367
1368    esc_check(state, ls, ls.current == b'}' as i32, b"missing '}'")?;
1369
1370    advance(ls);
1371
1372    ls.buff.truncate_by(i);
1373
1374    Ok(r)
1375}
1376
1377/// Scan `\u{...}` and append the UTF-8 encoding of the codepoint to the buffer.
1378///
1379/// # C source
1380/// ```c
1381///
1382/// //   char buff[UTF8BUFFSZ];
1383/// //   int n = luaO_utf8esc(buff, readutf8esc(ls));
1384/// //   for (; n > 0; n--)
1385/// //     save(ls, buff[UTF8BUFFSZ - n]);
1386/// // }
1387/// ```
1388fn utf8_esc(
1389    state: &mut LuaState,
1390    ls: &mut LexState,
1391) -> Result<(), LuaError> {
1392    let codepoint = read_utf8_esc(state, ls)?;
1393
1394    // macros.tsv: UTF8BUFFSZ → const UTF8_BUF_SZ: usize = 8
1395    // TODO(port): call lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
1396    // For Phase A, encode directly here.
1397    let encoded = utf8_encode_stub(codepoint);
1398
1399    for &b in &encoded {
1400        save(ls, state, b as i32)?;
1401    }
1402    Ok(())
1403}
1404
1405/// Scan a decimal escape `\ddd` (up to 3 digits); return the byte value.
1406///
1407/// # C source
1408/// ```c
1409///
1410/// //   int i;
1411/// //   int r = 0;
1412/// //   for (i = 0; i < 3 && lisdigit(ls->current); i++) {
1413/// //     r = 10*r + ls->current - '0';
1414/// //     save_and_next(ls);
1415/// //   }
1416/// //   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
1417/// //   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
1418/// //   return r;
1419/// // }
1420/// ```
1421fn read_dec_esc(
1422    state: &mut LuaState,
1423    ls: &mut LexState,
1424) -> Result<u32, LuaError> {
1425    let mut i: usize = 0;
1426    let mut r: u32 = 0;
1427
1428    while i < 3 && is_digit(ls.current) {
1429        r = 10 * r + (ls.current as u32 - b'0' as u32);
1430        save_and_next(ls, state)?;
1431        i += 1;
1432    }
1433
1434    // UCHAR_MAX = 255 = u8::MAX. Lua 5.1 spells this `escape sequence too
1435    // large` (the `decimal escape too large` wording is 5.2+). Verified against
1436    // lua5.1.5; see specs/followup/5.1-roster-syntax.md §2.
1437    let too_large_msg: &[u8] = if matches!(
1438        state.global().lua_version,
1439        lua_types::LuaVersion::V51
1440    ) {
1441        b"escape sequence too large"
1442    } else {
1443        b"decimal escape too large"
1444    };
1445    esc_check(state, ls, r <= u8::MAX as u32, too_large_msg)?;
1446
1447    ls.buff.truncate_by(i);
1448    Ok(r)
1449}
1450
1451/// Scan a short (single/double-quoted) string literal.
1452///
1453/// The C function uses `goto read_save / only_save / no_save` for escape
1454/// handling.  In Rust this is replaced by the `EscapeResult` enum.
1455///
1456/// # C source (see llex.c lines 382-442 for full listing)
1457fn read_string(
1458    state: &mut LuaState,
1459    ls: &mut LexState,
1460    del: i32,
1461    seminfo: &mut TokenValue,
1462) -> Result<(), LuaError> {
1463    // Encoding for what the escape sequence handler needs to do after decoding.
1464    //
1465    // read_save:  advance(ls), remove '\' from buffer, save decoded byte
1466    // only_save:  remove '\' from buffer, save decoded byte (no advance)
1467    // no_save:    nothing (just break from the escape case)
1468    enum EscapeResult {
1469        ReadSave(i32),
1470        OnlySave(i32),
1471        NoSave,
1472    }
1473
1474    save_and_next(ls, state)?;
1475
1476    while ls.current != del {
1477        match ls.current {
1478            c if c == EOZ => {
1479                return Err(lex_error(ls, b"unfinished string", TK_EOS));
1480            }
1481            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1482                return Err(lex_error(ls, b"unfinished string", TK_STRING));
1483            }
1484            c if c == b'\\' as i32 => {
1485                save_and_next(ls, state)?;
1486
1487                // Lua 5.1's lexer does NOT recognize `\x`, `\z`, or `\u`, and it
1488                // does NOT raise on an unknown escape. For any escape char outside
1489                // the known set, the 5.1 lexer silently drops the backslash and
1490                // keeps the next character verbatim (`"\x41"` → bytes `x41`,
1491                // `"\z"` → `z`, `"\q"` → `q`). Decimal escapes (`\ddd`) and the
1492                // standard letter/quote/newline escapes still work. Verified
1493                // against lua5.1.5; see specs/followup/5.1-roster-syntax.md §2.
1494                let is_v51 = matches!(
1495                    state.global().lua_version,
1496                    lua_types::LuaVersion::V51
1497                );
1498
1499                // Inner switch on the escape character
1500                let esc = match ls.current {
1501                    c if c == b'a' as i32 => EscapeResult::ReadSave(b'\x07' as i32),
1502                    c if c == b'b' as i32 => EscapeResult::ReadSave(b'\x08' as i32),
1503                    c if c == b'f' as i32 => EscapeResult::ReadSave(b'\x0C' as i32),
1504                    c if c == b'n' as i32 => EscapeResult::ReadSave(b'\n' as i32),
1505                    c if c == b'r' as i32 => EscapeResult::ReadSave(b'\r' as i32),
1506                    c if c == b't' as i32 => EscapeResult::ReadSave(b'\t' as i32),
1507                    c if c == b'v' as i32 => EscapeResult::ReadSave(b'\x0B' as i32),
1508                    c if c == b'x' as i32 && !is_v51 => {
1509                        let decoded = read_hex_esc(state, ls)?;
1510                        EscapeResult::ReadSave(decoded as i32)
1511                    }
1512                    c if c == b'u' as i32 && !is_v51 => {
1513                        utf8_esc(state, ls)?;
1514                        EscapeResult::NoSave
1515                    }
1516                    c if c == b'\n' as i32 || c == b'\r' as i32 => {
1517                        inc_line_number(ls, state)?;
1518                        EscapeResult::OnlySave(b'\n' as i32)
1519                    }
1520                    c if c == b'\\' as i32 || c == b'"' as i32 || c == b'\'' as i32 => {
1521                        EscapeResult::ReadSave(c)
1522                    }
1523                    c if c == EOZ => EscapeResult::NoSave,
1524                    c if c == b'z' as i32 && !is_v51 => {
1525                        ls.buff.truncate_by(1);
1526                        advance(ls);
1527                        while is_space(ls.current) {
1528                            if curr_is_newline(ls) {
1529                                inc_line_number(ls, state)?;
1530                            } else {
1531                                advance(ls);
1532                            }
1533                        }
1534                        EscapeResult::NoSave
1535                    }
1536                    c if is_v51 && !is_digit(c) => {
1537                        // 5.1 unknown escape: drop the backslash, emit the char.
1538                        EscapeResult::ReadSave(c)
1539                    }
1540                    _ => {
1541                        esc_check(
1542                            state, ls,
1543                            is_digit(ls.current),
1544                            b"invalid escape sequence",
1545                        )?;
1546                        let decoded = read_dec_esc(state, ls)?;
1547                        EscapeResult::OnlySave(decoded as i32)
1548                    }
1549                };
1550
1551                // Dispatch the C goto targets as match arms.
1552                match esc {
1553                    EscapeResult::ReadSave(c) => {
1554                        advance(ls);
1555                        ls.buff.truncate_by(1);
1556                        save(ls, state, c)?;
1557                    }
1558                    EscapeResult::OnlySave(c) => {
1559                        ls.buff.truncate_by(1);
1560                        save(ls, state, c)?;
1561                    }
1562                    EscapeResult::NoSave => {}
1563                }
1564            }
1565            _ => {
1566                save_and_next(ls, state)?;
1567            }
1568        }
1569    }
1570
1571    save_and_next(ls, state)?;
1572
1573    //                                     luaZ_bufflen(ls->buff) - 2);
1574    // Buffer contains: delimiter + content + delimiter; strip both delimiters.
1575    // PORT NOTE: capture into owned Vec to drop the borrow before new_string.
1576    let buf = ls.buff.as_slice();
1577    let content: Vec<u8> = if buf.len() >= 2 {
1578        buf[1..buf.len() - 1].to_vec()
1579    } else {
1580        Vec::new()
1581    };
1582    let ts = new_string(state, ls, &content)?;
1583    *seminfo = TokenValue::Str(ts);
1584    Ok(())
1585}
1586
/// Whether the active version is the float-only legacy family (5.1/5.2), which
/// lacks the 5.3 integer operators (`//`, `<<`, `>>`, and the bitwise binops).
///
/// Returns `true` exactly when `state.global().lua_version` is `V51` or `V52`.
fn is_float_only(state: &LuaState) -> bool {
    matches!(
        state.global().lua_version,
        lua_types::LuaVersion::V51 | lua_types::LuaVersion::V52
    )
}

/// Core lexer dispatch: consume and return the next raw token kind.
///
/// This is the heart of the lexer: a large `for`-`switch` loop that classifies
/// the current character and dispatches to the appropriate scanner.
///
/// # C source (see llex.c lines 445-562 for full listing)
1602fn llex(
1603    state: &mut LuaState,
1604    ls: &mut LexState,
1605    seminfo: &mut TokenValue,
1606) -> Result<i32, LuaError> {
1607    // macros.tsv: luaZ_resetbuffer → buf.clear()
1608    ls.buff.clear();
1609
1610    loop {
1611        match ls.current {
1612            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1613                inc_line_number(ls, state)?;
1614                // PORT NOTE: skipcomment-equivalent. luaL_loadfile in C-Lua
1615                // strips a leading '#' line (Unix shebang). Our test harness
1616                // prepends a global-setup preamble to every official test, so
1617                // the script's '#' line is not at byte zero. Apply the same
1618                // rule at any token-scan line start: treat a line whose first
1619                // character is '#' as a single-line comment. This sits in
1620                // llex's dispatch loop (not inc_line_number) so it does not
1621                // affect newlines inside long-bracket strings.
1622                if ls.current == b'#' as i32 {
1623                    while !curr_is_newline(ls) && ls.current != EOZ {
1624                        advance(ls);
1625                    }
1626                }
1627            }
1628
1629            c if c == b' ' as i32
1630                || c == b'\x0C' as i32
1631                || c == b'\t' as i32
1632                || c == b'\x0B' as i32 =>
1633            {
1634                advance(ls);
1635            }
1636
1637            c if c == b'-' as i32 => {
1638                advance(ls);
1639                if ls.current != b'-' as i32 {
1640                    return Ok(b'-' as i32);
1641                }
1642                advance(ls);
1643
1644                if ls.current == b'[' as i32 {
1645                    let sep = skip_sep(state, ls)?;
1646                    ls.buff.clear();
1647                    if sep >= 2 {
1648                        read_long_string(state, ls, None, sep)?;
1649                        ls.buff.clear();
1650                        continue;
1651                    }
1652                }
1653                while !curr_is_newline(ls) && ls.current != EOZ {
1654                    advance(ls);
1655                }
1656                // loop continues (no token emitted for comments)
1657            }
1658
1659            c if c == b'[' as i32 => {
1660                let sep = skip_sep(state, ls)?;
1661                if sep >= 2 {
1662                    read_long_string(state, ls, Some(seminfo), sep)?;
1663                    return Ok(TK_STRING);
1664                } else if sep == 0 {
1665                    return Err(lex_error(ls, b"invalid long string delimiter", TK_STRING));
1666                }
1667                // sep == 1: plain '[', no long string
1668                return Ok(b'[' as i32);
1669            }
1670
1671            c if c == b'=' as i32 => {
1672                advance(ls);
1673                if check_next1(ls, b'=' as i32) {
1674                    return Ok(TK_EQ);
1675                }
1676                return Ok(b'=' as i32);
1677            }
1678
1679            c if c == b'<' as i32 => {
1680                advance(ls);
1681                if check_next1(ls, b'=' as i32) {
1682                    return Ok(TK_LE);
1683                } else if !is_float_only(state) && check_next1(ls, b'<' as i32) {
1684                    // The `<<` shift operator is a Lua 5.3 addition. Under the
1685                    // float-only legacy family (5.1/5.2) it does not exist: a
1686                    // bare `<` is returned, so a second `<` then surfaces
1687                    // upstream's "unexpected symbol near '<'".
1688                    return Ok(TK_SHL);
1689                }
1690                return Ok(b'<' as i32);
1691            }
1692
1693            c if c == b'>' as i32 => {
1694                advance(ls);
1695                if check_next1(ls, b'=' as i32) {
1696                    return Ok(TK_GE);
1697                } else if !is_float_only(state) && check_next1(ls, b'>' as i32) {
1698                    // `>>` is a 5.3 addition; absent in 5.1/5.2.
1699                    return Ok(TK_SHR);
1700                }
1701                return Ok(b'>' as i32);
1702            }
1703
1704            c if c == b'/' as i32 => {
1705                advance(ls);
1706                if !is_float_only(state) && check_next1(ls, b'/' as i32) {
1707                    // Floor division `//` is a 5.3 addition; absent in 5.1/5.2,
1708                    // where the second `/` becomes "unexpected symbol near '/'".
1709                    return Ok(TK_IDIV);
1710                }
1711                return Ok(b'/' as i32);
1712            }
1713
1714            c if c == b'~' as i32 => {
1715                advance(ls);
1716                if check_next1(ls, b'=' as i32) {
1717                    return Ok(TK_NE);
1718                }
1719                return Ok(b'~' as i32);
1720            }
1721
1722            c if c == b':' as i32 => {
1723                advance(ls);
1724                // Lua 5.1 has no `::label::` token; `::` was added with `goto` in
1725                // 5.2. Under V51 the second `:` is left for the parser, which
1726                // reports `unexpected symbol near ':'`. See
1727                // specs/followup/5.1-roster-syntax.md §2.
1728                let is_v51 = matches!(
1729                    state.global().lua_version,
1730                    lua_types::LuaVersion::V51
1731                );
1732                if !is_v51 && check_next1(ls, b':' as i32) {
1733                    return Ok(TK_DBCOLON);
1734                }
1735                return Ok(b':' as i32);
1736            }
1737
1738            c if c == b'"' as i32 || c == b'\'' as i32 => {
1739                let del = ls.current;
1740                read_string(state, ls, del, seminfo)?;
1741                return Ok(TK_STRING);
1742            }
1743
1744            c if c == b'.' as i32 => {
1745                save_and_next(ls, state)?;
1746                if check_next1(ls, b'.' as i32) {
1747                    if check_next1(ls, b'.' as i32) {
1748                        return Ok(TK_DOTS);
1749                    }
1750                    return Ok(TK_CONCAT);
1751                } else if !is_digit(ls.current) {
1752                    return Ok(b'.' as i32);
1753                } else {
1754                    return read_numeral(state, ls, seminfo);
1755                }
1756            }
1757
1758            c if is_digit(c) => {
1759                return read_numeral(state, ls, seminfo);
1760            }
1761
1762            c if c == EOZ => {
1763                return Ok(TK_EOS);
1764            }
1765
1766            c => {
1767                if is_lalpha(c) {
1768                    loop {
1769                        save_and_next(ls, state)?;
1770                        if !is_lalnum(ls.current) {
1771                            break;
1772                        }
1773                    }
1774
1775                    // PORT NOTE: copy buffer bytes to drop borrow before new_string.
1776                    let content: Vec<u8> = ls.buff.as_slice().to_vec();
1777                    let ts = new_string(state, ls, &content)?;
1778
1779                    // PORT NOTE: canonical `lua_types::LuaString` lacks the `extra`
1780                    // byte that C-Lua uses to mark reserved words. Recover the
1781                    // keyword index directly from the interned bytes via the
1782                    // `LUAX_TOKENS` table; the first `NUM_RESERVED` entries are
1783                    // the keywords in declaration order, so token id =
1784                    // `FIRST_RESERVED + index`.
1785                    let reserved_token: Option<i32> = LUAX_TOKENS[..NUM_RESERVED]
1786                        .iter()
1787                        .position(|kw| *kw == content.as_slice())
1788                        .map(|i| FIRST_RESERVED + i as i32);
1789                    *seminfo = TokenValue::Str(ts);
1790
1791                    if let Some(tk) = reserved_token {
1792                        // Lua 5.1 has no `goto` keyword — `goto` is an ordinary
1793                        // identifier (`local goto = 5` is valid). The keyword and
1794                        // the `::label::` grammar were added in 5.2. So under V51
1795                        // `goto` lexes as a plain name; the parser then treats
1796                        // `goto done` as a name beginning an assignment, yielding
1797                        // the incidental `'=' expected near 'done'` the oracle
1798                        // reports. See specs/followup/5.1-roster-syntax.md §2.
1799                        if tk == TK_GOTO
1800                            && matches!(
1801                                state.global().lua_version,
1802                                lua_types::LuaVersion::V51
1803                            )
1804                        {
1805                            return Ok(TK_NAME);
1806                        }
1807                        return Ok(tk);
1808                    }
1809
1810                    // Lua 5.5: with the upstream-default `LUA_COMPAT_GLOBAL`, the
1811                    // `global` declaration word is NOT reserved — `global` stays a
1812                    // valid identifier, and the parser recognizes the declaration
1813                    // statement contextually (see `globalstat` in lua-parse). So
1814                    // `global` always lexes as a plain name, on every version.
1815                    return Ok(TK_NAME);
1816                } else {
1817                    let tok = ls.current;
1818                    advance(ls);
1819                    return Ok(tok);
1820                }
1821            }
1822        }
1823    }
1824}
1825
1826// ── Phase A stubs for cross-crate helpers ──────────────────────────────────────
1827//
1828// The functions below stand in for cross-crate calls that cannot resolve in
1829// Phase A.  They will be replaced by proper imports in Phase B.
1830
1831// TODO(port): replace with state.intern_str(bytes) once LuaState gains that
1832// method (from lua_vm::string::new_lstr wired in Phase B).
1833// TODO_ARCH(phase-b-reconcile): canonical LuaString is constructed via
1834// from_bytes; once LuaState::intern_str is wired, route through there instead.
1835fn intern_str_stub(
1836    state: &mut LuaState,
1837    bytes: &[u8],
1838) -> Result<GcRef<LuaString>, LuaError> {
1839    state.intern_str(bytes)
1840}
1841
// TODO(port): replace with lua_vm::object::hex_value(c) in Phase B.
/// Converts a character code to the numeric value of the hexadecimal digit it
/// represents.
///
/// `c` is a lexer character code held in an `i32`.  ASCII `0`-`9` map to
/// 0-9, and `a`-`f` / `A`-`F` map to 10-15.  Any other value — including
/// negative codes and codes above the ASCII range — yields 0.
fn hex_value_stub(c: i32) -> u32 {
    // Only ASCII codes can be hex digits; rejecting everything else first
    // keeps the narrowing cast below lossless.
    if !(0..=0x7f).contains(&c) {
        return 0;
    }
    char::from(c as u8).to_digit(16).unwrap_or(0)
}
1851
// TODO(port): replace with lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
/// Produces the Lua-extended UTF-8 encoding (1 to 6 bytes) of `codepoint`.
///
/// Port of `luaO_utf8esc` from lobject.c.  Lua accepts codepoints up to
/// `0x7FFFFFFF`, so 5- and 6-byte sequences — not valid in strict UTF-8 — are
/// emitted for large values, as exercised by `\u{...}` escapes in the
/// literals.lua test cases.
///
/// Values below `0x80` are returned as a single byte.  Larger values are
/// written from the last byte towards the first: each continuation byte takes
/// the low six bits, and the lead byte carries the length marker plus the
/// remaining high bits.
fn utf8_encode_stub(codepoint: u32) -> Vec<u8> {
    debug_assert!(codepoint <= 0x7FFF_FFFF);
    if codepoint < 0x80 {
        return vec![codepoint as u8];
    }

    // Scratch space filled back-to-front, mirroring the C implementation.
    let mut scratch = [0u8; 8];
    let mut start = scratch.len();
    let mut remaining = codepoint;
    // Largest payload that still fits in the lead byte; it loses one bit for
    // every continuation byte emitted.
    let mut lead_capacity: u32 = 0x3f;

    // `codepoint >= 0x80` guarantees at least one continuation byte.
    while remaining > lead_capacity {
        start -= 1;
        scratch[start] = 0x80 | ((remaining & 0x3f) as u8);
        remaining >>= 6;
        lead_capacity >>= 1;
    }

    // Lead byte: high marker bits derived from the capacity mask, low bits
    // from what is left of the codepoint.
    start -= 1;
    scratch[start] = ((!lead_capacity << 1) | remaining) as u8;

    scratch[start..].to_vec()
}
1878
1879// ──────────────────────────────────────────────────────────────────────────────
1880// PORT STATUS
1881//   source:        src/llex.c  (581 lines, 24 functions)
1882//                  src/llex.h  (91 lines; merged)
1883//   target_crate:  lua-lex
1884//   confidence:    medium
1885//   todos:         18
1886//   port_notes:    12
1887//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
1888//   notes:         Logic is faithful to the C.  The main structural differences:
1889//                  (1) LexState.L removed — state threaded via fn params;
1890//                  (2) save/save_and_next/inclinenumber/helpers are all fallible
1891//                  (Result<_, LuaError>) because lexerror is no longer noreturn;
1892//                  (3) goto read_save/only_save/no_save in read_string replaced
1893//                  by EscapeResult enum; (4) Cross-crate calls (intern_str,
1894//                  luaH_getstr/finishset, luaG_addinfo, luaO_str2num,
1895//                  luaO_hexavalue, luaO_utf8esc, luaC_fix, luaC_checkGC) are
1896//                  stubbed with TODO; (5) LuaError, LuaString, ZIO, LexBuffer,
1897//                  LuaState defined as local stubs — Phase B replaces with real
1898//                  imports once the crate graph is wired.  Key Phase B tasks:
1899//                  wire import paths; move LuaString.extra accessor to pub;
1900//                  implement luaX_newstring anchor-table logic.  Numeric
1901//                  literal parsing now delegates to lua_vm::object::str2num
1902//                  (handles hex integers with wrap-around and hex floats).
1903// ──────────────────────────────────────────────────────────────────────────────