Skip to main content

lua_lex/
lib.rs

1//! Lexical analyzer — port of `llex.c` + `llex.h`.
2//!
3//! Provides the Lua 5.4 lexer: character-by-character scanning of a [`ZIO`]
4//! input stream into [`Token`] values, with one-token lookahead.  The
5//! `llex.h` header is merged here per PORTING.md §1.
6//!
7//! # C source files
8//! - `reference/lua-5.4.7/src/llex.c`  (581 lines, 24 functions)
9//! - `reference/lua-5.4.7/src/llex.h`  (91 lines; merged here)
10//!
11//! # Design notes
12//! - `LexState.L` (back-pointer to `lua_State`) is removed.  All functions
13//!   that need `LuaState` receive it as `state: &mut LuaState`.
14//! - `Token.token` is `i32` in Phase A (matching the C `int token` field).
15//!   Single-byte tokens are their ASCII values; reserved-word tokens start at
16//!   `FIRST_RESERVED` (257).  A proper `TokenKind` enum is deferred to Phase B.
17//! - `save` / `save_and_next` are now fallible (`Result<(), LuaError>`); the
18//!   `?` operator replaces the C noreturn `lexerror` call on buffer overflow.
19//! - The `goto read_save / only_save / no_save` pattern in `read_string` is
20//!   translated via the local `EscapeResult` enum.
21
22// TODO(port): resolve remaining cross-crate calls (intern_str, table anchor,
23// number parsing, utf8 encoding) in Phase B.  Canonical cross-crate type
24// imports are now in place per harness/type-vocabulary.tsv (see below).
25
26use std::io::Write as IoWrite;
27
28// PORT NOTE: GcRef<T> = Rc<T> in Phases A–C; replaced by real GC pointer in Phase D.
29use lua_types::gc::GcRef;
30
31// Canonical cross-crate types: imported from owner crates per
32// harness/type-vocabulary.tsv.  See PORTING.md §7.
33pub use lua_types::LuaError;
34pub use lua_types::LuaString;
35pub use lua_vm::state::LuaState;
36pub use lua_vm::table::LuaTable;
37
38/// Placeholder for `LexBuffer` from `lua_vm::zio`.
39/// TODO(port): replace with `use lua_vm::zio::LexBuffer` in Phase B.
40/// types.tsv: Mbuffer → LexBuffer
41pub struct LexBuffer {
42    buffer: Vec<u8>,
43}
44
45impl LexBuffer {
46    pub fn new() -> Self {
47        LexBuffer { buffer: Vec::new() }
48    }
49
50    /// macros.tsv: luaZ_bufflen → buf.len()
51    pub fn len(&self) -> usize {
52        self.buffer.len()
53    }
54
55    /// macros.tsv: luaZ_sizebuffer → buf.capacity()
56    pub fn capacity(&self) -> usize {
57        self.buffer.capacity()
58    }
59
60    /// macros.tsv: luaZ_buffer → buf.as_mut_slice()
61    pub fn as_slice(&self) -> &[u8] {
62        &self.buffer
63    }
64
65    /// macros.tsv: luaZ_resetbuffer → buf.clear()
66    pub fn clear(&mut self) {
67        self.buffer.clear();
68    }
69
70    /// macros.tsv: luaZ_buffremove → buf.truncate_by(i)
71    pub fn truncate_by(&mut self, i: usize) {
72        let new_len = self.buffer.len().saturating_sub(i);
73        self.buffer.truncate(new_len);
74    }
75
76    /// allocated capacity. In C this changes `buffsize`, not the live byte
77    /// count `n`. The Rust analogue therefore manipulates `Vec::capacity`,
78    /// never `Vec::len` (otherwise `push_byte` would write past the live
79    /// content and leave embedded zero padding inside the token text).
80    pub fn resize(&mut self, _state: &mut LuaState, size: usize) -> Result<(), LuaError> {
81        if size < self.buffer.len() {
82            self.buffer.truncate(size);
83        }
84        if size > self.buffer.capacity() {
85            let extra = size - self.buffer.capacity();
86            self.buffer.reserve_exact(extra);
87        }
88        Ok(())
89    }
90
91    /// Append one byte to the live contents.  Panics if capacity exceeded
92    /// (callers must pre-check via `save`).
93    fn push_byte(&mut self, c: u8) {
94        self.buffer.push(c);
95    }
96}
97
98impl Default for LexBuffer {
99    fn default() -> Self {
100        Self::new()
101    }
102}
103
104/// Placeholder for `ZIO` from `lua_vm::zio`.
105/// TODO(port): replace with `use lua_vm::zio::ZIO` in Phase B.
106/// types.tsv: Zio → ZIO
107pub struct ZIO {
108    // TODO(port): full ZIO implementation lives in lua_vm::zio; this is a stub.
109    reader: Box<dyn FnMut() -> Option<Vec<u8>>>,
110    n: usize,
111    p: usize,
112    current_chunk: Vec<u8>,
113}
114
115impl ZIO {
116    /// Construct a ZIO from a reader callback that yields successive chunks.
117    pub fn new(reader: Box<dyn FnMut() -> Option<Vec<u8>>>) -> Self {
118        ZIO {
119            reader,
120            n: 0,
121            p: 0,
122            current_chunk: Vec::new(),
123        }
124    }
125
126    /// Construct a ZIO that yields the supplied bytes once and then EOZ.
127    pub fn from_bytes(bytes: Vec<u8>) -> Self {
128        let mut once = Some(bytes);
129        ZIO::new(Box::new(move || once.take()))
130    }
131
132    /// macros.tsv: zgetc → z.getc()
133    pub fn getc(&mut self) -> i32 {
134        if self.n > 0 {
135            self.n -= 1;
136            let b = self.current_chunk[self.p] as u8;
137            self.p += 1;
138            b as i32
139        } else {
140            self.fill()
141        }
142    }
143
144    fn fill(&mut self) -> i32 {
145        match (self.reader)() {
146            None => EOZ,
147            Some(chunk) if chunk.is_empty() => EOZ,
148            Some(chunk) => {
149                self.n = chunk.len() - 1;
150                self.current_chunk = chunk;
151                self.p = 0;
152                let b = self.current_chunk[self.p] as u8;
153                self.p += 1;
154                b as i32
155            }
156        }
157    }
158}
159
160// ── Constants ─────────────────────────────────────────────────────────────────
161
// macros.tsv: FIRST_RESERVED → const FIRST_RESERVED: i32 = 257
/// First token kind value that is not a single-byte character.
/// Single-byte tokens are represented by their ASCII value (0-255).
pub const FIRST_RESERVED: i32 = 257;

// macros.tsv: LUA_ENV → const LUA_ENV: &[u8] = b"_ENV"
/// Name of the global environment upvalue.
pub const LUA_ENV: &[u8] = b"_ENV";

// macros.tsv: NUM_RESERVED → const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize
/// Number of reserved words (keywords): the token ids from `FIRST_RESERVED`
/// through `TK_WHILE`, inclusive.
pub const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize;

// macros.tsv: EOZ → const EOZ: i32 = -1
/// End-of-stream sentinel returned by ZIO::getc.
pub const EOZ: i32 = -1;

// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = ...
/// Upper bound used when growing the token buffer: `usize::MAX` when `usize`
/// is narrower than `i64`, otherwise `i64::MAX`.  `save` refuses to grow the
/// buffer once its capacity has reached half of this value.
const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
    usize::MAX
} else {
    i64::MAX as usize
};

// macros.tsv: LUA_MIN_BUFFER → const LUA_MIN_BUFFER: usize = 32
/// Token-buffer capacity requested by `set_input` when a new chunk is set up.
const LUA_MIN_BUFFER: usize = 32;
188
189// ── Token kind constants (ORDER RESERVED — matches C enum RESERVED) ───────────
190//
191// In C these are enum values.  In Rust we use i32 constants for Phase A
192// (faithful to `Token.token: int` in C) with a TODO for a proper enum in Phase B.
193//
194
/// `and`  (first keyword; equal to `FIRST_RESERVED`)
pub const TK_AND: i32 = 257;
/// `break`
pub const TK_BREAK: i32 = 258;
/// `do`
pub const TK_DO: i32 = 259;
/// `else`
pub const TK_ELSE: i32 = 260;
/// `elseif`
pub const TK_ELSEIF: i32 = 261;
/// `end`
pub const TK_END: i32 = 262;
/// `false`
pub const TK_FALSE: i32 = 263;
/// `for`
pub const TK_FOR: i32 = 264;
/// `function`
pub const TK_FUNCTION: i32 = 265;
/// `goto`
pub const TK_GOTO: i32 = 266;
/// `if`
pub const TK_IF: i32 = 267;
/// `in`
pub const TK_IN: i32 = 268;
/// `local`
pub const TK_LOCAL: i32 = 269;
/// `nil`
pub const TK_NIL: i32 = 270;
/// `not`
pub const TK_NOT: i32 = 271;
/// `or`
pub const TK_OR: i32 = 272;
/// `repeat`
pub const TK_REPEAT: i32 = 273;
/// `return`
pub const TK_RETURN: i32 = 274;
/// `then`
pub const TK_THEN: i32 = 275;
/// `true`
pub const TK_TRUE: i32 = 276;
/// `until`
pub const TK_UNTIL: i32 = 277;
/// `while`  (last keyword; NUM_RESERVED = TK_WHILE - FIRST_RESERVED + 1 = 22)
pub const TK_WHILE: i32 = 278;
/// `//`  (floor division)
pub const TK_IDIV: i32 = 279;
/// `..`  (concatenation)
pub const TK_CONCAT: i32 = 280;
/// `...` (vararg)
pub const TK_DOTS: i32 = 281;
/// `==`
pub const TK_EQ: i32 = 282;
/// `>=`
pub const TK_GE: i32 = 283;
/// `<=`
pub const TK_LE: i32 = 284;
/// `~=`
pub const TK_NE: i32 = 285;
/// `<<`
pub const TK_SHL: i32 = 286;
/// `>>`
pub const TK_SHR: i32 = 287;
/// `::`
pub const TK_DBCOLON: i32 = 288;
/// `<eof>`  (end of stream; also the "no lookahead pending" marker in
/// `LexState.lookahead`.  Ids from here upward are display labels rather
/// than source text, which is why `token2str_raw` treats them specially.)
pub const TK_EOS: i32 = 289;
/// `<number>`  (float literal)
pub const TK_FLT: i32 = 290;
/// `<integer>` (integer literal)
pub const TK_INT: i32 = 291;
/// `<name>`    (identifier)
pub const TK_NAME: i32 = 292;
/// `<string>`  (string literal; last entry of `LUAX_TOKENS`)
pub const TK_STRING: i32 = 293;
269
270// Lua 5.5 `global`: with the upstream-default LUA_COMPAT_GLOBAL it is NOT a
271// reserved word — it always lexes as TK_NAME (so it stays a valid identifier on
272// every version), and the parser recognizes the `global` declaration statement
273// contextually (see `globalstat`/`statement` in lua-parse). There is therefore
274// no dedicated token id.
275
276// ORDER RESERVED — index 0 = TK_AND - FIRST_RESERVED, etc.
/// Display strings for tokens, indexed by `token - FIRST_RESERVED`.
///
/// The first `NUM_RESERVED` entries are the reserved words; the remaining
/// entries are the multi-character symbols and the literal/end-of-stream
/// labels, in the same order as the `TK_*` constants.
pub static LUAX_TOKENS: &[&[u8]] = &[
    // keywords (indices 0-21)
    b"and",
    b"break",
    b"do",
    b"else",
    b"elseif",
    b"end",
    b"false",
    b"for",
    b"function",
    b"goto",
    b"if",
    b"in",
    b"local",
    b"nil",
    b"not",
    b"or",
    b"repeat",
    b"return",
    b"then",
    b"true",
    b"until",
    b"while",
    // other terminal symbols (indices 22-36; TK_STRING - FIRST_RESERVED = 36)
    b"//",
    b"..",
    b"...",
    b"==",
    b">=",
    b"<=",
    b"~=",
    b"<<",
    b">>",
    b"::",
    b"<eof>",
    b"<number>",
    b"<integer>",
    b"<name>",
    b"<string>",
];
319
320// ── SemInfo / TokenValue ───────────────────────────────────────────────────────
321
322// types.tsv: SemInfo → TokenValue
/// Semantic payload carried by a token.
///
/// Corresponds to `SemInfo` (a C union) in `llex.h`.  In Rust this is a
/// discriminated union (enum), so the active member is always known; the
/// extra `None` variant stands in for "no payload", which the C union cannot
/// express.
///
/// # C mapping
/// ```text
/// SemInfo.r   → TokenValue::Float(f64)      (lua_Number)
/// SemInfo.i   → TokenValue::Int(i64)        (lua_Integer)
/// SemInfo.ts  → TokenValue::Str(GcRef<LuaString>)
/// (no C field) → TokenValue::None           (default / unset)
/// ```
#[derive(Clone)]
pub enum TokenValue {
    /// No semantic value (default; used for single-byte and most multi-char tokens).
    None,
    /// Float literal payload.  C: `seminfo.r` (`lua_Number`).
    Float(f64),
    /// Integer literal payload.  C: `seminfo.i` (`lua_Integer`).
    Int(i64),
    /// String/name payload.  C: `seminfo.ts` (`TString *`).
    Str(GcRef<LuaString>),
}
346
347// ── Token ─────────────────────────────────────────────────────────────────────
348
349// types.tsv: Token → Token;  Token.token → i32 (Phase A; TODO: TokenKind enum Phase B)
350/// A single lexed token with its semantic payload.
351///
352/// `kind` is an `i32` whose value is either an ASCII byte code (for single-byte
353/// tokens like `+`, `-`, `[`) or one of the `TK_*` constants (for reserved
354/// words, multi-char symbols, and literals).
355///
356/// TODO(port): Phase B — replace `kind: i32` with a proper `TokenKind` enum
357/// covering both single-byte and named tokens (e.g. `TokenKind::Char(u8)` +
358/// named variants).
359#[derive(Clone)]
360pub struct Token {
361    pub kind: i32,
362    pub value: TokenValue,
363}
364
365impl Token {
366    /// Construct a token with no semantic value.
367    pub fn new(kind: i32) -> Self {
368        Token {
369            kind,
370            value: TokenValue::None,
371        }
372    }
373
374    /// The end-of-stream sentinel token.
375    pub fn eos() -> Self {
376        Token::new(TK_EOS)
377    }
378}
379
380// ── LexState ──────────────────────────────────────────────────────────────────
381
382// types.tsv: LexState → LexState;  LexState.L removed (thread via &mut LuaState)
/// Per-chunk lexer (and shared parser) state.
///
/// Corresponds to `LexState` in `llex.h`.  Owns the input stream, token
/// buffer, and current/lookahead tokens.
///
/// # C mapping (types.tsv)
/// The right-hand side shows the field as currently declared; several parser
/// fields are `Option<()>` placeholders until the Phase B types land.
/// ```text
/// LexState.current    → current: i32        (charint; -1 = EOZ)
/// LexState.linenumber → linenumber: i32
/// LexState.lastline   → lastline: i32
/// LexState.t          → t: Token            (current token)
/// LexState.lookahead  → lookahead: Token    (one-token lookahead)
/// LexState.fs         → fs: Option<()>      (placeholder; planned Box<FuncState>)
/// LexState.L          → (removed; callers pass &mut LuaState)
/// LexState.z          → z: ZIO              (owned input stream)
/// LexState.buff       → buff: LexBuffer     (owned token-text buffer)
/// LexState.h          → h: Option<GcRef<LuaTable>>  (string-anchor table)
/// LexState.dyd        → dyd: Option<()>     (placeholder; planned DynData)
/// LexState.source     → source: GcRef<LuaString>
/// LexState.envn       → envn: GcRef<LuaString>
/// (no C field)        → long_str_anchor, version  (Rust-only; see field docs)
/// ```
pub struct LexState {
    /// Current input character, or `EOZ` (-1) at end of stream.
    pub current: i32,
    /// Line number reported in error messages (`lex_error`); reset to 1 by
    /// `set_input`.
    pub linenumber: i32,
    /// Value of `linenumber` captured at the start of the latest `next` call.
    pub lastline: i32,
    /// Current token.
    pub t: Token,
    /// Lookahead token; a kind of `TK_EOS` means no lookahead is pending.
    pub lookahead: Token,
    // TODO(port): Box<FuncState> once FuncState lands in lua-parse (Phase B)
    pub fs: Option<()>,
    // PORT NOTE: C held a pointer; Rust owns the ZIO directly per types.tsv.
    pub z: ZIO,
    // PORT NOTE: C held a pointer; Rust owns the LexBuffer directly per types.tsv.
    pub buff: LexBuffer,
    // TODO(port): GcRef<LuaTable> once LuaTable is defined in Phase B
    pub h: Option<GcRef<LuaTable>>,
    /// Per-parse-session anchor for long strings. C-Lua's `ls->h` is a Lua
    /// table that deduplicates all literal strings within a chunk (both short
    /// and long), so e.g. `local s1 <const>="..."` and `local s2 <const>="..."`
    /// with identical 50-byte payloads share one `TString` object — which is
    /// what makes `string.format("%p", s1) == string.format("%p", s2)` hold.
    /// Short strings already share identity via the global `interned_lt` pool,
    /// but long strings (>LUAI_MAXSHORTLEN = 40) are not globally interned and
    /// need this session-level map. Keyed by the string bytes; populated lazily
    /// by `new_string`.
    pub long_str_anchor: std::collections::HashMap<Vec<u8>, GcRef<LuaString>>,
    // TODO(port): DynData once parser types land in Phase B
    pub dyd: Option<()>,
    /// Chunk name; `lex_error` renders it through `chunk_id` as the message
    /// prefix.
    pub source: GcRef<LuaString>,
    /// Interned `LUA_ENV` (`_ENV`) name, assigned by `set_input`.
    pub envn: GcRef<LuaString>,
    /// The active Lua version, snapshotted at lexer setup from
    /// `state.global().lua_version` (fixed for the lifetime of a parse). The
    /// error formatters (`lex_error`/`token2str`) take only `&LexState`, so they
    /// read the version here rather than threading a `&LuaState` through every
    /// syntax-error callsite. Lua 5.1 quotes the special multi-char token labels
    /// (`<eof>`, `<name>`, …) in error messages where 5.2+ leaves them bare.
    pub version: lua_types::LuaVersion,
}
440
441// ── Character-classification helpers ─────────────────────────────────────────
442//
443// These are simplified ASCII implementations for Phase A.
444// TODO(port): import from lua_vm::ctype in Phase B; the full table handles
445// the LUA_UCID (Unicode identifiers) flag and matches the C bit-table exactly.
446//
447// PORT NOTE: the C macros take `int` (not `char`) so they handle EOZ (-1) safely.
448// These Rust fns match that contract: EOZ returns false for all predicates.
449
/// True when `c` is an ASCII decimal digit (`'0'..='9'`); false for every
/// other value, including `EOZ`.
#[inline]
fn is_digit(c: i32) -> bool {
    (i32::from(b'0')..=i32::from(b'9')).contains(&c)
}
454
/// True when `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`); false
/// for every other value, including `EOZ`.
#[inline]
fn is_xdigit(c: i32) -> bool {
    // Only byte-range values can be hex digits; the range test also makes the
    // narrowing cast lossless.
    (0..=0xFF).contains(&c) && (c as u8).is_ascii_hexdigit()
}
461
// ALPHABIT: ASCII letters + '_'
/// True when `c` can start an identifier: an ASCII letter or underscore.
/// False for every other value, including `EOZ`.
#[inline]
fn is_lalpha(c: i32) -> bool {
    if c == i32::from(b'_') {
        return true;
    }
    // The ASCII range test keeps the narrowing cast lossless.
    (0..=0x7F).contains(&c) && (c as u8).is_ascii_alphabetic()
}
469
470#[inline]
471fn is_lalnum(c: i32) -> bool {
472    is_lalpha(c) || is_digit(c)
473}
474
/// True when `c` is ASCII whitespace: space, or one of `\t \n \v \f \r`
/// (codes 9 through 13).  False for every other value, including `EOZ`.
#[inline]
fn is_space(c: i32) -> bool {
    c == i32::from(b' ') || (0x09..=0x0D).contains(&c)
}
479
// PRINTBIT: printable ASCII (graph + space), i.e. 0x20-0x7E
/// True when `c` is a printable ASCII character (space through `~`); false
/// for every other value, including `EOZ`.
#[inline]
fn is_print(c: i32) -> bool {
    (0x20..=0x7E).contains(&c)
}
485
486#[inline]
487fn curr_is_newline(ls: &LexState) -> bool {
488    ls.current == b'\n' as i32 || ls.current == b'\r' as i32
489}
490
491// ── Low-level stream helpers ───────────────────────────────────────────────────
492
493/// Advance the lexer by one character.
494///
495/// Corresponds to the `next(ls)` macro.  Named `advance` to avoid collision
496/// with Rust's iterator method.
497#[inline]
498fn advance(ls: &mut LexState) {
499    // macros.tsv: zgetc → z.getc()
500    ls.current = ls.z.getc();
501}
502
503/// Append character `c` to the token buffer, growing it if necessary.
504///
505/// On overflow calls [`lex_error`] which becomes `Err(LuaError::Syntax(...))`.
506///
507/// # C source
508/// ```c
509///
510/// //   Mbuffer *b = ls->buff;
511/// //   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
512/// //     size_t newsize;
513/// //     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
514/// //       lexerror(ls, "lexical element too long", 0);
515/// //     newsize = luaZ_sizebuffer(b) * 2;
516/// //     luaZ_resizebuffer(ls->L, b, newsize);
517/// //   }
518/// //   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
519/// // }
520/// ```
521fn save(ls: &mut LexState, state: &mut LuaState, c: i32) -> Result<(), LuaError> {
522    // macros.tsv: luaZ_bufflen → buf.len(); luaZ_sizebuffer → buf.capacity()
523    if ls.buff.len() + 1 > ls.buff.capacity() {
524        if ls.buff.capacity() >= MAX_SIZE / 2 {
525            return Err(lex_error(ls, b"lexical element too long", 0));
526        }
527        //    luaZ_resizebuffer(ls->L, b, newsize);
528        // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
529        let newsize = ls.buff.capacity() * 2;
530        ls.buff.resize(state, newsize)?;
531    }
532    // macros.tsv: cast_char → x as i8  (C char is signed; Lua bytes stored as-is)
533    // PORT NOTE: we store the byte value directly; the i8 cast in C is for the
534    // C char type but the data is read back as unsigned via cast_uchar everywhere.
535    ls.buff.push_byte(c as u8);
536    Ok(())
537}
538
539/// Save the current character into the token buffer, then advance the stream.
540///
541/// Corresponds to the `save_and_next(ls)` macro.  Fallible because `save`
542/// may need to grow the buffer.
543#[inline]
544fn save_and_next(ls: &mut LexState, state: &mut LuaState) -> Result<(), LuaError> {
545    let c = ls.current;
546    save(ls, state, c)?;
547    advance(ls);
548    Ok(())
549}
550
551// ── Error helpers ─────────────────────────────────────────────────────────────
552
553// l_noret → -> !  but in Rust we return LuaError (callers wrap in Err(...))
554// error_sites.tsv: luaX_lexerror → return Err(LuaError::syntax_at(ls, "msg", token))
555/// Build a syntax error, optionally annotated with the offending token text.
556///
557/// Corresponds to the static `lexerror` function in `llex.c`.  In C this is
558/// `l_noret` (diverges via `luaD_throw`); in Rust it returns a `LuaError`
559/// value that callers wrap in `Err(...)`.
560///
561/// # C source
562/// ```c
563///
564/// //   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
565/// //   if (token)
566/// //     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
567/// //   luaD_throw(ls->L, LUA_ERRSYNTAX);
568/// // }
569/// ```
570pub fn lex_error(ls: &mut LexState, msg: &[u8], token: i32) -> LuaError {
571    const LUA_IDSIZE: usize = 60;
572    let mut buff = [0u8; LUA_IDSIZE];
573    let n = lua_vm::object::chunk_id(&mut buff[..], ls.source.as_bytes());
574    let src_part = &buff[..n];
575
576    let mut full_msg: Vec<u8> = Vec::new();
577    full_msg.extend_from_slice(src_part);
578    let _ = write!(full_msg, ":{}: ", ls.linenumber);
579    full_msg.extend_from_slice(msg);
580
581    if token != 0 {
582        let tok_text = txt_token(ls, token);
583        full_msg.extend_from_slice(b" near ");
584        full_msg.extend_from_slice(&tok_text);
585    }
586
587    LuaError::syntax_raw(&full_msg)
588}
589
590// LUAI_FUNC → pub(crate)
591// error_sites.tsv: luaX_syntaxerror → return Err(LuaError::syntax(format_args!("msg")))
592/// Report a syntax error at the current token.
593///
594/// # C source
595/// ```c
596///
597/// //   lexerror(ls, msg, ls->t.token);
598/// // }
599/// ```
600pub fn syntax_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
601    let token = ls.t.kind;
602    lex_error(ls, msg, token)
603}
604
605/// Report a semantic error at the current line WITHOUT the `near <token>`
606/// suffix.
607///
608/// Mirrors upstream `luaK_semerror` (`lcode.c`), which sets
609/// `ls->t.token = 0` before calling `luaX_syntaxerror` so the `near` clause is
610/// suppressed. Used for attribute errors (`unknown attribute '<name>'`,
611/// `global variables cannot be to-be-closed`) where the offending construct is
612/// the attribute itself, not the current lookahead token.
613pub fn sem_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
614    lex_error(ls, msg, 0)
615}
616
617/// Produce a human-readable representation of `token` for error messages.
618///
619/// For `TK_NAME`, `TK_STRING`, `TK_FLT`, `TK_INT`: formats the current
620/// token buffer contents as `'<text>'`.  For everything else, delegates to
621/// [`token2str`].
622///
623/// # C source
624/// ```c
625///
626/// //   switch (token) {
627/// //     case TK_NAME: case TK_STRING:
628/// //     case TK_FLT: case TK_INT:
629/// //       save(ls, '\0');
630/// //       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
631/// //     default:
632/// //       return luaX_token2str(ls, token);
633/// //   }
634/// // }
635/// ```
636///
637/// PORT NOTE: C calls `luaO_pushfstring` which pushes the string onto the
638/// Lua stack (stack-anchored temporary).  Rust returns `Vec<u8>` directly
639/// since there is no stack-based string lifecycle for error formatting.
640fn txt_token(ls: &mut LexState, token: i32) -> Vec<u8> {
641    match token {
642        t if t == TK_NAME || t == TK_STRING || t == TK_FLT || t == TK_INT => {
643            let mut v: Vec<u8> = Vec::new();
644            v.push(b'\'');
645            let buff = ls.buff.as_slice();
646            let trimmed = if buff.last() == Some(&0) {
647                &buff[..buff.len() - 1]
648            } else {
649                buff
650            };
651            v.extend_from_slice(trimmed);
652            v.push(b'\'');
653            v
654        }
655        _ => token2str_raw(token, ls.version),
656    }
657}
658
659// LUAI_FUNC → pub(crate)
660/// Produce a human-readable token description (for error messages and the parser).
661///
662/// Single-byte printable tokens are formatted as `'X'`; non-printable as
663/// `'<\N>'`.  Reserved words and multi-char symbols are formatted as `'kw'`.
664/// Literal tokens (`<name>`, `<string>`, etc.) return the bare label.
665///
666/// # C source
667/// ```c
668///
669/// //   if (token < FIRST_RESERVED) {
670/// //     if (lisprint(token))
671/// //       return luaO_pushfstring(ls->L, "'%c'", token);
672/// //     else
673/// //       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
674/// //   }
675/// //   else {
676/// //     const char *s = luaX_tokens[token - FIRST_RESERVED];
677/// //     if (token < TK_EOS)
678/// //       return luaO_pushfstring(ls->L, "'%s'", s);
679/// //     else
680/// //       return s;
681/// //   }
682/// // }
683/// ```
684///
685/// PORT NOTE: The `LexState` parameter is retained in the signature for API
686/// parity with the C export, but is unused in Rust because we don't push onto
687/// the Lua stack.  The real formatting is in [`token2str_raw`].
688pub fn token2str(ls: &LexState, token: i32) -> Vec<u8> {
689    token2str_raw(token, ls.version)
690}
691
692/// Inner implementation of [`token2str`] that does not need `LexState`.
693///
694/// PORT NOTE: `version` gates the 5.1 special-token quoting. Upstream 5.1's
695/// `luaX_lexerror`/`error_expected` wrap the whole near/expected token in
696/// `LUA_QS` ('%s'), so the bare multi-char labels (`<eof>`, `<name>`, …) that
697/// `luaX_token2str` returns for `token >= TK_EOS` end up quoted. 5.2 rewrote
698/// `txtToken` to leave those bare and quote only symbols/reserved/literals, so
699/// for 5.2+ the `>= TK_EOS` arm stays unquoted. (Issue #105.)
700fn token2str_raw(token: i32, version: lua_types::LuaVersion) -> Vec<u8> {
701    if token < FIRST_RESERVED {
702        if is_print(token) {
703            vec![b'\'', token as u8, b'\'']
704        } else {
705            // PORT NOTE: uses write! to Vec<u8> to avoid String allocation for Lua data.
706            let mut v: Vec<u8> = Vec::new();
707            v.extend_from_slice(b"'<\\");
708            let _ = write!(&mut v, "{}", token);
709            v.extend_from_slice(b">'");
710            v
711        }
712    } else {
713        let idx = (token - FIRST_RESERVED) as usize;
714        let s = LUAX_TOKENS[idx];
715        if token < TK_EOS || version == lua_types::LuaVersion::V51 {
716            let mut v: Vec<u8> = Vec::with_capacity(s.len() + 2);
717            v.push(b'\'');
718            v.extend_from_slice(s);
719            v.push(b'\'');
720            v
721        } else {
722            s.to_vec()
723        }
724    }
725}
726
727// ── Public init / setup ───────────────────────────────────────────────────────
728
729// LUAI_FUNC → pub(crate)
730/// Initialise the lexer subsystem: intern all reserved words and fix them
731/// in the GC so they are never collected.
732///
733/// Must be called exactly once during VM startup via `luaX_init`.
734///
735/// # C source
736/// ```c
737///
738/// //   int i;
739/// //   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
740/// //   luaC_fix(L, obj2gco(e));  /* never collect this name */
741/// //   for (i=0; i<NUM_RESERVED; i++) {
742/// //     TString *ts = luaS_new(L, luaX_tokens[i]);
743/// //     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
744/// //     ts->extra = cast_byte(i+1);  /* reserved word */
745/// //   }
746/// // }
747/// ```
748pub fn init(state: &mut LuaState) -> Result<(), LuaError> {
749    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
750    // TODO(port): call state.intern_str(LUA_ENV) once LuaState has that method (Phase B)
751    let _e = intern_str_stub(state, LUA_ENV)?;
752
753    // macros.tsv: luaC_objbarrier / luaC_fix — GC fix; no-op in Phases A-C
754    // TODO(port): state.gc().fix(e) in Phase D
755
756    for i in 0..NUM_RESERVED {
757        // macros.tsv: luaS_new → state.intern_str(...)
758        // TODO(port): call state.intern_str(LUAX_TOKENS[i]) in Phase B
759        let ts = intern_str_stub(state, LUAX_TOKENS[i])?;
760
761        // TODO(port): state.gc().fix(ts.clone()) in Phase D
762
763        // macros.tsv: cast_byte → x as u8
764        // PORT NOTE: LuaString.extra uses Cell<u8> interior mutability.
765        // TODO(port): ts.set_extra((i + 1) as u8) — needs pub accessor on LuaString
766        let _ = ts; // suppress unused warning until Phase B
767    }
768
769    Ok(())
770}
771
772// LUAI_FUNC → pub(crate)
773/// Initialise `ls` for lexing a new chunk from stream `z`.
774///
775/// # C source
776/// ```c
777///
778/// //                         TString *source, int firstchar) {
779/// //   ls->t.token = 0;
780/// //   ls->L = L;
781/// //   ls->current = firstchar;
782/// //   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
783/// //   ls->z = z;
784/// //   ls->fs = NULL;
785/// //   ls->linenumber = 1;
786/// //   ls->lastline = 1;
787/// //   ls->source = source;
788/// //   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
789/// //   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);
790/// // }
791/// ```
792pub fn set_input(
793    state: &mut LuaState,
794    ls: &mut LexState,
795    z: ZIO,
796    source: GcRef<LuaString>,
797    firstchar: i32,
798) -> Result<(), LuaError> {
799    ls.t = Token::new(0);
800    ls.current = firstchar;
801    ls.lookahead = Token::eos();
802    ls.z = z;
803    ls.fs = None;
804    ls.linenumber = 1;
805    ls.lastline = 1;
806    ls.source = source;
807    ls.version = state.global().lua_version;
808    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
809    // TODO(port): state.intern_str(LUA_ENV) in Phase B
810    ls.envn = intern_str_stub(state, LUA_ENV)?;
811    // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
812    ls.buff.resize(state, LUA_MIN_BUFFER)?;
813    Ok(())
814}
815
816// LUAI_FUNC → pub(crate)
817/// Create (or retrieve) a Lua string and anchor it in the parser's GC-protection
818/// table `ls.h` so it cannot be collected before the end of compilation.
819///
820/// Also internalises long strings so that each unique content has exactly one
821/// copy in memory.  The table `ls.h` is used as a set: the string is both the
822/// key and the value.
823///
824/// # C source
825/// ```c
826///
827/// //   lua_State *L = ls->L;
828/// //   TString *ts = luaS_newlstr(L, str, l);
829/// //   const TValue *o = luaH_getstr(ls->h, ts);
830/// //   if (!ttisnil(o))  /* string already present? */
831/// //     ts = keystrval(nodefromval(o));  /* get saved copy */
832/// //   else {
833/// //     TValue *stv = s2v(L->top.p++);  /* reserve stack space */
834/// //     setsvalue(L, stv, ts);           /* anchor the string */
835/// //     luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
836/// //     luaC_checkGC(L);
837/// //     L->top.p--;                       /* remove string from stack */
838/// //   }
839/// //   return ts;
840/// // }
841/// ```
842pub(crate) fn new_string(
843    state: &mut LuaState,
844    ls: &mut LexState,
845    bytes: &[u8],
846) -> Result<GcRef<LuaString>, LuaError> {
847    // PORT NOTE: in C, the anchor table ls->h is a Lua table mapping the string
848    // to itself so a second occurrence of the same literal in the chunk returns
849    // the originally-created TString. We use a plain HashMap on LexState
850    // (`long_str_anchor`) for the equivalent dedup — sufficient because Phase
851    // A-C `GcRef<T>` is `Rc<T>` and identity is determined by the `Rc`
852    // allocation. Short strings already share identity via the global pool;
853    // long strings (>LUAI_MAXSHORTLEN) need this session-level map.
854    if let Some(existing) = ls.long_str_anchor.get(bytes) {
855        return Ok(existing.clone());
856    }
857    let ts = intern_str_stub(state, bytes)?;
858    ls.long_str_anchor.insert(bytes.to_vec(), ts.clone());
859    Ok(ts)
860}
861
862// ── Public advance / lookahead ─────────────────────────────────────────────────
863
864// LUAI_FUNC → pub(crate)
865/// Consume the current token; load the next one from the stream.
866///
867/// If a lookahead token was set, it becomes the current token without re-reading
868/// from the stream.
869///
870/// # C source
871/// ```c
872///
873/// //   ls->lastline = ls->linenumber;
874/// //   if (ls->lookahead.token != TK_EOS) {
875/// //     ls->t = ls->lookahead;
876/// //     ls->lookahead.token = TK_EOS;
877/// //   }
878/// //   else
879/// //     ls->t.token = llex(ls, &ls->t.seminfo);
880/// // }
881/// ```
882pub fn next(state: &mut LuaState, ls: &mut LexState) -> Result<(), LuaError> {
883    ls.lastline = ls.linenumber;
884
885    if ls.lookahead.kind != TK_EOS {
886        // Clone to avoid borrow conflict; LuaString inside TokenValue is GcRef (Rc).
887        ls.t = ls.lookahead.clone();
888        ls.lookahead = Token::eos();
889    } else {
890        let mut val = TokenValue::None;
891        let kind = llex(state, ls, &mut val)?;
892        ls.t = Token { kind, value: val };
893    }
894    Ok(())
895}
896
897// LUAI_FUNC → pub(crate)
898/// Peek at the next token without consuming the current one.
899///
900/// The lookahead token is cached in `ls.lookahead` and returned.  Only one
901/// token of lookahead is supported; calling this twice without an intervening
902/// [`next`] is a logic error (asserted in debug builds).
903///
904/// # C source
905/// ```c
906///
907/// //   lua_assert(ls->lookahead.token == TK_EOS);
908/// //   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
909/// //   return ls->lookahead.token;
910/// // }
911/// ```
912pub fn lookahead(state: &mut LuaState, ls: &mut LexState) -> Result<i32, LuaError> {
913    // macros.tsv: lua_assert → debug_assert!
914    debug_assert!(
915        ls.lookahead.kind == TK_EOS,
916        "luaX_lookahead: lookahead already set"
917    );
918
919    let mut val = TokenValue::None;
920    let kind = llex(state, ls, &mut val)?;
921    ls.lookahead = Token { kind, value: val };
922
923    Ok(ls.lookahead.kind)
924}
925
926// ── Private lexer helpers ──────────────────────────────────────────────────────
927
928/// If the current character equals `c`, advance and return `true`.
929///
930/// # C source
931/// ```c
932///
933/// //   if (ls->current == c) { next(ls); return 1; }
934/// //   else return 0;
935/// // }
936/// ```
937fn check_next1(ls: &mut LexState, c: i32) -> bool {
938    if ls.current == c {
939        advance(ls);
940        true
941    } else {
942        false
943    }
944}
945
946/// If the current character is either of the two bytes in `set`, save-and-advance
947/// and return `true`.
948///
949/// # C source
950/// ```c
951///
952/// //   lua_assert(set[2] == '\0');
953/// //   if (ls->current == set[0] || ls->current == set[1]) {
954/// //     save_and_next(ls);
955/// //     return 1;
956/// //   }
957/// //   else return 0;
958/// // }
959/// ```
960fn check_next2(ls: &mut LexState, state: &mut LuaState, set: &[u8; 2]) -> Result<bool, LuaError> {
961    if ls.current == set[0] as i32 || ls.current == set[1] as i32 {
962        save_and_next(ls, state)?;
963        Ok(true)
964    } else {
965        Ok(false)
966    }
967}
968
969/// Increment the line counter and consume the newline sequence.
970///
971/// Handles `\n`, `\r`, `\n\r`, and `\r\n`.
972///
973/// # C source
974/// ```c
975///
976/// //   int old = ls->current;
977/// //   lua_assert(currIsNewline(ls));
978/// //   next(ls);  /* skip '\n' or '\r' */
979/// //   if (currIsNewline(ls) && ls->current != old)
980/// //     next(ls);  /* skip '\n\r' or '\r\n' */
981/// //   if (++ls->linenumber >= MAX_INT)
982/// //     lexerror(ls, "chunk has too many lines", 0);
983/// // }
984/// ```
985fn inc_line_number(ls: &mut LexState, _state: &mut LuaState) -> Result<(), LuaError> {
986    // macros.tsv: lua_assert → debug_assert!
987    debug_assert!(curr_is_newline(ls), "inc_line_number: not at a newline");
988
989    let old = ls.current;
990    advance(ls);
991
992    if curr_is_newline(ls) && ls.current != old {
993        advance(ls);
994    }
995
996    // macros.tsv: MAX_INT → i32::MAX
997    ls.linenumber += 1;
998    if ls.linenumber >= i32::MAX {
999        return Err(lex_error(ls, b"chunk has too many lines", 0));
1000    }
1001    Ok(())
1002}
1003
1004/// Scan a numeric literal (integer or float, decimal or hex).
1005///
1006/// The caller may have already read an initial dot.  Accepts the pattern:
1007/// `%d(%x|%.|(Ee[+-]?))*` or `0[Xx](%x|%.|(Pp[+-]?))*`.
1008///
1009/// Returns `TK_INT` for integers, `TK_FLT` for floats.
1010///
1011/// # C source
1012/// ```c
1013///
1014/// //   TValue obj;
1015/// //   const char *expo = "Ee";
1016/// //   int first = ls->current;
1017/// //   lua_assert(lisdigit(ls->current));
1018/// //   save_and_next(ls);
1019/// //   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
1020/// //     expo = "Pp";
1021/// //   for (;;) {
1022/// //     if (check_next2(ls, expo))
1023/// //       check_next2(ls, "-+");
1024/// //     else if (lisxdigit(ls->current) || ls->current == '.')
1025/// //       save_and_next(ls);
1026/// //     else break;
1027/// //   }
1028/// //   if (lislalpha(ls->current))  /* numeral touching a letter? */
1029/// //     save_and_next(ls);         /* force an error */
1030/// //   save(ls, '\0');
1031/// //   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)
1032/// //     lexerror(ls, "malformed number", TK_FLT);
1033/// //   if (ttisinteger(&obj)) { seminfo->i = ivalue(&obj); return TK_INT; }
1034/// //   else { seminfo->r = fltvalue(&obj); return TK_FLT; }
1035/// // }
1036/// ```
1037fn read_numeral(
1038    state: &mut LuaState,
1039    ls: &mut LexState,
1040    seminfo: &mut TokenValue,
1041) -> Result<i32, LuaError> {
1042    let mut expo: &[u8; 2] = b"Ee";
1043
1044    let first = ls.current;
1045
1046    debug_assert!(is_digit(ls.current), "read_numeral: not at a digit");
1047
1048    save_and_next(ls, state)?;
1049
1050    if first == b'0' as i32 && check_next2(ls, state, b"xX")? {
1051        expo = b"Pp";
1052    }
1053
1054    loop {
1055        if check_next2(ls, state, expo)? {
1056            check_next2(ls, state, b"-+")?;
1057        } else if is_xdigit(ls.current) || ls.current == b'.' as i32 {
1058            //      save_and_next(ls);
1059            save_and_next(ls, state)?;
1060        } else {
1061            break;
1062        }
1063    }
1064
1065    if is_lalpha(ls.current) {
1066        save_and_next(ls, state)?;
1067    }
1068
1069    // In Rust, luaO_str2num will receive a byte slice; NUL is not needed.
1070    // We save 0 for parity with C, but our str2num stub ignores it.
1071    save(ls, state, 0)?;
1072
1073    //        lexerror(ls, "malformed number", TK_FLT);
1074    // macros.tsv: luaZ_buffer → buf.as_mut_slice()
1075    let buf = ls.buff.as_slice();
1076    let num_bytes = if buf.last() == Some(&0) {
1077        &buf[..buf.len() - 1]
1078    } else {
1079        buf
1080    };
1081    let mut obj = lua_types::LuaValue::Nil;
1082    if lua_vm::object::str2num(num_bytes, &mut obj) == 0 {
1083        return Err(lex_error(ls, b"malformed number", TK_FLT));
1084    }
1085    match obj {
1086        lua_types::LuaValue::Int(i) => {
1087            // Lua 5.1/5.2 are float-only: `lua_Number` is the only numeric type,
1088            // so every numeric literal is parsed as a float (`lua_str2number`),
1089            // including ones written without a decimal point. A literal like
1090            // 9007199254740993 therefore loses precision exactly as in lua5.2.4
1091            // (prints `9.007199254741e+15`), rather than surviving as an i64.
1092            if is_float_only(state) {
1093                *seminfo = TokenValue::Float(i as f64);
1094                Ok(TK_FLT)
1095            } else {
1096                *seminfo = TokenValue::Int(i);
1097                Ok(TK_INT)
1098            }
1099        }
1100        lua_types::LuaValue::Float(f) => {
1101            *seminfo = TokenValue::Float(f);
1102            Ok(TK_FLT)
1103        }
1104        _ => unreachable!("str2num returned non-numeric LuaValue"),
1105    }
1106}
1107
1108/// Scan a `[=*[` or `]=*]` sequence; leave the last bracket as current char.
1109///
1110/// Returns:
1111/// - `count + 2` if well-formed (where `count` is the number of `=` signs),
1112/// - `1` if a single bracket with no `=`s and no second bracket,
1113/// - `0` if malformed (e.g. `[==` with no closing bracket).
1114///
1115/// # C source
1116/// ```c
1117///
1118/// //   size_t count = 0;
1119/// //   int s = ls->current;
1120/// //   lua_assert(s == '[' || s == ']');
1121/// //   save_and_next(ls);
1122/// //   while (ls->current == '=') {
1123/// //     save_and_next(ls);
1124/// //     count++;
1125/// //   }
1126/// //   return (ls->current == s) ? count + 2
1127/// //          : (count == 0) ? 1
1128/// //          : 0;
1129/// // }
1130/// ```
1131fn skip_sep(state: &mut LuaState, ls: &mut LexState) -> Result<usize, LuaError> {
1132    let mut count: usize = 0;
1133    let s = ls.current;
1134    debug_assert!(
1135        s == b'[' as i32 || s == b']' as i32,
1136        "skip_sep: not at bracket"
1137    );
1138
1139    save_and_next(ls, state)?;
1140
1141    while ls.current == b'=' as i32 {
1142        save_and_next(ls, state)?;
1143        count += 1;
1144    }
1145
1146    if ls.current == s {
1147        Ok(count + 2)
1148    } else if count == 0 {
1149        Ok(1)
1150    } else {
1151        Ok(0)
1152    }
1153}
1154
1155/// Scan a long string or long comment delimited by `[=*[` … `]=*]`.
1156///
1157/// `seminfo` is `Some` when reading a string literal; `None` when skipping a
1158/// long comment.  When `None`, buffer contents are discarded on each newline
1159/// to avoid wasting memory.
1160///
1161/// # C source
1162/// ```c
1163///
1164/// //   int line = ls->linenumber;
1165/// //   save_and_next(ls);  /* skip 2nd '[' */
1166/// //   if (currIsNewline(ls)) inclinenumber(ls);
1167/// //   for (;;) {
1168/// //     switch (ls->current) {
1169/// //       case EOZ: { /* error */
1170/// //         const char *what = (seminfo ? "string" : "comment");
1171/// //         const char *msg = luaO_pushfstring(..., what, line);
1172/// //         lexerror(ls, msg, TK_EOS);
1173/// //         break;
1174/// //       }
1175/// //       case ']': {
1176/// //         if (skip_sep(ls) == sep) {
1177/// //           save_and_next(ls);  /* skip 2nd ']' */
1178/// //           goto endloop;
1179/// //         }
1180/// //         break;
1181/// //       }
1182/// //       case '\n': case '\r': {
1183/// //         save(ls, '\n');
1184/// //         inclinenumber(ls);
1185/// //         if (!seminfo) luaZ_resetbuffer(ls->buff);
1186/// //         break;
1187/// //       }
1188/// //       default: {
1189/// //         if (seminfo) save_and_next(ls);
1190/// //         else next(ls);
1191/// //       }
1192/// //     }
1193/// //   } endloop:
1194/// //   if (seminfo)
1195/// //     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1196/// //                                      luaZ_bufflen(ls->buff) - 2 * sep);
1197/// // }
1198/// ```
1199fn read_long_string(
1200    state: &mut LuaState,
1201    ls: &mut LexState,
1202    seminfo: Option<&mut TokenValue>,
1203    sep: usize,
1204) -> Result<(), LuaError> {
1205    let line = ls.linenumber;
1206
1207    save_and_next(ls, state)?;
1208
1209    if curr_is_newline(ls) {
1210        inc_line_number(ls, state)?;
1211    }
1212
1213    // is_string: whether we are reading a string (true) or a comment (false)
1214    let is_string = seminfo.is_some();
1215
1216    loop {
1217        match ls.current {
1218            c if c == EOZ => {
1219                let what: &[u8] = if is_string { b"string" } else { b"comment" };
1220                // PORT NOTE: build message as Vec<u8> to avoid String allocation.
1221                let mut msg: Vec<u8> = Vec::new();
1222                msg.extend_from_slice(b"unfinished long ");
1223                msg.extend_from_slice(what);
1224                msg.extend_from_slice(b" (starting at line ");
1225                let _ = write!(&mut msg, "{}", line);
1226                msg.push(b')');
1227                return Err(lex_error(ls, &msg, TK_EOS));
1228            }
1229            c if c == b']' as i32 => {
1230                let s = skip_sep(state, ls)?;
1231                if s == sep {
1232                    save_and_next(ls, state)?;
1233                    break;
1234                }
1235                // else: the ']' sequence wasn't the closing delimiter; continue
1236            }
1237            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1238                save(ls, state, b'\n' as i32)?;
1239                inc_line_number(ls, state)?;
1240                // macros.tsv: luaZ_resetbuffer → buf.clear()
1241                if !is_string {
1242                    ls.buff.clear();
1243                }
1244            }
1245            _ => {
1246                if is_string {
1247                    save_and_next(ls, state)?;
1248                } else {
1249                    advance(ls);
1250                }
1251            }
1252        }
1253    }
1254
1255    //      seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1256    //                                       luaZ_bufflen(ls->buff) - 2 * sep);
1257    if let Some(out) = seminfo {
1258        // The buffer contains: sep bytes of '[=' + content + sep bytes of '=]'
1259        // We want the content in between.
1260        // PORT NOTE: per PORTING.md §4.3, capture the slice into an owned
1261        // Vec so the immutable borrow of ls.buff is dropped before the
1262        // mutable borrow needed by new_string.
1263        let buf = ls.buff.as_slice();
1264        let content: Vec<u8> = buf[sep..buf.len() - sep].to_vec();
1265        let ts = new_string(state, ls, &content)?;
1266        *out = TokenValue::Str(ts);
1267    }
1268    Ok(())
1269}
1270
1271/// Check `c` is non-zero (truthy); if not, save the current char and raise a
1272/// string-escape error.
1273///
1274/// # C source
1275/// ```c
1276///
1277/// //   if (!c) {
1278/// //     if (ls->current != EOZ)
1279/// //       save_and_next(ls);  /* add current to buffer for error message */
1280/// //     lexerror(ls, msg, TK_STRING);
1281/// //   }
1282/// // }
1283/// ```
1284fn esc_check(
1285    state: &mut LuaState,
1286    ls: &mut LexState,
1287    ok: bool,
1288    msg: &[u8],
1289) -> Result<(), LuaError> {
1290    if !ok {
1291        if ls.current != EOZ {
1292            save_and_next(ls, state)?;
1293        }
1294        return Err(lex_error(ls, msg, TK_STRING));
1295    }
1296    Ok(())
1297}
1298
1299/// Save-and-advance, then verify the new current char is a hex digit; return
1300/// its numeric value (0-15).
1301///
1302/// # C source
1303/// ```c
1304///
1305/// //   save_and_next(ls);
1306/// //   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
1307/// //   return luaO_hexavalue(ls->current);
1308/// // }
1309/// ```
1310fn get_hexa(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1311    save_and_next(ls, state)?;
1312    esc_check(
1313        state,
1314        ls,
1315        is_xdigit(ls.current),
1316        b"hexadecimal digit expected",
1317    )?;
1318    // TODO(port): call lua_vm::object::hex_value in Phase B
1319    Ok(hex_value_stub(ls.current))
1320}
1321
1322/// Scan a `\xNN` hex escape; return the decoded byte value.
1323///
1324/// # C source
1325/// ```c
1326///
1327/// //   int r = gethexa(ls);
1328/// //   r = (r << 4) + gethexa(ls);
1329/// //   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
1330/// //   return r;
1331/// // }
1332/// ```
1333fn read_hex_esc(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1334    let r = get_hexa(state, ls)?;
1335    let r = (r << 4) + get_hexa(state, ls)?;
1336    // macros.tsv: luaZ_buffremove → buf.truncate_by(i)
1337    ls.buff.truncate_by(2);
1338    Ok(r)
1339}
1340
1341/// Scan a `\u{XXXXXX}` UTF-8 escape; return the Unicode codepoint.
1342///
1343/// # C source
1344/// ```c
1345///
1346/// //   unsigned long r;
1347/// //   int i = 4;  /* chars to remove: '\', 'u', '{', first digit */
1348/// //   save_and_next(ls);  /* skip 'u' */
1349/// //   esccheck(ls, ls->current == '{', "missing '{'");
1350/// //   r = gethexa(ls);  /* must have at least one digit */
1351/// //   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
1352/// //     i++;
1353/// //     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
1354/// //     r = (r << 4) + luaO_hexavalue(ls->current);
1355/// //   }
1356/// //   esccheck(ls, ls->current == '}', "missing '}'");
1357/// //   next(ls);  /* skip '}' */
1358/// //   luaZ_buffremove(ls->buff, i);
1359/// //   return r;
1360/// // }
1361/// ```
1362fn read_utf8_esc(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1363    let mut i: usize = 4;
1364
1365    save_and_next(ls, state)?;
1366
1367    esc_check(state, ls, ls.current == b'{' as i32, b"missing '{'")?;
1368
1369    let mut r = get_hexa(state, ls)?;
1370
1371    // The codepoint upper bound is version-gated and the C control flow differs
1372    // between families (`llex.c readutf8esc`):
1373    //   * 5.3 (L336-340): `r = (r<<4)+digit; esccheck(r <= 0x10FFFF, ...)` —
1374    //     accumulate the digit FIRST, then bound the running value at 0x10FFFF.
1375    //   * 5.4 (L351) / 5.5 (L373): `esccheck(r <= (0x7FFFFFFFu >> 4), ...);
1376    //     r = (r<<4)+digit` — bound BEFORE the shift, allowing up to 0x7FFFFFFF.
1377    // The order (check-before-shift vs shift-before-check) is reproduced exactly
1378    // because it also determines how many digits land in the `near '...'` buffer
1379    // snippet of the error message.
1380    let is_v53 = matches!(state.global().lua_version, lua_types::LuaVersion::V53);
1381
1382    // cast_void: discard return value
1383    loop {
1384        save_and_next(ls, state)?;
1385        if !is_xdigit(ls.current) {
1386            break;
1387        }
1388        i += 1;
1389        if is_v53 {
1390            // TODO(port): lua_vm::object::hex_value in Phase B
1391            r = (r << 4) + hex_value_stub(ls.current);
1392            esc_check(state, ls, r <= 0x10_FFFF, b"UTF-8 value too large")?;
1393        } else {
1394            esc_check(
1395                state,
1396                ls,
1397                r <= (0x7FFF_FFFFu32 >> 4),
1398                b"UTF-8 value too large",
1399            )?;
1400            // TODO(port): lua_vm::object::hex_value in Phase B
1401            r = (r << 4) + hex_value_stub(ls.current);
1402        }
1403    }
1404
1405    esc_check(state, ls, ls.current == b'}' as i32, b"missing '}'")?;
1406
1407    advance(ls);
1408
1409    ls.buff.truncate_by(i);
1410
1411    Ok(r)
1412}
1413
1414/// Scan `\u{...}` and append the UTF-8 encoding of the codepoint to the buffer.
1415///
1416/// # C source
1417/// ```c
1418///
1419/// //   char buff[UTF8BUFFSZ];
1420/// //   int n = luaO_utf8esc(buff, readutf8esc(ls));
1421/// //   for (; n > 0; n--)
1422/// //     save(ls, buff[UTF8BUFFSZ - n]);
1423/// // }
1424/// ```
1425fn utf8_esc(state: &mut LuaState, ls: &mut LexState) -> Result<(), LuaError> {
1426    let codepoint = read_utf8_esc(state, ls)?;
1427
1428    // macros.tsv: UTF8BUFFSZ → const UTF8_BUF_SZ: usize = 8
1429    // TODO(port): call lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
1430    // For Phase A, encode directly here.
1431    let encoded = utf8_encode_stub(codepoint);
1432
1433    for &b in &encoded {
1434        save(ls, state, b as i32)?;
1435    }
1436    Ok(())
1437}
1438
1439/// Scan a decimal escape `\ddd` (up to 3 digits); return the byte value.
1440///
1441/// # C source
1442/// ```c
1443///
1444/// //   int i;
1445/// //   int r = 0;
1446/// //   for (i = 0; i < 3 && lisdigit(ls->current); i++) {
1447/// //     r = 10*r + ls->current - '0';
1448/// //     save_and_next(ls);
1449/// //   }
1450/// //   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
1451/// //   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
1452/// //   return r;
1453/// // }
1454/// ```
1455fn read_dec_esc(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1456    let mut i: usize = 0;
1457    let mut r: u32 = 0;
1458
1459    while i < 3 && is_digit(ls.current) {
1460        r = 10 * r + (ls.current as u32 - b'0' as u32);
1461        save_and_next(ls, state)?;
1462        i += 1;
1463    }
1464
1465    // UCHAR_MAX = 255 = u8::MAX. Lua 5.1 spells this `escape sequence too
1466    // large` (the `decimal escape too large` wording is 5.2+). Verified against
1467    // lua5.1.5; see specs/followup/5.1-roster-syntax.md §2.
1468    let too_large_msg: &[u8] = if matches!(state.global().lua_version, lua_types::LuaVersion::V51) {
1469        b"escape sequence too large"
1470    } else {
1471        b"decimal escape too large"
1472    };
1473    esc_check(state, ls, r <= u8::MAX as u32, too_large_msg)?;
1474
1475    ls.buff.truncate_by(i);
1476    Ok(r)
1477}
1478
1479/// Scan a short (single/double-quoted) string literal.
1480///
1481/// The C function uses `goto read_save / only_save / no_save` for escape
1482/// handling.  In Rust this is replaced by the `EscapeResult` enum.
1483///
1484/// # C source (see llex.c lines 382-442 for full listing)
1485fn read_string(
1486    state: &mut LuaState,
1487    ls: &mut LexState,
1488    del: i32,
1489    seminfo: &mut TokenValue,
1490) -> Result<(), LuaError> {
1491    // Encoding for what the escape sequence handler needs to do after decoding.
1492    //
1493    // read_save:  advance(ls), remove '\' from buffer, save decoded byte
1494    // only_save:  remove '\' from buffer, save decoded byte (no advance)
1495    // no_save:    nothing (just break from the escape case)
1496    enum EscapeResult {
1497        ReadSave(i32),
1498        OnlySave(i32),
1499        NoSave,
1500    }
1501
1502    save_and_next(ls, state)?;
1503
1504    while ls.current != del {
1505        match ls.current {
1506            c if c == EOZ => {
1507                return Err(lex_error(ls, b"unfinished string", TK_EOS));
1508            }
1509            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1510                return Err(lex_error(ls, b"unfinished string", TK_STRING));
1511            }
1512            c if c == b'\\' as i32 => {
1513                save_and_next(ls, state)?;
1514
1515                // Lua 5.1's lexer does NOT recognize `\x`, `\z`, or `\u`, and it
1516                // does NOT raise on an unknown escape. For any escape char outside
1517                // the known set, the 5.1 lexer silently drops the backslash and
1518                // keeps the next character verbatim (`"\x41"` → bytes `x41`,
1519                // `"\z"` → `z`, `"\q"` → `q`). Decimal escapes (`\ddd`) and the
1520                // standard letter/quote/newline escapes still work. Verified
1521                // against lua5.1.5; see specs/followup/5.1-roster-syntax.md §2.
1522                let is_v51 = matches!(state.global().lua_version, lua_types::LuaVersion::V51);
1523
1524                // Inner switch on the escape character
1525                let esc = match ls.current {
1526                    c if c == b'a' as i32 => EscapeResult::ReadSave(b'\x07' as i32),
1527                    c if c == b'b' as i32 => EscapeResult::ReadSave(b'\x08' as i32),
1528                    c if c == b'f' as i32 => EscapeResult::ReadSave(b'\x0C' as i32),
1529                    c if c == b'n' as i32 => EscapeResult::ReadSave(b'\n' as i32),
1530                    c if c == b'r' as i32 => EscapeResult::ReadSave(b'\r' as i32),
1531                    c if c == b't' as i32 => EscapeResult::ReadSave(b'\t' as i32),
1532                    c if c == b'v' as i32 => EscapeResult::ReadSave(b'\x0B' as i32),
1533                    c if c == b'x' as i32 && !is_v51 => {
1534                        let decoded = read_hex_esc(state, ls)?;
1535                        EscapeResult::ReadSave(decoded as i32)
1536                    }
1537                    c if c == b'u' as i32 && !is_v51 => {
1538                        utf8_esc(state, ls)?;
1539                        EscapeResult::NoSave
1540                    }
1541                    c if c == b'\n' as i32 || c == b'\r' as i32 => {
1542                        inc_line_number(ls, state)?;
1543                        EscapeResult::OnlySave(b'\n' as i32)
1544                    }
1545                    c if c == b'\\' as i32 || c == b'"' as i32 || c == b'\'' as i32 => {
1546                        EscapeResult::ReadSave(c)
1547                    }
1548                    c if c == EOZ => EscapeResult::NoSave,
1549                    c if c == b'z' as i32 && !is_v51 => {
1550                        ls.buff.truncate_by(1);
1551                        advance(ls);
1552                        while is_space(ls.current) {
1553                            if curr_is_newline(ls) {
1554                                inc_line_number(ls, state)?;
1555                            } else {
1556                                advance(ls);
1557                            }
1558                        }
1559                        EscapeResult::NoSave
1560                    }
1561                    c if is_v51 && !is_digit(c) => {
1562                        // 5.1 unknown escape: drop the backslash, emit the char.
1563                        EscapeResult::ReadSave(c)
1564                    }
1565                    _ => {
1566                        esc_check(state, ls, is_digit(ls.current), b"invalid escape sequence")?;
1567                        let decoded = read_dec_esc(state, ls)?;
1568                        EscapeResult::OnlySave(decoded as i32)
1569                    }
1570                };
1571
1572                // Dispatch the C goto targets as match arms.
1573                match esc {
1574                    EscapeResult::ReadSave(c) => {
1575                        advance(ls);
1576                        ls.buff.truncate_by(1);
1577                        save(ls, state, c)?;
1578                    }
1579                    EscapeResult::OnlySave(c) => {
1580                        ls.buff.truncate_by(1);
1581                        save(ls, state, c)?;
1582                    }
1583                    EscapeResult::NoSave => {}
1584                }
1585            }
1586            _ => {
1587                save_and_next(ls, state)?;
1588            }
1589        }
1590    }
1591
1592    save_and_next(ls, state)?;
1593
1594    //                                     luaZ_bufflen(ls->buff) - 2);
1595    // Buffer contains: delimiter + content + delimiter; strip both delimiters.
1596    // PORT NOTE: capture into owned Vec to drop the borrow before new_string.
1597    let buf = ls.buff.as_slice();
1598    let content: Vec<u8> = if buf.len() >= 2 {
1599        buf[1..buf.len() - 1].to_vec()
1600    } else {
1601        Vec::new()
1602    };
1603    let ts = new_string(state, ls, &content)?;
1604    *seminfo = TokenValue::Str(ts);
1605    Ok(())
1606}
1607
/// Whether the active version is the float-only legacy family (5.1/5.2), which
/// lacks the 5.3 integer operators (`//`, `<<`, `>>`, and the bitwise binops).
///
/// Reads `state.global().lua_version` and returns `true` for `V51` and `V52`.
fn is_float_only(state: &LuaState) -> bool {
    matches!(
        state.global().lua_version,
        lua_types::LuaVersion::V51 | lua_types::LuaVersion::V52
    )
}

/// Core lexer dispatch: consume and return the next raw token kind.
///
/// This is the heart of the lexer: a large `for`-`switch` loop that classifies
/// the current character and dispatches to the appropriate scanner.
///
/// # C source (see llex.c lines 445-562 for full listing)
1623fn llex(
1624    state: &mut LuaState,
1625    ls: &mut LexState,
1626    seminfo: &mut TokenValue,
1627) -> Result<i32, LuaError> {
1628    // macros.tsv: luaZ_resetbuffer → buf.clear()
1629    ls.buff.clear();
1630
1631    loop {
1632        match ls.current {
1633            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1634                inc_line_number(ls, state)?;
1635                // PORT NOTE: skipcomment-equivalent. luaL_loadfile in C-Lua
1636                // strips a leading '#' line (Unix shebang). Our test harness
1637                // prepends a global-setup preamble to every official test, so
1638                // the script's '#' line is not at byte zero. Apply the same
1639                // rule at any token-scan line start: treat a line whose first
1640                // character is '#' as a single-line comment. This sits in
1641                // llex's dispatch loop (not inc_line_number) so it does not
1642                // affect newlines inside long-bracket strings.
1643                if ls.current == b'#' as i32 {
1644                    while !curr_is_newline(ls) && ls.current != EOZ {
1645                        advance(ls);
1646                    }
1647                }
1648            }
1649
1650            c if c == b' ' as i32
1651                || c == b'\x0C' as i32
1652                || c == b'\t' as i32
1653                || c == b'\x0B' as i32 =>
1654            {
1655                advance(ls);
1656            }
1657
1658            c if c == b'-' as i32 => {
1659                advance(ls);
1660                if ls.current != b'-' as i32 {
1661                    return Ok(b'-' as i32);
1662                }
1663                advance(ls);
1664
1665                if ls.current == b'[' as i32 {
1666                    let sep = skip_sep(state, ls)?;
1667                    ls.buff.clear();
1668                    if sep >= 2 {
1669                        read_long_string(state, ls, None, sep)?;
1670                        ls.buff.clear();
1671                        continue;
1672                    }
1673                }
1674                while !curr_is_newline(ls) && ls.current != EOZ {
1675                    advance(ls);
1676                }
1677                // loop continues (no token emitted for comments)
1678            }
1679
1680            c if c == b'[' as i32 => {
1681                let sep = skip_sep(state, ls)?;
1682                if sep >= 2 {
1683                    read_long_string(state, ls, Some(seminfo), sep)?;
1684                    return Ok(TK_STRING);
1685                } else if sep == 0 {
1686                    return Err(lex_error(ls, b"invalid long string delimiter", TK_STRING));
1687                }
1688                // sep == 1: plain '[', no long string
1689                return Ok(b'[' as i32);
1690            }
1691
1692            c if c == b'=' as i32 => {
1693                advance(ls);
1694                if check_next1(ls, b'=' as i32) {
1695                    return Ok(TK_EQ);
1696                }
1697                return Ok(b'=' as i32);
1698            }
1699
1700            c if c == b'<' as i32 => {
1701                advance(ls);
1702                if check_next1(ls, b'=' as i32) {
1703                    return Ok(TK_LE);
1704                } else if !is_float_only(state) && check_next1(ls, b'<' as i32) {
1705                    // The `<<` shift operator is a Lua 5.3 addition. Under the
1706                    // float-only legacy family (5.1/5.2) it does not exist: a
1707                    // bare `<` is returned, so a second `<` then surfaces
1708                    // upstream's "unexpected symbol near '<'".
1709                    return Ok(TK_SHL);
1710                }
1711                return Ok(b'<' as i32);
1712            }
1713
1714            c if c == b'>' as i32 => {
1715                advance(ls);
1716                if check_next1(ls, b'=' as i32) {
1717                    return Ok(TK_GE);
1718                } else if !is_float_only(state) && check_next1(ls, b'>' as i32) {
1719                    // `>>` is a 5.3 addition; absent in 5.1/5.2.
1720                    return Ok(TK_SHR);
1721                }
1722                return Ok(b'>' as i32);
1723            }
1724
1725            c if c == b'/' as i32 => {
1726                advance(ls);
1727                if !is_float_only(state) && check_next1(ls, b'/' as i32) {
1728                    // Floor division `//` is a 5.3 addition; absent in 5.1/5.2,
1729                    // where the second `/` becomes "unexpected symbol near '/'".
1730                    return Ok(TK_IDIV);
1731                }
1732                return Ok(b'/' as i32);
1733            }
1734
1735            c if c == b'~' as i32 => {
1736                advance(ls);
1737                if check_next1(ls, b'=' as i32) {
1738                    return Ok(TK_NE);
1739                }
1740                return Ok(b'~' as i32);
1741            }
1742
1743            c if c == b':' as i32 => {
1744                advance(ls);
1745                // Lua 5.1 has no `::label::` token; `::` was added with `goto` in
1746                // 5.2. Under V51 the second `:` is left for the parser, which
1747                // reports `unexpected symbol near ':'`. See
1748                // specs/followup/5.1-roster-syntax.md §2.
1749                let is_v51 = matches!(state.global().lua_version, lua_types::LuaVersion::V51);
1750                if !is_v51 && check_next1(ls, b':' as i32) {
1751                    return Ok(TK_DBCOLON);
1752                }
1753                return Ok(b':' as i32);
1754            }
1755
1756            c if c == b'"' as i32 || c == b'\'' as i32 => {
1757                let del = ls.current;
1758                read_string(state, ls, del, seminfo)?;
1759                return Ok(TK_STRING);
1760            }
1761
1762            c if c == b'.' as i32 => {
1763                save_and_next(ls, state)?;
1764                if check_next1(ls, b'.' as i32) {
1765                    if check_next1(ls, b'.' as i32) {
1766                        return Ok(TK_DOTS);
1767                    }
1768                    return Ok(TK_CONCAT);
1769                } else if !is_digit(ls.current) {
1770                    return Ok(b'.' as i32);
1771                } else {
1772                    return read_numeral(state, ls, seminfo);
1773                }
1774            }
1775
1776            c if is_digit(c) => {
1777                return read_numeral(state, ls, seminfo);
1778            }
1779
1780            c if c == EOZ => {
1781                return Ok(TK_EOS);
1782            }
1783
1784            c => {
1785                if is_lalpha(c) {
1786                    loop {
1787                        save_and_next(ls, state)?;
1788                        if !is_lalnum(ls.current) {
1789                            break;
1790                        }
1791                    }
1792
1793                    // PORT NOTE: copy buffer bytes to drop borrow before new_string.
1794                    let content: Vec<u8> = ls.buff.as_slice().to_vec();
1795                    let ts = new_string(state, ls, &content)?;
1796
1797                    // PORT NOTE: canonical `lua_types::LuaString` lacks the `extra`
1798                    // byte that C-Lua uses to mark reserved words. Recover the
1799                    // keyword index directly from the interned bytes via the
1800                    // `LUAX_TOKENS` table; the first `NUM_RESERVED` entries are
1801                    // the keywords in declaration order, so token id =
1802                    // `FIRST_RESERVED + index`.
1803                    let reserved_token: Option<i32> = LUAX_TOKENS[..NUM_RESERVED]
1804                        .iter()
1805                        .position(|kw| *kw == content.as_slice())
1806                        .map(|i| FIRST_RESERVED + i as i32);
1807                    *seminfo = TokenValue::Str(ts);
1808
1809                    if let Some(tk) = reserved_token {
1810                        // Lua 5.1 has no `goto` keyword — `goto` is an ordinary
1811                        // identifier (`local goto = 5` is valid). The keyword and
1812                        // the `::label::` grammar were added in 5.2. So under V51
1813                        // `goto` lexes as a plain name; the parser then treats
1814                        // `goto done` as a name beginning an assignment, yielding
1815                        // the incidental `'=' expected near 'done'` the oracle
1816                        // reports. See specs/followup/5.1-roster-syntax.md §2.
1817                        if tk == TK_GOTO
1818                            && matches!(state.global().lua_version, lua_types::LuaVersion::V51)
1819                        {
1820                            return Ok(TK_NAME);
1821                        }
1822                        return Ok(tk);
1823                    }
1824
1825                    // Lua 5.5: with the upstream-default `LUA_COMPAT_GLOBAL`, the
1826                    // `global` declaration word is NOT reserved — `global` stays a
1827                    // valid identifier, and the parser recognizes the declaration
1828                    // statement contextually (see `globalstat` in lua-parse). So
1829                    // `global` always lexes as a plain name, on every version.
1830                    return Ok(TK_NAME);
1831                } else {
1832                    let tok = ls.current;
1833                    advance(ls);
1834                    return Ok(tok);
1835                }
1836            }
1837        }
1838    }
1839}
1840
1841// ── Phase A stubs for cross-crate helpers ──────────────────────────────────────
1842//
1843// The functions below stand in for cross-crate calls that cannot resolve in
1844// Phase A.  They will be replaced by proper imports in Phase B.
1845
// Thin forwarding wrapper: hands `bytes` to `LuaState::intern_str` and returns
// its result (the string handle or the `LuaError`) unchanged.
//
// TODO(port): the body already delegates to `state.intern_str(bytes)`, so the
// remaining Phase B work is to call that method directly at the use sites and
// delete this wrapper.
// TODO_ARCH(phase-b-reconcile): NOTE(review): the earlier remark that the
// canonical LuaString is constructed via `from_bytes` looks obsolete now that
// this routes through `LuaState::intern_str` — confirm and drop this marker.
fn intern_str_stub(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaString>, LuaError> {
    state.intern_str(bytes)
}
1853
// TODO(port): replace with lua_vm::object::hex_value(c) in Phase B.
/// Numeric value of the hexadecimal digit whose character code is `c`.
///
/// Recognises `0-9`, `a-f` and `A-F`.  Any other input — including negative
/// codes such as an end-of-stream marker and anything outside ASCII — maps
/// to `0`.
fn hex_value_stub(c: i32) -> u32 {
    // Hex digits are all ASCII, so everything outside 0..=0x7f is rejected
    // before the narrowing cast below can alias a wide value onto a digit.
    let is_ascii = (0..=0x7f).contains(&c);
    if !is_ascii {
        return 0;
    }
    char::from(c as u8).to_digit(16).unwrap_or(0)
}
1863
// TODO(port): replace with lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
/// Produce the Lua-extended UTF-8 encoding of `codepoint` (1 to 6 bytes).
///
/// Mirrors `luaO_utf8esc` from lobject.c.  Values up to `0x7FFFFFFF` are
/// accepted, so 5- and 6-byte sequences (outside strict UTF-8) can be
/// produced, as `\u{...}` escapes require.  Larger values trip the debug
/// assertion.
fn utf8_encode_stub(codepoint: u32) -> Vec<u8> {
    debug_assert!(codepoint <= 0x7FFF_FFFF);
    if codepoint < 0x80 {
        // Plain ASCII: a single byte, no lead/continuation structure.
        return vec![codepoint as u8];
    }

    // Bytes are written from the end of the scratch area towards the front,
    // so the finished sequence is already in output order.
    let mut scratch = [0u8; 8];
    let mut head = scratch.len();
    let mut remaining = codepoint;
    // Largest payload the lead byte can still hold; shrinks by one bit for
    // every continuation byte emitted.
    let mut lead_capacity: u32 = 0x3f;

    // `remaining >= 0x80 > lead_capacity` on entry, so at least one
    // continuation byte is always emitted.
    while remaining > lead_capacity {
        head -= 1;
        scratch[head] = 0x80 | (remaining & 0x3f) as u8;
        remaining >>= 6;
        lead_capacity >>= 1;
    }

    // Lead byte: length-marker bits on top, leftover payload bits below.
    head -= 1;
    scratch[head] = ((!lead_capacity << 1) | remaining) as u8;
    scratch[head..].to_vec()
}
1890
1891// ──────────────────────────────────────────────────────────────────────────────
1892// PORT STATUS
1893//   source:        src/llex.c  (581 lines, 24 functions)
1894//                  src/llex.h  (91 lines; merged)
1895//   target_crate:  lua-lex
1896//   confidence:    medium
1897//   todos:         18
1898//   port_notes:    12
1899//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
1900//   notes:         Logic is faithful to the C.  The main structural differences:
1901//                  (1) LexState.L removed — state threaded via fn params;
1902//                  (2) save/save_and_next/inclinenumber/helpers are all fallible
1903//                  (Result<_, LuaError>) because lexerror is no longer noreturn;
1904//                  (3) goto read_save/only_save/no_save in read_string replaced
1905//                  by EscapeResult enum; (4) Cross-crate calls (intern_str,
1906//                  luaH_getstr/finishset, luaG_addinfo, luaO_str2num,
1907//                  luaO_hexavalue, luaO_utf8esc, luaC_fix, luaC_checkGC) are
//                  stubbed with TODO; (5) LuaError, LuaString, LuaState and
//                  LuaTable are now imported from their owner crates; ZIO and
//                  LexBuffer remain local stubs until Phase B replaces them
//                  with real imports.  Key Phase B tasks:
1911//                  wire import paths; move LuaString.extra accessor to pub;
1912//                  implement luaX_newstring anchor-table logic.  Numeric
1913//                  literal parsing now delegates to lua_vm::object::str2num
1914//                  (handles hex integers with wrap-around and hex floats).
1915// ──────────────────────────────────────────────────────────────────────────────