Skip to main content

lua_lex/
lib.rs

1//! Lexical analyzer — port of `llex.c` + `llex.h`.
2//!
3//! Provides the Lua 5.4 lexer: character-by-character scanning of a [`ZIO`]
4//! input stream into [`Token`] values, with one-token lookahead.  The
5//! `llex.h` header is merged here per PORTING.md §1.
6//!
7//! # C source files
8//! - `reference/lua-5.4.7/src/llex.c`  (581 lines, 24 functions)
9//! - `reference/lua-5.4.7/src/llex.h`  (91 lines; merged here)
10//!
11//! # Design notes
12//! - `LexState.L` (back-pointer to `lua_State`) is removed.  All functions
13//!   that need `LuaState` receive it as `state: &mut LuaState`.
//! - `Token.kind` (C: `Token.token`) is `i32` in Phase A, matching the C
//!   `int token` field.  Single-byte tokens are their byte values;
//!   reserved-word tokens start at `FIRST_RESERVED` (257).  A proper
//!   `TokenKind` enum is deferred to Phase B.
17//! - `save` / `save_and_next` are now fallible (`Result<(), LuaError>`); the
18//!   `?` operator replaces the C noreturn `lexerror` call on buffer overflow.
19//! - The `goto read_save / only_save / no_save` pattern in `read_string` is
20//!   translated via the local `EscapeResult` enum.
21
22// TODO(port): resolve remaining cross-crate calls (intern_str, table anchor,
23// number parsing, utf8 encoding) in Phase B.  Canonical cross-crate type
24// imports are now in place per harness/type-vocabulary.tsv (see below).
25
26use std::rc::Rc;
27use std::io::Write as IoWrite;
28
29// PORT NOTE: GcRef<T> = Rc<T> in Phases A–C; replaced by real GC pointer in Phase D.
30// TODO(port): move GcRef to lua-types once the GC crate is defined (Phase D).
31use lua_types::gc::GcRef;
32
33// Canonical cross-crate types: imported from owner crates per
34// harness/type-vocabulary.tsv.  See PORTING.md §7.
35pub use lua_types::LuaError;
36pub use lua_types::LuaString;
37pub use lua_vm::state::LuaState;
38pub use lua_vm::table::LuaTable;
39
40/// Placeholder for `LexBuffer` from `lua_vm::zio`.
41/// TODO(port): replace with `use lua_vm::zio::LexBuffer` in Phase B.
42/// types.tsv: Mbuffer → LexBuffer
43pub struct LexBuffer {
44    buffer: Vec<u8>,
45}
46
47impl LexBuffer {
48    pub fn new() -> Self {
49        LexBuffer { buffer: Vec::new() }
50    }
51
52    /// macros.tsv: luaZ_bufflen → buf.len()
53    pub fn len(&self) -> usize {
54        self.buffer.len()
55    }
56
57    /// macros.tsv: luaZ_sizebuffer → buf.capacity()
58    pub fn capacity(&self) -> usize {
59        self.buffer.capacity()
60    }
61
62    /// macros.tsv: luaZ_buffer → buf.as_mut_slice()
63    pub fn as_slice(&self) -> &[u8] {
64        &self.buffer
65    }
66
67    /// macros.tsv: luaZ_resetbuffer → buf.clear()
68    pub fn clear(&mut self) {
69        self.buffer.clear();
70    }
71
72    /// macros.tsv: luaZ_buffremove → buf.truncate_by(i)
73    pub fn truncate_by(&mut self, i: usize) {
74        let new_len = self.buffer.len().saturating_sub(i);
75        self.buffer.truncate(new_len);
76    }
77
78    /// allocated capacity. In C this changes `buffsize`, not the live byte
79    /// count `n`. The Rust analogue therefore manipulates `Vec::capacity`,
80    /// never `Vec::len` (otherwise `push_byte` would write past the live
81    /// content and leave embedded zero padding inside the token text).
82    pub fn resize(&mut self, _state: &mut LuaState, size: usize) -> Result<(), LuaError> {
83        if size < self.buffer.len() {
84            self.buffer.truncate(size);
85        }
86        if size > self.buffer.capacity() {
87            let extra = size - self.buffer.capacity();
88            self.buffer.reserve_exact(extra);
89        }
90        Ok(())
91    }
92
93    /// Append one byte to the live contents.  Panics if capacity exceeded
94    /// (callers must pre-check via `save`).
95    fn push_byte(&mut self, c: u8) {
96        self.buffer.push(c);
97    }
98}
99
100impl Default for LexBuffer {
101    fn default() -> Self {
102        Self::new()
103    }
104}
105
106/// Placeholder for `ZIO` from `lua_vm::zio`.
107/// TODO(port): replace with `use lua_vm::zio::ZIO` in Phase B.
108/// types.tsv: Zio → ZIO
109pub struct ZIO {
110    // TODO(port): full ZIO implementation lives in lua_vm::zio; this is a stub.
111    reader: Box<dyn FnMut() -> Option<Vec<u8>>>,
112    n: usize,
113    p: usize,
114    current_chunk: Vec<u8>,
115}
116
117impl ZIO {
118    /// Construct a ZIO from a reader callback that yields successive chunks.
119    pub fn new(reader: Box<dyn FnMut() -> Option<Vec<u8>>>) -> Self {
120        ZIO { reader, n: 0, p: 0, current_chunk: Vec::new() }
121    }
122
123    /// Construct a ZIO that yields the supplied bytes once and then EOZ.
124    pub fn from_bytes(bytes: Vec<u8>) -> Self {
125        let mut once = Some(bytes);
126        ZIO::new(Box::new(move || once.take()))
127    }
128
129    /// macros.tsv: zgetc → z.getc()
130    pub fn getc(&mut self) -> i32 {
131        if self.n > 0 {
132            self.n -= 1;
133            let b = self.current_chunk[self.p] as u8;
134            self.p += 1;
135            b as i32
136        } else {
137            self.fill()
138        }
139    }
140
141    fn fill(&mut self) -> i32 {
142        match (self.reader)() {
143            None => EOZ,
144            Some(chunk) if chunk.is_empty() => EOZ,
145            Some(chunk) => {
146                self.n = chunk.len() - 1;
147                self.current_chunk = chunk;
148                self.p = 0;
149                let b = self.current_chunk[self.p] as u8;
150                self.p += 1;
151                b as i32
152            }
153        }
154    }
155}
156
157// ── Constants ─────────────────────────────────────────────────────────────────
158
// macros.tsv: FIRST_RESERVED → const FIRST_RESERVED: i32 = 257
/// Lowest token code that does not denote a single byte.
/// Codes 0-255 stand for the byte itself.
///
/// NOTE(review): upstream Lua 5.4 `llex.h` appears to define this as
/// `UCHAR_MAX + 1` (256); this port pins 257 per macros.tsv — confirm the
/// difference is intentional.
pub const FIRST_RESERVED: i32 = 257;

// macros.tsv: LUA_ENV → const LUA_ENV: &[u8] = b"_ENV"
/// Name of the global environment upvalue.
pub const LUA_ENV: &[u8] = b"_ENV";

// macros.tsv: NUM_RESERVED → const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize
/// Number of reserved words (keywords): `and` through `while`.
pub const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize;

// macros.tsv: EOZ → const EOZ: i32 = -1
/// End-of-stream sentinel returned by `ZIO::getc`.
pub const EOZ: i32 = -1;

// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = ...
/// Largest size usable for the token buffer: `usize::MAX` when `usize` is
/// narrower than `i64`, otherwise `i64::MAX`.
const MAX_SIZE: usize = if usize::BITS < i64::BITS {
    usize::MAX
} else {
    i64::MAX as usize
};

// macros.tsv: LUA_MIN_BUFFER → const LUA_MIN_BUFFER: usize = 32
/// Initial capacity requested for the token buffer.
const LUA_MIN_BUFFER: usize = 32;

// ── Token kind constants (ORDER RESERVED — matches C enum RESERVED) ───────────
//
// In C these are enum values counting up from FIRST_RESERVED.  In Rust we use
// i32 constants for Phase A (faithful to `Token.token: int` in C) with a TODO
// for a proper enum in Phase B.  Each constant is written as an offset from
// FIRST_RESERVED; the resulting value is given in the trailing comment.
//

/// `and`
pub const TK_AND: i32 = FIRST_RESERVED; // 257
/// `break`
pub const TK_BREAK: i32 = FIRST_RESERVED + 1; // 258
/// `do`
pub const TK_DO: i32 = FIRST_RESERVED + 2; // 259
/// `else`
pub const TK_ELSE: i32 = FIRST_RESERVED + 3; // 260
/// `elseif`
pub const TK_ELSEIF: i32 = FIRST_RESERVED + 4; // 261
/// `end`
pub const TK_END: i32 = FIRST_RESERVED + 5; // 262
/// `false`
pub const TK_FALSE: i32 = FIRST_RESERVED + 6; // 263
/// `for`
pub const TK_FOR: i32 = FIRST_RESERVED + 7; // 264
/// `function`
pub const TK_FUNCTION: i32 = FIRST_RESERVED + 8; // 265
/// `goto`
pub const TK_GOTO: i32 = FIRST_RESERVED + 9; // 266
/// `if`
pub const TK_IF: i32 = FIRST_RESERVED + 10; // 267
/// `in`
pub const TK_IN: i32 = FIRST_RESERVED + 11; // 268
/// `local`
pub const TK_LOCAL: i32 = FIRST_RESERVED + 12; // 269
/// `nil`
pub const TK_NIL: i32 = FIRST_RESERVED + 13; // 270
/// `not`
pub const TK_NOT: i32 = FIRST_RESERVED + 14; // 271
/// `or`
pub const TK_OR: i32 = FIRST_RESERVED + 15; // 272
/// `repeat`
pub const TK_REPEAT: i32 = FIRST_RESERVED + 16; // 273
/// `return`
pub const TK_RETURN: i32 = FIRST_RESERVED + 17; // 274
/// `then`
pub const TK_THEN: i32 = FIRST_RESERVED + 18; // 275
/// `true`
pub const TK_TRUE: i32 = FIRST_RESERVED + 19; // 276
/// `until`
pub const TK_UNTIL: i32 = FIRST_RESERVED + 20; // 277
/// `while`  (last keyword; NUM_RESERVED = TK_WHILE - FIRST_RESERVED + 1 = 22)
pub const TK_WHILE: i32 = FIRST_RESERVED + 21; // 278
/// `//`  (floor division)
pub const TK_IDIV: i32 = FIRST_RESERVED + 22; // 279
/// `..`  (concatenation)
pub const TK_CONCAT: i32 = FIRST_RESERVED + 23; // 280
/// `...` (vararg)
pub const TK_DOTS: i32 = FIRST_RESERVED + 24; // 281
/// `==`
pub const TK_EQ: i32 = FIRST_RESERVED + 25; // 282
/// `>=`
pub const TK_GE: i32 = FIRST_RESERVED + 26; // 283
/// `<=`
pub const TK_LE: i32 = FIRST_RESERVED + 27; // 284
/// `~=`
pub const TK_NE: i32 = FIRST_RESERVED + 28; // 285
/// `<<`
pub const TK_SHL: i32 = FIRST_RESERVED + 29; // 286
/// `>>`
pub const TK_SHR: i32 = FIRST_RESERVED + 30; // 287
/// `::`
pub const TK_DBCOLON: i32 = FIRST_RESERVED + 31; // 288
/// `<eof>`
pub const TK_EOS: i32 = FIRST_RESERVED + 32; // 289
/// `<number>`  (float literal)
pub const TK_FLT: i32 = FIRST_RESERVED + 33; // 290
/// `<integer>` (integer literal)
pub const TK_INT: i32 = FIRST_RESERVED + 34; // 291
/// `<name>`    (identifier)
pub const TK_NAME: i32 = FIRST_RESERVED + 35; // 292
/// `<string>`  (string literal)
pub const TK_STRING: i32 = FIRST_RESERVED + 36; // 293
266
// ORDER RESERVED — entry `i` spells the token whose code is `FIRST_RESERVED + i`
// (index 0 = TK_AND - FIRST_RESERVED, etc.).
/// Display strings for tokens, indexed by `token - FIRST_RESERVED`.
pub static LUAX_TOKENS: &[&[u8]] = &[
    // keywords (indices 0-21)
    b"and",
    b"break",
    b"do",
    b"else",
    b"elseif",
    b"end",
    b"false",
    b"for",
    b"function",
    b"goto",
    b"if",
    b"in",
    b"local",
    b"nil",
    b"not",
    b"or",
    b"repeat",
    b"return",
    b"then",
    b"true",
    b"until",
    b"while",
    // other terminal symbols (indices 22-36)
    b"//",
    b"..",
    b"...",
    b"==",
    b">=",
    b"<=",
    b"~=",
    b"<<",
    b">>",
    b"::",
    b"<eof>",
    b"<number>",
    b"<integer>",
    b"<name>",
    b"<string>",
];
280
281// ── SemInfo / TokenValue ───────────────────────────────────────────────────────
282
283// types.tsv: SemInfo → TokenValue
284/// Semantic payload carried by a token.
285///
286/// Corresponds to `SemInfo` (a C union) in `llex.h`.  In Rust this is a
287/// discriminated union (enum).
288///
289/// # C mapping
290/// ```text
291/// SemInfo.r   → TokenValue::Float(f64)      (lua_Number)
292/// SemInfo.i   → TokenValue::Int(i64)        (lua_Integer)
293/// SemInfo.ts  → TokenValue::Str(GcRef<LuaString>)
294/// (no C field) → TokenValue::None           (default / unset)
295/// ```
296#[derive(Clone)]
297pub enum TokenValue {
298    /// No semantic value (default; used for single-byte and most multi-char tokens).
299    None,
300    /// Float literal payload.  C: `seminfo.r` (`lua_Number`).
301    Float(f64),
302    /// Integer literal payload.  C: `seminfo.i` (`lua_Integer`).
303    Int(i64),
304    /// String/name payload.  C: `seminfo.ts` (`TString *`).
305    Str(GcRef<LuaString>),
306}
307
308// ── Token ─────────────────────────────────────────────────────────────────────
309
310// types.tsv: Token → Token;  Token.token → i32 (Phase A; TODO: TokenKind enum Phase B)
311/// A single lexed token with its semantic payload.
312///
313/// `kind` is an `i32` whose value is either an ASCII byte code (for single-byte
314/// tokens like `+`, `-`, `[`) or one of the `TK_*` constants (for reserved
315/// words, multi-char symbols, and literals).
316///
317/// TODO(port): Phase B — replace `kind: i32` with a proper `TokenKind` enum
318/// covering both single-byte and named tokens (e.g. `TokenKind::Char(u8)` +
319/// named variants).
320#[derive(Clone)]
321pub struct Token {
322    pub kind: i32,
323    pub value: TokenValue,
324}
325
326impl Token {
327    /// Construct a token with no semantic value.
328    pub fn new(kind: i32) -> Self {
329        Token { kind, value: TokenValue::None }
330    }
331
332    /// The end-of-stream sentinel token.
333    pub fn eos() -> Self {
334        Token::new(TK_EOS)
335    }
336}
337
338// ── LexState ──────────────────────────────────────────────────────────────────
339
340// types.tsv: LexState → LexState;  LexState.L removed (thread via &mut LuaState)
341/// Per-chunk lexer (and shared parser) state.
342///
343/// Corresponds to `LexState` in `llex.h`.  Owns the input stream, token
344/// buffer, and current/lookahead tokens.
345///
346/// # C mapping (types.tsv)
347/// ```text
348/// LexState.current    → current: i32        (charint; -1 = EOZ)
349/// LexState.linenumber → linenumber: i32
350/// LexState.lastline   → lastline: i32
351/// LexState.t          → t: Token            (current token)
352/// LexState.lookahead  → lookahead: Token    (one-token lookahead)
353/// LexState.fs         → fs: Option<Box<FuncState>>   (parser state)
354/// LexState.L          → (removed; callers pass &mut LuaState)
355/// LexState.z          → z: ZIO              (owned input stream)
356/// LexState.buff       → buff: LexBuffer     (owned token-text buffer)
357/// LexState.h          → h: GcRef<LuaTable>  (string-anchor table)
358/// LexState.dyd        → dyd: DynData        (parser dynamic data)
359/// LexState.source     → source: GcRef<LuaString>
360/// LexState.envn       → envn: GcRef<LuaString>
361/// ```
362pub struct LexState {
363    pub current: i32,
364    pub linenumber: i32,
365    pub lastline: i32,
366    pub t: Token,
367    pub lookahead: Token,
368    // TODO(port): Box<FuncState> once FuncState lands in lua-parse (Phase B)
369    pub fs: Option<()>,
370    // PORT NOTE: C held a pointer; Rust owns the ZIO directly per types.tsv.
371    pub z: ZIO,
372    // PORT NOTE: C held a pointer; Rust owns the LexBuffer directly per types.tsv.
373    pub buff: LexBuffer,
374    // TODO(port): GcRef<LuaTable> once LuaTable is defined in Phase B
375    pub h: Option<GcRef<LuaTable>>,
376    /// Per-parse-session anchor for long strings. C-Lua's `ls->h` is a Lua
377    /// table that deduplicates all literal strings within a chunk (both short
378    /// and long), so e.g. `local s1 <const>="..."` and `local s2 <const>="..."`
379    /// with identical 50-byte payloads share one `TString` object — which is
380    /// what makes `string.format("%p", s1) == string.format("%p", s2)` hold.
381    /// Short strings already share identity via the global `interned_lt` pool,
382    /// but long strings (>LUAI_MAXSHORTLEN = 40) are not globally interned and
383    /// need this session-level map. Keyed by the string bytes; populated lazily
384    /// by `new_string`.
385    pub long_str_anchor: std::collections::HashMap<Vec<u8>, GcRef<LuaString>>,
386    // TODO(port): DynData once parser types land in Phase B
387    pub dyd: Option<()>,
388    pub source: GcRef<LuaString>,
389    pub envn: GcRef<LuaString>,
390}
391
392// ── Character-classification helpers ─────────────────────────────────────────
393//
394// These are simplified ASCII implementations for Phase A.
395// TODO(port): import from lua_vm::ctype in Phase B; the full table handles
396// the LUA_UCID (Unicode identifiers) flag and matches the C bit-table exactly.
397//
398// PORT NOTE: the C macros take `int` (not `char`) so they handle EOZ (-1) safely.
399// These Rust fns match that contract: EOZ returns false for all predicates.
400
/// `true` when `c` is an ASCII decimal digit (`'0'..='9'`).
/// Any value outside that range, including EOZ (-1), yields `false`.
#[inline]
fn is_digit(c: i32) -> bool {
    (i32::from(b'0')..=i32::from(b'9')).contains(&c)
}
405
/// `true` when `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
/// EOZ (-1) and every other value yield `false`.
#[inline]
fn is_xdigit(c: i32) -> bool {
    let within = |lo: u8, hi: u8| (i32::from(lo)..=i32::from(hi)).contains(&c);
    within(b'0', b'9') || within(b'a', b'f') || within(b'A', b'F')
}
412
// ALPHABIT: ASCII letters + '_'
/// `true` when `c` may start an identifier: an ASCII letter or `_`.
/// EOZ (-1) and every other value yield `false`.
#[inline]
fn is_lalpha(c: i32) -> bool {
    let within = |lo: u8, hi: u8| (i32::from(lo)..=i32::from(hi)).contains(&c);
    c == i32::from(b'_') || within(b'a', b'z') || within(b'A', b'Z')
}
420
/// `true` when `c` may continue an identifier: an ASCII letter, an ASCII
/// digit or `_`.  EOZ (-1) and every other value yield `false`.
#[inline]
fn is_lalnum(c: i32) -> bool {
    let within = |lo: u8, hi: u8| (i32::from(lo)..=i32::from(hi)).contains(&c);
    c == i32::from(b'_') || within(b'a', b'z') || within(b'A', b'Z') || within(b'0', b'9')
}
425
/// `true` when `c` is ASCII whitespace: `\t`, `\n`, `\v`, `\f`, `\r`
/// (codes 9 through 13) or space (32).
#[inline]
fn is_space(c: i32) -> bool {
    c == 0x20 || (0x09..=0x0D).contains(&c)
}
430
// PRINTBIT: printable ASCII (graph + space), i.e. 0x20-0x7E
/// `true` when `c` is a printable ASCII character, space included.
#[inline]
fn is_print(c: i32) -> bool {
    (0x20..=0x7E).contains(&c)
}
436
437#[inline]
438fn curr_is_newline(ls: &LexState) -> bool {
439    ls.current == b'\n' as i32 || ls.current == b'\r' as i32
440}
441
442// ── Low-level stream helpers ───────────────────────────────────────────────────
443
444/// Advance the lexer by one character.
445///
446/// Corresponds to the `next(ls)` macro.  Named `advance` to avoid collision
447/// with Rust's iterator method.
448#[inline]
449fn advance(ls: &mut LexState) {
450    // macros.tsv: zgetc → z.getc()
451    ls.current = ls.z.getc();
452}
453
454/// Append character `c` to the token buffer, growing it if necessary.
455///
456/// On overflow calls [`lex_error`] which becomes `Err(LuaError::Syntax(...))`.
457///
458/// # C source
459/// ```c
460///
461/// //   Mbuffer *b = ls->buff;
462/// //   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
463/// //     size_t newsize;
464/// //     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
465/// //       lexerror(ls, "lexical element too long", 0);
466/// //     newsize = luaZ_sizebuffer(b) * 2;
467/// //     luaZ_resizebuffer(ls->L, b, newsize);
468/// //   }
469/// //   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
470/// // }
471/// ```
472fn save(ls: &mut LexState, state: &mut LuaState, c: i32) -> Result<(), LuaError> {
473    // macros.tsv: luaZ_bufflen → buf.len(); luaZ_sizebuffer → buf.capacity()
474    if ls.buff.len() + 1 > ls.buff.capacity() {
475        if ls.buff.capacity() >= MAX_SIZE / 2 {
476            return Err(lex_error(ls, b"lexical element too long", 0));
477        }
478        //    luaZ_resizebuffer(ls->L, b, newsize);
479        // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
480        let newsize = ls.buff.capacity() * 2;
481        ls.buff.resize(state, newsize)?;
482    }
483    // macros.tsv: cast_char → x as i8  (C char is signed; Lua bytes stored as-is)
484    // PORT NOTE: we store the byte value directly; the i8 cast in C is for the
485    // C char type but the data is read back as unsigned via cast_uchar everywhere.
486    ls.buff.push_byte(c as u8);
487    Ok(())
488}
489
490/// Save the current character into the token buffer, then advance the stream.
491///
492/// Corresponds to the `save_and_next(ls)` macro.  Fallible because `save`
493/// may need to grow the buffer.
494#[inline]
495fn save_and_next(ls: &mut LexState, state: &mut LuaState) -> Result<(), LuaError> {
496    let c = ls.current;
497    save(ls, state, c)?;
498    advance(ls);
499    Ok(())
500}
501
502// ── Error helpers ─────────────────────────────────────────────────────────────
503
504// l_noret → -> !  but in Rust we return LuaError (callers wrap in Err(...))
505// error_sites.tsv: luaX_lexerror → return Err(LuaError::syntax_at(ls, "msg", token))
506/// Build a syntax error, optionally annotated with the offending token text.
507///
508/// Corresponds to the static `lexerror` function in `llex.c`.  In C this is
509/// `l_noret` (diverges via `luaD_throw`); in Rust it returns a `LuaError`
510/// value that callers wrap in `Err(...)`.
511///
512/// # C source
513/// ```c
514///
515/// //   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
516/// //   if (token)
517/// //     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
518/// //   luaD_throw(ls->L, LUA_ERRSYNTAX);
519/// // }
520/// ```
521pub fn lex_error(ls: &mut LexState, msg: &[u8], token: i32) -> LuaError {
522    const LUA_IDSIZE: usize = 60;
523    let mut buff = [0u8; LUA_IDSIZE];
524    let n = lua_vm::object::chunk_id(&mut buff[..], ls.source.as_bytes());
525    let src_part = &buff[..n];
526
527    let mut full_msg: Vec<u8> = Vec::new();
528    full_msg.extend_from_slice(src_part);
529    let _ = write!(full_msg, ":{}: ", ls.linenumber);
530    full_msg.extend_from_slice(msg);
531
532    if token != 0 {
533        let tok_text = txt_token(ls, token);
534        full_msg.extend_from_slice(b" near ");
535        full_msg.extend_from_slice(&tok_text);
536    }
537
538    LuaError::syntax_raw(&full_msg)
539}
540
541// LUAI_FUNC → pub(crate)
542// error_sites.tsv: luaX_syntaxerror → return Err(LuaError::syntax(format_args!("msg")))
543/// Report a syntax error at the current token.
544///
545/// # C source
546/// ```c
547///
548/// //   lexerror(ls, msg, ls->t.token);
549/// // }
550/// ```
551pub fn syntax_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
552    let token = ls.t.kind;
553    lex_error(ls, msg, token)
554}
555
556/// Produce a human-readable representation of `token` for error messages.
557///
558/// For `TK_NAME`, `TK_STRING`, `TK_FLT`, `TK_INT`: formats the current
559/// token buffer contents as `'<text>'`.  For everything else, delegates to
560/// [`token2str`].
561///
562/// # C source
563/// ```c
564///
565/// //   switch (token) {
566/// //     case TK_NAME: case TK_STRING:
567/// //     case TK_FLT: case TK_INT:
568/// //       save(ls, '\0');
569/// //       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
570/// //     default:
571/// //       return luaX_token2str(ls, token);
572/// //   }
573/// // }
574/// ```
575///
576/// PORT NOTE: C calls `luaO_pushfstring` which pushes the string onto the
577/// Lua stack (stack-anchored temporary).  Rust returns `Vec<u8>` directly
578/// since there is no stack-based string lifecycle for error formatting.
579fn txt_token(ls: &mut LexState, token: i32) -> Vec<u8> {
580    match token {
581        t if t == TK_NAME || t == TK_STRING || t == TK_FLT || t == TK_INT => {
582            let mut v: Vec<u8> = Vec::new();
583            v.push(b'\'');
584            let buff = ls.buff.as_slice();
585            let trimmed = if buff.last() == Some(&0) { &buff[..buff.len() - 1] } else { buff };
586            v.extend_from_slice(trimmed);
587            v.push(b'\'');
588            v
589        }
590        _ => token2str_raw(token),
591    }
592}
593
594// LUAI_FUNC → pub(crate)
595/// Produce a human-readable token description (for error messages and the parser).
596///
597/// Single-byte printable tokens are formatted as `'X'`; non-printable as
598/// `'<\N>'`.  Reserved words and multi-char symbols are formatted as `'kw'`.
599/// Literal tokens (`<name>`, `<string>`, etc.) return the bare label.
600///
601/// # C source
602/// ```c
603///
604/// //   if (token < FIRST_RESERVED) {
605/// //     if (lisprint(token))
606/// //       return luaO_pushfstring(ls->L, "'%c'", token);
607/// //     else
608/// //       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
609/// //   }
610/// //   else {
611/// //     const char *s = luaX_tokens[token - FIRST_RESERVED];
612/// //     if (token < TK_EOS)
613/// //       return luaO_pushfstring(ls->L, "'%s'", s);
614/// //     else
615/// //       return s;
616/// //   }
617/// // }
618/// ```
619///
620/// PORT NOTE: The `LexState` parameter is retained in the signature for API
621/// parity with the C export, but is unused in Rust because we don't push onto
622/// the Lua stack.  The real formatting is in [`token2str_raw`].
623pub fn token2str(_ls: &LexState, token: i32) -> Vec<u8> {
624    token2str_raw(token)
625}
626
627/// Inner implementation of [`token2str`] that does not need `LexState`.
628fn token2str_raw(token: i32) -> Vec<u8> {
629    if token < FIRST_RESERVED {
630        if is_print(token) {
631            vec![b'\'', token as u8, b'\'']
632        } else {
633            // PORT NOTE: uses write! to Vec<u8> to avoid String allocation for Lua data.
634            let mut v: Vec<u8> = Vec::new();
635            v.extend_from_slice(b"'<\\");
636            let _ = write!(&mut v, "{}", token);
637            v.extend_from_slice(b">'");
638            v
639        }
640    } else {
641        let idx = (token - FIRST_RESERVED) as usize;
642        let s = LUAX_TOKENS[idx];
643        if token < TK_EOS {
644            let mut v: Vec<u8> = Vec::with_capacity(s.len() + 2);
645            v.push(b'\'');
646            v.extend_from_slice(s);
647            v.push(b'\'');
648            v
649        } else {
650            s.to_vec()
651        }
652    }
653}
654
655// ── Public init / setup ───────────────────────────────────────────────────────
656
657// LUAI_FUNC → pub(crate)
658/// Initialise the lexer subsystem: intern all reserved words and fix them
659/// in the GC so they are never collected.
660///
661/// Must be called exactly once during VM startup via `luaX_init`.
662///
663/// # C source
664/// ```c
665///
666/// //   int i;
667/// //   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
668/// //   luaC_fix(L, obj2gco(e));  /* never collect this name */
669/// //   for (i=0; i<NUM_RESERVED; i++) {
670/// //     TString *ts = luaS_new(L, luaX_tokens[i]);
671/// //     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
672/// //     ts->extra = cast_byte(i+1);  /* reserved word */
673/// //   }
674/// // }
675/// ```
676pub fn init(state: &mut LuaState) -> Result<(), LuaError> {
677    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
678    // TODO(port): call state.intern_str(LUA_ENV) once LuaState has that method (Phase B)
679    let _e = intern_str_stub(state, LUA_ENV)?;
680
681    // macros.tsv: luaC_objbarrier / luaC_fix — GC fix; no-op in Phases A-C
682    // TODO(port): state.gc().fix(e) in Phase D
683
684    for i in 0..NUM_RESERVED {
685        // macros.tsv: luaS_new → state.intern_str(...)
686        // TODO(port): call state.intern_str(LUAX_TOKENS[i]) in Phase B
687        let ts = intern_str_stub(state, LUAX_TOKENS[i])?;
688
689        // TODO(port): state.gc().fix(ts.clone()) in Phase D
690
691        // macros.tsv: cast_byte → x as u8
692        // PORT NOTE: LuaString.extra uses Cell<u8> interior mutability.
693        // TODO(port): ts.set_extra((i + 1) as u8) — needs pub accessor on LuaString
694        let _ = ts; // suppress unused warning until Phase B
695    }
696
697    Ok(())
698}
699
700// LUAI_FUNC → pub(crate)
701/// Initialise `ls` for lexing a new chunk from stream `z`.
702///
703/// # C source
704/// ```c
705///
706/// //                         TString *source, int firstchar) {
707/// //   ls->t.token = 0;
708/// //   ls->L = L;
709/// //   ls->current = firstchar;
710/// //   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
711/// //   ls->z = z;
712/// //   ls->fs = NULL;
713/// //   ls->linenumber = 1;
714/// //   ls->lastline = 1;
715/// //   ls->source = source;
716/// //   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
717/// //   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);
718/// // }
719/// ```
720pub fn set_input(
721    state: &mut LuaState,
722    ls: &mut LexState,
723    z: ZIO,
724    source: GcRef<LuaString>,
725    firstchar: i32,
726) -> Result<(), LuaError> {
727    ls.t = Token::new(0);
728    ls.current = firstchar;
729    ls.lookahead = Token::eos();
730    ls.z = z;
731    ls.fs = None;
732    ls.linenumber = 1;
733    ls.lastline = 1;
734    ls.source = source;
735    // macros.tsv: luaS_newliteral → state.intern_str(b"...")
736    // TODO(port): state.intern_str(LUA_ENV) in Phase B
737    ls.envn = intern_str_stub(state, LUA_ENV)?;
738    // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
739    ls.buff.resize(state, LUA_MIN_BUFFER)?;
740    Ok(())
741}
742
743// LUAI_FUNC → pub(crate)
744/// Create (or retrieve) a Lua string and anchor it in the parser's GC-protection
745/// table `ls.h` so it cannot be collected before the end of compilation.
746///
747/// Also internalises long strings so that each unique content has exactly one
748/// copy in memory.  The table `ls.h` is used as a set: the string is both the
749/// key and the value.
750///
751/// # C source
752/// ```c
753///
754/// //   lua_State *L = ls->L;
755/// //   TString *ts = luaS_newlstr(L, str, l);
756/// //   const TValue *o = luaH_getstr(ls->h, ts);
757/// //   if (!ttisnil(o))  /* string already present? */
758/// //     ts = keystrval(nodefromval(o));  /* get saved copy */
759/// //   else {
760/// //     TValue *stv = s2v(L->top.p++);  /* reserve stack space */
761/// //     setsvalue(L, stv, ts);           /* anchor the string */
762/// //     luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
763/// //     luaC_checkGC(L);
764/// //     L->top.p--;                       /* remove string from stack */
765/// //   }
766/// //   return ts;
767/// // }
768/// ```
769pub(crate) fn new_string(
770    state: &mut LuaState,
771    ls: &mut LexState,
772    bytes: &[u8],
773) -> Result<GcRef<LuaString>, LuaError> {
774    // PORT NOTE: in C, the anchor table ls->h is a Lua table mapping the string
775    // to itself so a second occurrence of the same literal in the chunk returns
776    // the originally-created TString. We use a plain HashMap on LexState
777    // (`long_str_anchor`) for the equivalent dedup — sufficient because Phase
778    // A-C `GcRef<T>` is `Rc<T>` and identity is determined by the `Rc`
779    // allocation. Short strings already share identity via the global pool;
780    // long strings (>LUAI_MAXSHORTLEN) need this session-level map.
781    if let Some(existing) = ls.long_str_anchor.get(bytes) {
782        return Ok(existing.clone());
783    }
784    let ts = intern_str_stub(state, bytes)?;
785    ls.long_str_anchor.insert(bytes.to_vec(), ts.clone());
786    Ok(ts)
787}
788
789// ── Public advance / lookahead ─────────────────────────────────────────────────
790
791// LUAI_FUNC → pub(crate)
792/// Consume the current token; load the next one from the stream.
793///
794/// If a lookahead token was set, it becomes the current token without re-reading
795/// from the stream.
796///
797/// # C source
798/// ```c
799///
800/// //   ls->lastline = ls->linenumber;
801/// //   if (ls->lookahead.token != TK_EOS) {
802/// //     ls->t = ls->lookahead;
803/// //     ls->lookahead.token = TK_EOS;
804/// //   }
805/// //   else
806/// //     ls->t.token = llex(ls, &ls->t.seminfo);
807/// // }
808/// ```
809pub fn next(
810    state: &mut LuaState,
811    ls: &mut LexState,
812) -> Result<(), LuaError> {
813    ls.lastline = ls.linenumber;
814
815    if ls.lookahead.kind != TK_EOS {
816        // Clone to avoid borrow conflict; LuaString inside TokenValue is GcRef (Rc).
817        ls.t = ls.lookahead.clone();
818        ls.lookahead = Token::eos();
819    } else {
820        let mut val = TokenValue::None;
821        let kind = llex(state, ls, &mut val)?;
822        ls.t = Token { kind, value: val };
823    }
824    Ok(())
825}
826
827// LUAI_FUNC → pub(crate)
828/// Peek at the next token without consuming the current one.
829///
830/// The lookahead token is cached in `ls.lookahead` and returned.  Only one
831/// token of lookahead is supported; calling this twice without an intervening
832/// [`next`] is a logic error (asserted in debug builds).
833///
834/// # C source
835/// ```c
836///
837/// //   lua_assert(ls->lookahead.token == TK_EOS);
838/// //   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
839/// //   return ls->lookahead.token;
840/// // }
841/// ```
842pub fn lookahead(
843    state: &mut LuaState,
844    ls: &mut LexState,
845) -> Result<i32, LuaError> {
846    // macros.tsv: lua_assert → debug_assert!
847    debug_assert!(
848        ls.lookahead.kind == TK_EOS,
849        "luaX_lookahead: lookahead already set"
850    );
851
852    let mut val = TokenValue::None;
853    let kind = llex(state, ls, &mut val)?;
854    ls.lookahead = Token { kind, value: val };
855
856    Ok(ls.lookahead.kind)
857}
858
859// ── Private lexer helpers ──────────────────────────────────────────────────────
860
861/// If the current character equals `c`, advance and return `true`.
862///
863/// # C source
864/// ```c
865///
866/// //   if (ls->current == c) { next(ls); return 1; }
867/// //   else return 0;
868/// // }
869/// ```
870fn check_next1(ls: &mut LexState, c: i32) -> bool {
871    if ls.current == c {
872        advance(ls);
873        true
874    } else {
875        false
876    }
877}
878
879/// If the current character is either of the two bytes in `set`, save-and-advance
880/// and return `true`.
881///
882/// # C source
883/// ```c
884///
885/// //   lua_assert(set[2] == '\0');
886/// //   if (ls->current == set[0] || ls->current == set[1]) {
887/// //     save_and_next(ls);
888/// //     return 1;
889/// //   }
890/// //   else return 0;
891/// // }
892/// ```
893fn check_next2(
894    ls: &mut LexState,
895    state: &mut LuaState,
896    set: &[u8; 2],
897) -> Result<bool, LuaError> {
898    if ls.current == set[0] as i32 || ls.current == set[1] as i32 {
899        save_and_next(ls, state)?;
900        Ok(true)
901    } else {
902        Ok(false)
903    }
904}
905
906/// Increment the line counter and consume the newline sequence.
907///
908/// Handles `\n`, `\r`, `\n\r`, and `\r\n`.
909///
910/// # C source
911/// ```c
912///
913/// //   int old = ls->current;
914/// //   lua_assert(currIsNewline(ls));
915/// //   next(ls);  /* skip '\n' or '\r' */
916/// //   if (currIsNewline(ls) && ls->current != old)
917/// //     next(ls);  /* skip '\n\r' or '\r\n' */
918/// //   if (++ls->linenumber >= MAX_INT)
919/// //     lexerror(ls, "chunk has too many lines", 0);
920/// // }
921/// ```
922fn inc_line_number(ls: &mut LexState, _state: &mut LuaState) -> Result<(), LuaError> {
923    // macros.tsv: lua_assert → debug_assert!
924    debug_assert!(curr_is_newline(ls), "inc_line_number: not at a newline");
925
926    let old = ls.current;
927    advance(ls);
928
929    if curr_is_newline(ls) && ls.current != old {
930        advance(ls);
931    }
932
933    // macros.tsv: MAX_INT → i32::MAX
934    ls.linenumber += 1;
935    if ls.linenumber >= i32::MAX {
936        return Err(lex_error(ls, b"chunk has too many lines", 0));
937    }
938    Ok(())
939}
940
941/// Scan a numeric literal (integer or float, decimal or hex).
942///
943/// The caller may have already read an initial dot.  Accepts the pattern:
/// `%d(%x|%.|([Ee][+-]?))*` or `0[Xx](%x|%.|([Pp][+-]?))*`.
945///
946/// Returns `TK_INT` for integers, `TK_FLT` for floats.
947///
948/// # C source
949/// ```c
950///
951/// //   TValue obj;
952/// //   const char *expo = "Ee";
953/// //   int first = ls->current;
954/// //   lua_assert(lisdigit(ls->current));
955/// //   save_and_next(ls);
956/// //   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
957/// //     expo = "Pp";
958/// //   for (;;) {
959/// //     if (check_next2(ls, expo))
960/// //       check_next2(ls, "-+");
961/// //     else if (lisxdigit(ls->current) || ls->current == '.')
962/// //       save_and_next(ls);
963/// //     else break;
964/// //   }
965/// //   if (lislalpha(ls->current))  /* numeral touching a letter? */
966/// //     save_and_next(ls);         /* force an error */
967/// //   save(ls, '\0');
968/// //   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)
969/// //     lexerror(ls, "malformed number", TK_FLT);
970/// //   if (ttisinteger(&obj)) { seminfo->i = ivalue(&obj); return TK_INT; }
971/// //   else { seminfo->r = fltvalue(&obj); return TK_FLT; }
972/// // }
973/// ```
974fn read_numeral(
975    state: &mut LuaState,
976    ls: &mut LexState,
977    seminfo: &mut TokenValue,
978) -> Result<i32, LuaError> {
979    let mut expo: &[u8; 2] = b"Ee";
980
981    let first = ls.current;
982
983    debug_assert!(is_digit(ls.current), "read_numeral: not at a digit");
984
985    save_and_next(ls, state)?;
986
987    if first == b'0' as i32 && check_next2(ls, state, b"xX")? {
988        expo = b"Pp";
989    }
990
991    loop {
992        if check_next2(ls, state, expo)? {
993            check_next2(ls, state, b"-+")?;
994        } else if is_xdigit(ls.current) || ls.current == b'.' as i32 {
995            //      save_and_next(ls);
996            save_and_next(ls, state)?;
997        } else {
998            break;
999        }
1000    }
1001
1002    if is_lalpha(ls.current) {
1003        save_and_next(ls, state)?;
1004    }
1005
1006    // In Rust, luaO_str2num will receive a byte slice; NUL is not needed.
1007    // We save 0 for parity with C, but our str2num stub ignores it.
1008    save(ls, state, 0)?;
1009
1010    //        lexerror(ls, "malformed number", TK_FLT);
1011    // macros.tsv: luaZ_buffer → buf.as_mut_slice()
1012    let buf = ls.buff.as_slice();
1013    let num_bytes = if buf.last() == Some(&0) { &buf[..buf.len() - 1] } else { buf };
1014    let mut obj = lua_types::LuaValue::Nil;
1015    if lua_vm::object::str2num(num_bytes, &mut obj) == 0 {
1016        return Err(lex_error(ls, b"malformed number", TK_FLT));
1017    }
1018    match obj {
1019        lua_types::LuaValue::Int(i) => {
1020            *seminfo = TokenValue::Int(i);
1021            Ok(TK_INT)
1022        }
1023        lua_types::LuaValue::Float(f) => {
1024            *seminfo = TokenValue::Float(f);
1025            Ok(TK_FLT)
1026        }
1027        _ => unreachable!("str2num returned non-numeric LuaValue"),
1028    }
1029}
1030
1031/// Scan a `[=*[` or `]=*]` sequence; leave the last bracket as current char.
1032///
1033/// Returns:
1034/// - `count + 2` if well-formed (where `count` is the number of `=` signs),
1035/// - `1` if a single bracket with no `=`s and no second bracket,
1036/// - `0` if malformed (e.g. `[==` with no closing bracket).
1037///
1038/// # C source
1039/// ```c
1040///
1041/// //   size_t count = 0;
1042/// //   int s = ls->current;
1043/// //   lua_assert(s == '[' || s == ']');
1044/// //   save_and_next(ls);
1045/// //   while (ls->current == '=') {
1046/// //     save_and_next(ls);
1047/// //     count++;
1048/// //   }
1049/// //   return (ls->current == s) ? count + 2
1050/// //          : (count == 0) ? 1
1051/// //          : 0;
1052/// // }
1053/// ```
1054fn skip_sep(
1055    state: &mut LuaState,
1056    ls: &mut LexState,
1057) -> Result<usize, LuaError> {
1058    let mut count: usize = 0;
1059    let s = ls.current;
1060    debug_assert!(s == b'[' as i32 || s == b']' as i32, "skip_sep: not at bracket");
1061
1062    save_and_next(ls, state)?;
1063
1064    while ls.current == b'=' as i32 {
1065        save_and_next(ls, state)?;
1066        count += 1;
1067    }
1068
1069    if ls.current == s {
1070        Ok(count + 2)
1071    } else if count == 0 {
1072        Ok(1)
1073    } else {
1074        Ok(0)
1075    }
1076}
1077
1078/// Scan a long string or long comment delimited by `[=*[` … `]=*]`.
1079///
1080/// `seminfo` is `Some` when reading a string literal; `None` when skipping a
1081/// long comment.  When `None`, buffer contents are discarded on each newline
1082/// to avoid wasting memory.
1083///
1084/// # C source
1085/// ```c
1086///
1087/// //   int line = ls->linenumber;
1088/// //   save_and_next(ls);  /* skip 2nd '[' */
1089/// //   if (currIsNewline(ls)) inclinenumber(ls);
1090/// //   for (;;) {
1091/// //     switch (ls->current) {
1092/// //       case EOZ: { /* error */
1093/// //         const char *what = (seminfo ? "string" : "comment");
1094/// //         const char *msg = luaO_pushfstring(..., what, line);
1095/// //         lexerror(ls, msg, TK_EOS);
1096/// //         break;
1097/// //       }
1098/// //       case ']': {
1099/// //         if (skip_sep(ls) == sep) {
1100/// //           save_and_next(ls);  /* skip 2nd ']' */
1101/// //           goto endloop;
1102/// //         }
1103/// //         break;
1104/// //       }
1105/// //       case '\n': case '\r': {
1106/// //         save(ls, '\n');
1107/// //         inclinenumber(ls);
1108/// //         if (!seminfo) luaZ_resetbuffer(ls->buff);
1109/// //         break;
1110/// //       }
1111/// //       default: {
1112/// //         if (seminfo) save_and_next(ls);
1113/// //         else next(ls);
1114/// //       }
1115/// //     }
1116/// //   } endloop:
1117/// //   if (seminfo)
1118/// //     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1119/// //                                      luaZ_bufflen(ls->buff) - 2 * sep);
1120/// // }
1121/// ```
1122fn read_long_string(
1123    state: &mut LuaState,
1124    ls: &mut LexState,
1125    seminfo: Option<&mut TokenValue>,
1126    sep: usize,
1127) -> Result<(), LuaError> {
1128    let line = ls.linenumber;
1129
1130    save_and_next(ls, state)?;
1131
1132    if curr_is_newline(ls) {
1133        inc_line_number(ls, state)?;
1134    }
1135
1136    // is_string: whether we are reading a string (true) or a comment (false)
1137    let is_string = seminfo.is_some();
1138
1139    loop {
1140        match ls.current {
1141            c if c == EOZ => {
1142                let what: &[u8] = if is_string { b"string" } else { b"comment" };
1143                // PORT NOTE: build message as Vec<u8> to avoid String allocation.
1144                let mut msg: Vec<u8> = Vec::new();
1145                msg.extend_from_slice(b"unfinished long ");
1146                msg.extend_from_slice(what);
1147                msg.extend_from_slice(b" (starting at line ");
1148                let _ = write!(&mut msg, "{}", line);
1149                msg.push(b')');
1150                return Err(lex_error(ls, &msg, TK_EOS));
1151            }
1152            c if c == b']' as i32 => {
1153                let s = skip_sep(state, ls)?;
1154                if s == sep {
1155                    save_and_next(ls, state)?;
1156                    break;
1157                }
1158                // else: the ']' sequence wasn't the closing delimiter; continue
1159            }
1160            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1161                save(ls, state, b'\n' as i32)?;
1162                inc_line_number(ls, state)?;
1163                // macros.tsv: luaZ_resetbuffer → buf.clear()
1164                if !is_string {
1165                    ls.buff.clear();
1166                }
1167            }
1168            _ => {
1169                if is_string {
1170                    save_and_next(ls, state)?;
1171                } else {
1172                    advance(ls);
1173                }
1174            }
1175        }
1176    }
1177
1178    //      seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1179    //                                       luaZ_bufflen(ls->buff) - 2 * sep);
1180    if let Some(out) = seminfo {
1181        // The buffer contains: sep bytes of '[=' + content + sep bytes of '=]'
1182        // We want the content in between.
1183        // PORT NOTE: per PORTING.md §4.3, capture the slice into an owned
1184        // Vec so the immutable borrow of ls.buff is dropped before the
1185        // mutable borrow needed by new_string.
1186        let buf = ls.buff.as_slice();
1187        let content: Vec<u8> = buf[sep..buf.len() - sep].to_vec();
1188        let ts = new_string(state, ls, &content)?;
1189        *out = TokenValue::Str(ts);
1190    }
1191    Ok(())
1192}
1193
/// Check that `ok` is true; if it is not, save the current char (unless at
/// end of input) and raise a string-escape error.
1196///
1197/// # C source
1198/// ```c
1199///
1200/// //   if (!c) {
1201/// //     if (ls->current != EOZ)
1202/// //       save_and_next(ls);  /* add current to buffer for error message */
1203/// //     lexerror(ls, msg, TK_STRING);
1204/// //   }
1205/// // }
1206/// ```
1207fn esc_check(
1208    state: &mut LuaState,
1209    ls: &mut LexState,
1210    ok: bool,
1211    msg: &[u8],
1212) -> Result<(), LuaError> {
1213    if !ok {
1214        if ls.current != EOZ {
1215            save_and_next(ls, state)?;
1216        }
1217        return Err(lex_error(ls, msg, TK_STRING));
1218    }
1219    Ok(())
1220}
1221
1222/// Save-and-advance, then verify the new current char is a hex digit; return
1223/// its numeric value (0-15).
1224///
1225/// # C source
1226/// ```c
1227///
1228/// //   save_and_next(ls);
1229/// //   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
1230/// //   return luaO_hexavalue(ls->current);
1231/// // }
1232/// ```
1233fn get_hexa(
1234    state: &mut LuaState,
1235    ls: &mut LexState,
1236) -> Result<u32, LuaError> {
1237    save_and_next(ls, state)?;
1238    esc_check(state, ls, is_xdigit(ls.current), b"hexadecimal digit expected")?;
1239    // TODO(port): call lua_vm::object::hex_value in Phase B
1240    Ok(hex_value_stub(ls.current))
1241}
1242
1243/// Scan a `\xNN` hex escape; return the decoded byte value.
1244///
1245/// # C source
1246/// ```c
1247///
1248/// //   int r = gethexa(ls);
1249/// //   r = (r << 4) + gethexa(ls);
1250/// //   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
1251/// //   return r;
1252/// // }
1253/// ```
1254fn read_hex_esc(
1255    state: &mut LuaState,
1256    ls: &mut LexState,
1257) -> Result<u32, LuaError> {
1258    let r = get_hexa(state, ls)?;
1259    let r = (r << 4) + get_hexa(state, ls)?;
1260    // macros.tsv: luaZ_buffremove → buf.truncate_by(i)
1261    ls.buff.truncate_by(2);
1262    Ok(r)
1263}
1264
1265/// Scan a `\u{XXXXXX}` UTF-8 escape; return the Unicode codepoint.
1266///
1267/// # C source
1268/// ```c
1269///
1270/// //   unsigned long r;
1271/// //   int i = 4;  /* chars to remove: '\', 'u', '{', first digit */
1272/// //   save_and_next(ls);  /* skip 'u' */
1273/// //   esccheck(ls, ls->current == '{', "missing '{'");
1274/// //   r = gethexa(ls);  /* must have at least one digit */
1275/// //   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
1276/// //     i++;
1277/// //     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
1278/// //     r = (r << 4) + luaO_hexavalue(ls->current);
1279/// //   }
1280/// //   esccheck(ls, ls->current == '}', "missing '}'");
1281/// //   next(ls);  /* skip '}' */
1282/// //   luaZ_buffremove(ls->buff, i);
1283/// //   return r;
1284/// // }
1285/// ```
1286fn read_utf8_esc(
1287    state: &mut LuaState,
1288    ls: &mut LexState,
1289) -> Result<u32, LuaError> {
1290    let mut i: usize = 4;
1291
1292    save_and_next(ls, state)?;
1293
1294    esc_check(state, ls, ls.current == b'{' as i32, b"missing '{'")?;
1295
1296    let mut r = get_hexa(state, ls)?;
1297
1298    // cast_void: discard return value
1299    loop {
1300        save_and_next(ls, state)?;
1301        if !is_xdigit(ls.current) {
1302            break;
1303        }
1304        i += 1;
1305        esc_check(state, ls, r <= (0x7FFF_FFFFu32 >> 4), b"UTF-8 value too large")?;
1306        // TODO(port): lua_vm::object::hex_value in Phase B
1307        r = (r << 4) + hex_value_stub(ls.current);
1308    }
1309
1310    esc_check(state, ls, ls.current == b'}' as i32, b"missing '}'")?;
1311
1312    advance(ls);
1313
1314    ls.buff.truncate_by(i);
1315
1316    Ok(r)
1317}
1318
1319/// Scan `\u{...}` and append the UTF-8 encoding of the codepoint to the buffer.
1320///
1321/// # C source
1322/// ```c
1323///
1324/// //   char buff[UTF8BUFFSZ];
1325/// //   int n = luaO_utf8esc(buff, readutf8esc(ls));
1326/// //   for (; n > 0; n--)
1327/// //     save(ls, buff[UTF8BUFFSZ - n]);
1328/// // }
1329/// ```
1330fn utf8_esc(
1331    state: &mut LuaState,
1332    ls: &mut LexState,
1333) -> Result<(), LuaError> {
1334    let codepoint = read_utf8_esc(state, ls)?;
1335
1336    // macros.tsv: UTF8BUFFSZ → const UTF8_BUF_SZ: usize = 8
1337    // TODO(port): call lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
1338    // For Phase A, encode directly here.
1339    let encoded = utf8_encode_stub(codepoint);
1340
1341    for &b in &encoded {
1342        save(ls, state, b as i32)?;
1343    }
1344    Ok(())
1345}
1346
1347/// Scan a decimal escape `\ddd` (up to 3 digits); return the byte value.
1348///
1349/// # C source
1350/// ```c
1351///
1352/// //   int i;
1353/// //   int r = 0;
1354/// //   for (i = 0; i < 3 && lisdigit(ls->current); i++) {
1355/// //     r = 10*r + ls->current - '0';
1356/// //     save_and_next(ls);
1357/// //   }
1358/// //   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
1359/// //   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
1360/// //   return r;
1361/// // }
1362/// ```
1363fn read_dec_esc(
1364    state: &mut LuaState,
1365    ls: &mut LexState,
1366) -> Result<u32, LuaError> {
1367    let mut i: usize = 0;
1368    let mut r: u32 = 0;
1369
1370    while i < 3 && is_digit(ls.current) {
1371        r = 10 * r + (ls.current as u32 - b'0' as u32);
1372        save_and_next(ls, state)?;
1373        i += 1;
1374    }
1375
1376    // UCHAR_MAX = 255 = u8::MAX
1377    esc_check(state, ls, r <= u8::MAX as u32, b"decimal escape too large")?;
1378
1379    ls.buff.truncate_by(i);
1380    Ok(r)
1381}
1382
1383/// Scan a short (single/double-quoted) string literal.
1384///
1385/// The C function uses `goto read_save / only_save / no_save` for escape
1386/// handling.  In Rust this is replaced by the `EscapeResult` enum.
1387///
1388/// # C source (see llex.c lines 382-442 for full listing)
1389fn read_string(
1390    state: &mut LuaState,
1391    ls: &mut LexState,
1392    del: i32,
1393    seminfo: &mut TokenValue,
1394) -> Result<(), LuaError> {
1395    // Encoding for what the escape sequence handler needs to do after decoding.
1396    //
1397    // read_save:  advance(ls), remove '\' from buffer, save decoded byte
1398    // only_save:  remove '\' from buffer, save decoded byte (no advance)
1399    // no_save:    nothing (just break from the escape case)
1400    enum EscapeResult {
1401        ReadSave(i32),
1402        OnlySave(i32),
1403        NoSave,
1404    }
1405
1406    save_and_next(ls, state)?;
1407
1408    while ls.current != del {
1409        match ls.current {
1410            c if c == EOZ => {
1411                return Err(lex_error(ls, b"unfinished string", TK_EOS));
1412            }
1413            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1414                return Err(lex_error(ls, b"unfinished string", TK_STRING));
1415            }
1416            c if c == b'\\' as i32 => {
1417                save_and_next(ls, state)?;
1418
1419                // Inner switch on the escape character
1420                let esc = match ls.current {
1421                    c if c == b'a' as i32 => EscapeResult::ReadSave(b'\x07' as i32),
1422                    c if c == b'b' as i32 => EscapeResult::ReadSave(b'\x08' as i32),
1423                    c if c == b'f' as i32 => EscapeResult::ReadSave(b'\x0C' as i32),
1424                    c if c == b'n' as i32 => EscapeResult::ReadSave(b'\n' as i32),
1425                    c if c == b'r' as i32 => EscapeResult::ReadSave(b'\r' as i32),
1426                    c if c == b't' as i32 => EscapeResult::ReadSave(b'\t' as i32),
1427                    c if c == b'v' as i32 => EscapeResult::ReadSave(b'\x0B' as i32),
1428                    c if c == b'x' as i32 => {
1429                        let decoded = read_hex_esc(state, ls)?;
1430                        EscapeResult::ReadSave(decoded as i32)
1431                    }
1432                    c if c == b'u' as i32 => {
1433                        utf8_esc(state, ls)?;
1434                        EscapeResult::NoSave
1435                    }
1436                    c if c == b'\n' as i32 || c == b'\r' as i32 => {
1437                        inc_line_number(ls, state)?;
1438                        EscapeResult::OnlySave(b'\n' as i32)
1439                    }
1440                    c if c == b'\\' as i32 || c == b'"' as i32 || c == b'\'' as i32 => {
1441                        EscapeResult::ReadSave(c)
1442                    }
1443                    c if c == EOZ => EscapeResult::NoSave,
1444                    c if c == b'z' as i32 => {
1445                        ls.buff.truncate_by(1);
1446                        advance(ls);
1447                        while is_space(ls.current) {
1448                            if curr_is_newline(ls) {
1449                                inc_line_number(ls, state)?;
1450                            } else {
1451                                advance(ls);
1452                            }
1453                        }
1454                        EscapeResult::NoSave
1455                    }
1456                    _ => {
1457                        esc_check(
1458                            state, ls,
1459                            is_digit(ls.current),
1460                            b"invalid escape sequence",
1461                        )?;
1462                        let decoded = read_dec_esc(state, ls)?;
1463                        EscapeResult::OnlySave(decoded as i32)
1464                    }
1465                };
1466
1467                // Dispatch the C goto targets as match arms.
1468                match esc {
1469                    EscapeResult::ReadSave(c) => {
1470                        advance(ls);
1471                        ls.buff.truncate_by(1);
1472                        save(ls, state, c)?;
1473                    }
1474                    EscapeResult::OnlySave(c) => {
1475                        ls.buff.truncate_by(1);
1476                        save(ls, state, c)?;
1477                    }
1478                    EscapeResult::NoSave => {}
1479                }
1480            }
1481            _ => {
1482                save_and_next(ls, state)?;
1483            }
1484        }
1485    }
1486
1487    save_and_next(ls, state)?;
1488
1489    //                                     luaZ_bufflen(ls->buff) - 2);
1490    // Buffer contains: delimiter + content + delimiter; strip both delimiters.
1491    // PORT NOTE: capture into owned Vec to drop the borrow before new_string.
1492    let buf = ls.buff.as_slice();
1493    let content: Vec<u8> = if buf.len() >= 2 {
1494        buf[1..buf.len() - 1].to_vec()
1495    } else {
1496        Vec::new()
1497    };
1498    let ts = new_string(state, ls, &content)?;
1499    *seminfo = TokenValue::Str(ts);
1500    Ok(())
1501}
1502
1503/// Core lexer dispatch: consume and return the next raw token kind.
1504///
1505/// This is the heart of the lexer: a large `for`-`switch` loop that classifies
1506/// the current character and dispatches to the appropriate scanner.
1507///
1508/// # C source (see llex.c lines 445-562 for full listing)
1509fn llex(
1510    state: &mut LuaState,
1511    ls: &mut LexState,
1512    seminfo: &mut TokenValue,
1513) -> Result<i32, LuaError> {
1514    // macros.tsv: luaZ_resetbuffer → buf.clear()
1515    ls.buff.clear();
1516
1517    loop {
1518        match ls.current {
1519            c if c == b'\n' as i32 || c == b'\r' as i32 => {
1520                inc_line_number(ls, state)?;
1521                // PORT NOTE: skipcomment-equivalent. luaL_loadfile in C-Lua
1522                // strips a leading '#' line (Unix shebang). Our test harness
1523                // prepends a global-setup preamble to every official test, so
1524                // the script's '#' line is not at byte zero. Apply the same
1525                // rule at any token-scan line start: treat a line whose first
1526                // character is '#' as a single-line comment. This sits in
1527                // llex's dispatch loop (not inc_line_number) so it does not
1528                // affect newlines inside long-bracket strings.
1529                if ls.current == b'#' as i32 {
1530                    while !curr_is_newline(ls) && ls.current != EOZ {
1531                        advance(ls);
1532                    }
1533                }
1534            }
1535
1536            c if c == b' ' as i32
1537                || c == b'\x0C' as i32
1538                || c == b'\t' as i32
1539                || c == b'\x0B' as i32 =>
1540            {
1541                advance(ls);
1542            }
1543
1544            c if c == b'-' as i32 => {
1545                advance(ls);
1546                if ls.current != b'-' as i32 {
1547                    return Ok(b'-' as i32);
1548                }
1549                advance(ls);
1550
1551                if ls.current == b'[' as i32 {
1552                    let sep = skip_sep(state, ls)?;
1553                    ls.buff.clear();
1554                    if sep >= 2 {
1555                        read_long_string(state, ls, None, sep)?;
1556                        ls.buff.clear();
1557                        continue;
1558                    }
1559                }
1560                while !curr_is_newline(ls) && ls.current != EOZ {
1561                    advance(ls);
1562                }
1563                // loop continues (no token emitted for comments)
1564            }
1565
1566            c if c == b'[' as i32 => {
1567                let sep = skip_sep(state, ls)?;
1568                if sep >= 2 {
1569                    read_long_string(state, ls, Some(seminfo), sep)?;
1570                    return Ok(TK_STRING);
1571                } else if sep == 0 {
1572                    return Err(lex_error(ls, b"invalid long string delimiter", TK_STRING));
1573                }
1574                // sep == 1: plain '[', no long string
1575                return Ok(b'[' as i32);
1576            }
1577
1578            c if c == b'=' as i32 => {
1579                advance(ls);
1580                if check_next1(ls, b'=' as i32) {
1581                    return Ok(TK_EQ);
1582                }
1583                return Ok(b'=' as i32);
1584            }
1585
1586            c if c == b'<' as i32 => {
1587                advance(ls);
1588                if check_next1(ls, b'=' as i32) {
1589                    return Ok(TK_LE);
1590                } else if check_next1(ls, b'<' as i32) {
1591                    return Ok(TK_SHL);
1592                }
1593                return Ok(b'<' as i32);
1594            }
1595
1596            c if c == b'>' as i32 => {
1597                advance(ls);
1598                if check_next1(ls, b'=' as i32) {
1599                    return Ok(TK_GE);
1600                } else if check_next1(ls, b'>' as i32) {
1601                    return Ok(TK_SHR);
1602                }
1603                return Ok(b'>' as i32);
1604            }
1605
1606            c if c == b'/' as i32 => {
1607                advance(ls);
1608                if check_next1(ls, b'/' as i32) {
1609                    return Ok(TK_IDIV);
1610                }
1611                return Ok(b'/' as i32);
1612            }
1613
1614            c if c == b'~' as i32 => {
1615                advance(ls);
1616                if check_next1(ls, b'=' as i32) {
1617                    return Ok(TK_NE);
1618                }
1619                return Ok(b'~' as i32);
1620            }
1621
1622            c if c == b':' as i32 => {
1623                advance(ls);
1624                if check_next1(ls, b':' as i32) {
1625                    return Ok(TK_DBCOLON);
1626                }
1627                return Ok(b':' as i32);
1628            }
1629
1630            c if c == b'"' as i32 || c == b'\'' as i32 => {
1631                let del = ls.current;
1632                read_string(state, ls, del, seminfo)?;
1633                return Ok(TK_STRING);
1634            }
1635
1636            c if c == b'.' as i32 => {
1637                save_and_next(ls, state)?;
1638                if check_next1(ls, b'.' as i32) {
1639                    if check_next1(ls, b'.' as i32) {
1640                        return Ok(TK_DOTS);
1641                    }
1642                    return Ok(TK_CONCAT);
1643                } else if !is_digit(ls.current) {
1644                    return Ok(b'.' as i32);
1645                } else {
1646                    return read_numeral(state, ls, seminfo);
1647                }
1648            }
1649
1650            c if is_digit(c) => {
1651                return read_numeral(state, ls, seminfo);
1652            }
1653
1654            c if c == EOZ => {
1655                return Ok(TK_EOS);
1656            }
1657
1658            c => {
1659                if is_lalpha(c) {
1660                    loop {
1661                        save_and_next(ls, state)?;
1662                        if !is_lalnum(ls.current) {
1663                            break;
1664                        }
1665                    }
1666
1667                    // PORT NOTE: copy buffer bytes to drop borrow before new_string.
1668                    let content: Vec<u8> = ls.buff.as_slice().to_vec();
1669                    let ts = new_string(state, ls, &content)?;
1670
1671                    // PORT NOTE: canonical `lua_types::LuaString` lacks the `extra`
1672                    // byte that C-Lua uses to mark reserved words. Recover the
1673                    // keyword index directly from the interned bytes via the
1674                    // `LUAX_TOKENS` table; the first `NUM_RESERVED` entries are
1675                    // the keywords in declaration order, so token id =
1676                    // `FIRST_RESERVED + index`.
1677                    let reserved_token: Option<i32> = LUAX_TOKENS[..NUM_RESERVED]
1678                        .iter()
1679                        .position(|kw| *kw == content.as_slice())
1680                        .map(|i| FIRST_RESERVED + i as i32);
1681                    *seminfo = TokenValue::Str(ts);
1682
1683                    if let Some(tk) = reserved_token {
1684                        return Ok(tk);
1685                    } else {
1686                        return Ok(TK_NAME);
1687                    }
1688                } else {
1689                    let tok = ls.current;
1690                    advance(ls);
1691                    return Ok(tok);
1692                }
1693            }
1694        }
1695    }
1696}
1697
1698// ── Phase A stubs for cross-crate helpers ──────────────────────────────────────
1699//
1700// The functions below stand in for cross-crate calls that cannot resolve in
1701// Phase A.  They will be replaced by proper imports in Phase B.
1702
1703// TODO(port): replace with state.intern_str(bytes) once LuaState gains that
1704// method (from lua_vm::string::new_lstr wired in Phase B).
1705// TODO_ARCH(phase-b-reconcile): canonical LuaString is constructed via
1706// from_bytes; once LuaState::intern_str is wired, route through there instead.
fn intern_str_stub(
    state: &mut LuaState,
    bytes: &[u8],
) -> Result<GcRef<LuaString>, LuaError> {
    // Pure forwarder: passes `bytes` to `state.intern_str` and returns its
    // result (string handle or error) unchanged.
    // NOTE(review): the body already calls `state.intern_str`, so the TODO
    // above looks partly done; confirm whether this wrapper is still needed.
    state.intern_str(bytes)
}
1713
1714/// Result of converting a byte string to a Lua number.
1715/// TODO(port): replace with the real `LuaValue` enum variants from lua-types (Phase B).
1716enum NumResult {
1717    Int(i64),
1718    Float(f64),
1719}
1720
1721fn str2num_stub(bytes: &[u8]) -> Option<NumResult> {
1722    let s = bytes.iter().position(|&b| b == 0)
1723        .map(|n| &bytes[..n])
1724        .unwrap_or(bytes);
1725    let mut value = lua_types::LuaValue::Nil;
1726    if lua_vm::object::str2num(s, &mut value) == 0 {
1727        return None;
1728    }
1729    match value {
1730        lua_types::LuaValue::Int(i) => Some(NumResult::Int(i)),
1731        lua_types::LuaValue::Float(f) => Some(NumResult::Float(f)),
1732        _ => None,
1733    }
1734}
1735
1736// TODO(port): replace with lua_vm::object::hex_value(c) in Phase B.
fn hex_value_stub(c: i32) -> u32 {
    // Anything outside the ASCII range (including a negative end-of-input
    // marker) is not a hex digit and maps to 0, as does any other non-digit.
    if !(0..=0x7f).contains(&c) {
        return 0;
    }
    // In ASCII range, so the narrowing cast is lossless.
    (c as u8 as char).to_digit(16).unwrap_or(0)
}
1745
1746// TODO(port): replace with lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
1747/// Encode a Unicode codepoint as a Lua-extended UTF-8 byte sequence (1 to 6 bytes).
1748///
1749/// Faithful port of `luaO_utf8esc` from lobject.c.  Lua permits codepoints up
1750/// to `0x7FFFFFFF` (5- and 6-byte sequences are non-strict UTF-8 but accepted
1751/// by `\u{...}` escapes per literals.lua test cases).
fn utf8_encode_stub(codepoint: u32) -> Vec<u8> {
    debug_assert!(codepoint <= 0x7FFF_FFFF);

    // ASCII is encoded as itself.
    if codepoint < 0x80 {
        return vec![codepoint as u8];
    }

    // The sequence is written back-to-front into a fixed scratch area, the
    // way the C original does. Any u32 needs at most seven bytes here.
    let mut scratch = [0u8; 8];
    let mut pos = scratch.len();
    let mut rest = codepoint;
    // Largest value that still fits in the lead byte's payload bits.
    let mut lead_max: u32 = 0x3f;

    loop {
        // Emit one continuation byte carrying the low six bits.
        pos -= 1;
        scratch[pos] = 0x80 | ((rest & 0x3f) as u8);
        rest >>= 6;
        // Each continuation byte costs the lead byte one payload bit.
        lead_max >>= 1;
        if rest <= lead_max {
            break;
        }
    }

    // Lead byte: length marker bits followed by the remaining payload.
    pos -= 1;
    scratch[pos] = ((!lead_max << 1) | rest) as u8;

    scratch[pos..].to_vec()
}
1772
1773// ──────────────────────────────────────────────────────────────────────────────
1774// PORT STATUS
1775//   source:        src/llex.c  (581 lines, 24 functions)
1776//                  src/llex.h  (91 lines; merged)
1777//   target_crate:  lua-lex
1778//   confidence:    medium
1779//   todos:         18
1780//   port_notes:    12
1781//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
1782//   notes:         Logic is faithful to the C.  The main structural differences:
1783//                  (1) LexState.L removed — state threaded via fn params;
1784//                  (2) save/save_and_next/inclinenumber/helpers are all fallible
1785//                  (Result<_, LuaError>) because lexerror is no longer noreturn;
1786//                  (3) goto read_save/only_save/no_save in read_string replaced
1787//                  by EscapeResult enum; (4) Cross-crate calls (intern_str,
1788//                  luaH_getstr/finishset, luaG_addinfo, luaO_str2num,
1789//                  luaO_hexavalue, luaO_utf8esc, luaC_fix, luaC_checkGC) are
1790//                  stubbed with TODO; (5) LuaError, LuaString, ZIO, LexBuffer,
1791//                  LuaState defined as local stubs — Phase B replaces with real
1792//                  imports once the crate graph is wired.  Key Phase B tasks:
1793//                  wire import paths; move LuaString.extra accessor to pub;
1794//                  implement luaX_newstring anchor-table logic.  Numeric
1795//                  literal parsing now delegates to lua_vm::object::str2num
1796//                  (handles hex integers with wrap-around and hex floats).
1797// ──────────────────────────────────────────────────────────────────────────────