lua_lex/lib.rs
1//! Lexical analyzer — port of `llex.c` + `llex.h`.
2//!
3//! Provides the Lua 5.4 lexer: character-by-character scanning of a [`ZIO`]
4//! input stream into [`Token`] values, with one-token lookahead. The
5//! `llex.h` header is merged here per PORTING.md §1.
6//!
7//! # C source files
8//! - `reference/lua-5.4.7/src/llex.c` (581 lines, 24 functions)
9//! - `reference/lua-5.4.7/src/llex.h` (91 lines; merged here)
10//!
11//! # Design notes
12//! - `LexState.L` (back-pointer to `lua_State`) is removed. All functions
13//! that need `LuaState` receive it as `state: &mut LuaState`.
14//! - `Token.token` is `i32` in Phase A (matching the C `int token` field).
15//! Single-byte tokens are their ASCII values; reserved-word tokens start at
16//! `FIRST_RESERVED` (257). A proper `TokenKind` enum is deferred to Phase B.
17//! - `save` / `save_and_next` are now fallible (`Result<(), LuaError>`); the
18//! `?` operator replaces the C noreturn `lexerror` call on buffer overflow.
19//! - The `goto read_save / only_save / no_save` pattern in `read_string` is
20//! translated via the local `EscapeResult` enum.
21
22// TODO(port): resolve remaining cross-crate calls (intern_str, table anchor,
23// number parsing, utf8 encoding) in Phase B. Canonical cross-crate type
24// imports are now in place per harness/type-vocabulary.tsv (see below).
25
26use std::io::Write as IoWrite;
27
28// PORT NOTE: GcRef<T> = Rc<T> in Phases A–C; replaced by real GC pointer in Phase D.
29use lua_types::gc::GcRef;
30
31// Canonical cross-crate types: imported from owner crates per
32// harness/type-vocabulary.tsv. See PORTING.md §7.
33pub use lua_types::LuaError;
34pub use lua_types::LuaString;
35pub use lua_vm::state::LuaState;
36pub use lua_vm::table::LuaTable;
37
38/// Placeholder for `LexBuffer` from `lua_vm::zio`.
39/// TODO(port): replace with `use lua_vm::zio::LexBuffer` in Phase B.
40/// types.tsv: Mbuffer → LexBuffer
41pub struct LexBuffer {
42 buffer: Vec<u8>,
43}
44
45impl LexBuffer {
46 pub fn new() -> Self {
47 LexBuffer { buffer: Vec::new() }
48 }
49
50 /// macros.tsv: luaZ_bufflen → buf.len()
51 pub fn len(&self) -> usize {
52 self.buffer.len()
53 }
54
55 /// macros.tsv: luaZ_sizebuffer → buf.capacity()
56 pub fn capacity(&self) -> usize {
57 self.buffer.capacity()
58 }
59
60 /// macros.tsv: luaZ_buffer → buf.as_mut_slice()
61 pub fn as_slice(&self) -> &[u8] {
62 &self.buffer
63 }
64
65 /// macros.tsv: luaZ_resetbuffer → buf.clear()
66 pub fn clear(&mut self) {
67 self.buffer.clear();
68 }
69
70 /// macros.tsv: luaZ_buffremove → buf.truncate_by(i)
71 pub fn truncate_by(&mut self, i: usize) {
72 let new_len = self.buffer.len().saturating_sub(i);
73 self.buffer.truncate(new_len);
74 }
75
76 /// allocated capacity. In C this changes `buffsize`, not the live byte
77 /// count `n`. The Rust analogue therefore manipulates `Vec::capacity`,
78 /// never `Vec::len` (otherwise `push_byte` would write past the live
79 /// content and leave embedded zero padding inside the token text).
80 pub fn resize(&mut self, _state: &mut LuaState, size: usize) -> Result<(), LuaError> {
81 if size < self.buffer.len() {
82 self.buffer.truncate(size);
83 }
84 if size > self.buffer.capacity() {
85 let extra = size - self.buffer.capacity();
86 self.buffer.reserve_exact(extra);
87 }
88 Ok(())
89 }
90
91 /// Append one byte to the live contents. Panics if capacity exceeded
92 /// (callers must pre-check via `save`).
93 fn push_byte(&mut self, c: u8) {
94 self.buffer.push(c);
95 }
96}
97
98impl Default for LexBuffer {
99 fn default() -> Self {
100 Self::new()
101 }
102}
103
104/// Placeholder for `ZIO` from `lua_vm::zio`.
105/// TODO(port): replace with `use lua_vm::zio::ZIO` in Phase B.
106/// types.tsv: Zio → ZIO
107pub struct ZIO {
108 // TODO(port): full ZIO implementation lives in lua_vm::zio; this is a stub.
109 reader: Box<dyn FnMut() -> Option<Vec<u8>>>,
110 n: usize,
111 p: usize,
112 current_chunk: Vec<u8>,
113}
114
115impl ZIO {
116 /// Construct a ZIO from a reader callback that yields successive chunks.
117 pub fn new(reader: Box<dyn FnMut() -> Option<Vec<u8>>>) -> Self {
118 ZIO {
119 reader,
120 n: 0,
121 p: 0,
122 current_chunk: Vec::new(),
123 }
124 }
125
126 /// Construct a ZIO that yields the supplied bytes once and then EOZ.
127 pub fn from_bytes(bytes: Vec<u8>) -> Self {
128 let mut once = Some(bytes);
129 ZIO::new(Box::new(move || once.take()))
130 }
131
132 /// macros.tsv: zgetc → z.getc()
133 pub fn getc(&mut self) -> i32 {
134 if self.n > 0 {
135 self.n -= 1;
136 let b = self.current_chunk[self.p] as u8;
137 self.p += 1;
138 b as i32
139 } else {
140 self.fill()
141 }
142 }
143
144 fn fill(&mut self) -> i32 {
145 match (self.reader)() {
146 None => EOZ,
147 Some(chunk) if chunk.is_empty() => EOZ,
148 Some(chunk) => {
149 self.n = chunk.len() - 1;
150 self.current_chunk = chunk;
151 self.p = 0;
152 let b = self.current_chunk[self.p] as u8;
153 self.p += 1;
154 b as i32
155 }
156 }
157 }
158}
159
160// ── Constants ─────────────────────────────────────────────────────────────────
161
// macros.tsv: FIRST_RESERVED → const FIRST_RESERVED: i32 = 257
/// First token kind value that is not a single-byte character.
/// Single-byte tokens are represented by their own byte value (0-255);
/// the named `TK_*` tokens are numbered upward from this value.
pub const FIRST_RESERVED: i32 = 257;

// macros.tsv: LUA_ENV → const LUA_ENV: &[u8] = b"_ENV"
/// Name of the global environment upvalue.
pub const LUA_ENV: &[u8] = b"_ENV";

// macros.tsv: NUM_RESERVED → const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize
/// Number of reserved words (keywords): the token ids from
/// `FIRST_RESERVED` through `TK_WHILE`, inclusive.
pub const NUM_RESERVED: usize = (TK_WHILE - FIRST_RESERVED + 1) as usize;

// macros.tsv: EOZ → const EOZ: i32 = -1
/// End-of-stream sentinel returned by ZIO::getc.
pub const EOZ: i32 = -1;

// macros.tsv: MAX_SIZE → const MAX_SIZE: usize = ...
/// Upper bound on buffer sizes: `usize::MAX` when `usize` is narrower than
/// `i64`, otherwise `i64::MAX`. `save` refuses to grow the token buffer once
/// its capacity has reached half of this value.
const MAX_SIZE: usize = if std::mem::size_of::<usize>() < std::mem::size_of::<i64>() {
    usize::MAX
} else {
    i64::MAX as usize
};

// macros.tsv: LUA_MIN_BUFFER → const LUA_MIN_BUFFER: usize = 32
/// Token-buffer capacity requested by `set_input` when a chunk is opened.
const LUA_MIN_BUFFER: usize = 32;
188
189// ── Token kind constants (ORDER RESERVED — matches C enum RESERVED) ───────────
190//
191// In C these are enum values. In Rust we use i32 constants for Phase A
192// (faithful to `Token.token: int` in C) with a TODO for a proper enum in Phase B.
193//
194
// Reserved words. Their order must match the first `NUM_RESERVED` entries of
// `LUAX_TOKENS`, which is indexed by `token - FIRST_RESERVED`.

/// `and`
pub const TK_AND: i32 = 257;
/// `break`
pub const TK_BREAK: i32 = 258;
/// `do`
pub const TK_DO: i32 = 259;
/// `else`
pub const TK_ELSE: i32 = 260;
/// `elseif`
pub const TK_ELSEIF: i32 = 261;
/// `end`
pub const TK_END: i32 = 262;
/// `false`
pub const TK_FALSE: i32 = 263;
/// `for`
pub const TK_FOR: i32 = 264;
/// `function`
pub const TK_FUNCTION: i32 = 265;
/// `goto`
pub const TK_GOTO: i32 = 266;
/// `if`
pub const TK_IF: i32 = 267;
/// `in`
pub const TK_IN: i32 = 268;
/// `local`
pub const TK_LOCAL: i32 = 269;
/// `nil`
pub const TK_NIL: i32 = 270;
/// `not`
pub const TK_NOT: i32 = 271;
/// `or`
pub const TK_OR: i32 = 272;
/// `repeat`
pub const TK_REPEAT: i32 = 273;
/// `return`
pub const TK_RETURN: i32 = 274;
/// `then`
pub const TK_THEN: i32 = 275;
/// `true`
pub const TK_TRUE: i32 = 276;
/// `until`
pub const TK_UNTIL: i32 = 277;
/// `while` (last keyword; NUM_RESERVED = TK_WHILE - FIRST_RESERVED + 1 = 22)
pub const TK_WHILE: i32 = 278;

// Other terminal symbols: multi-character operators first, then the
// end-of-stream marker and the literal/name token classes.

/// `//` (floor division)
pub const TK_IDIV: i32 = 279;
/// `..` (concatenation)
pub const TK_CONCAT: i32 = 280;
/// `...` (vararg)
pub const TK_DOTS: i32 = 281;
/// `==`
pub const TK_EQ: i32 = 282;
/// `>=`
pub const TK_GE: i32 = 283;
/// `<=`
pub const TK_LE: i32 = 284;
/// `~=`
pub const TK_NE: i32 = 285;
/// `<<`
pub const TK_SHL: i32 = 286;
/// `>>`
pub const TK_SHR: i32 = 287;
/// `::`
pub const TK_DBCOLON: i32 = 288;
/// `<eof>` (also stored in `LexState::lookahead` to mean "no lookahead token")
pub const TK_EOS: i32 = 289;
/// `<number>` (float literal)
pub const TK_FLT: i32 = 290;
/// `<integer>` (integer literal)
pub const TK_INT: i32 = 291;
/// `<name>` (identifier)
pub const TK_NAME: i32 = 292;
/// `<string>` (string literal)
pub const TK_STRING: i32 = 293;
269
270// Lua 5.5 `global`: with the upstream-default LUA_COMPAT_GLOBAL it is NOT a
271// reserved word — it always lexes as TK_NAME (so it stays a valid identifier on
272// every version), and the parser recognizes the `global` declaration statement
273// contextually (see `globalstat`/`statement` in lua-parse). There is therefore
274// no dedicated token id.
275
// ORDER RESERVED — index 0 = TK_AND - FIRST_RESERVED, etc.
/// Display strings for tokens, indexed by `token - FIRST_RESERVED`.
///
/// The first `NUM_RESERVED` entries are the keywords; the remaining entries
/// are the labels of the other named tokens, ending with `TK_STRING`.
pub static LUAX_TOKENS: &[&[u8]] = &[
    // keywords (indices 0-21)
    b"and",
    b"break",
    b"do",
    b"else",
    b"elseif",
    b"end",
    b"false",
    b"for",
    b"function",
    b"goto",
    b"if",
    b"in",
    b"local",
    b"nil",
    b"not",
    b"or",
    b"repeat",
    b"return",
    b"then",
    b"true",
    b"until",
    b"while",
    // other terminal symbols (indices 22-36)
    b"//",
    b"..",
    b"...",
    b"==",
    b">=",
    b"<=",
    b"~=",
    b"<<",
    b">>",
    b"::",
    b"<eof>",
    b"<number>",
    b"<integer>",
    b"<name>",
    b"<string>",
];
319
320// ── SemInfo / TokenValue ───────────────────────────────────────────────────────
321
322// types.tsv: SemInfo → TokenValue
323/// Semantic payload carried by a token.
324///
325/// Corresponds to `SemInfo` (a C union) in `llex.h`. In Rust this is a
326/// discriminated union (enum).
327///
328/// # C mapping
329/// ```text
330/// SemInfo.r → TokenValue::Float(f64) (lua_Number)
331/// SemInfo.i → TokenValue::Int(i64) (lua_Integer)
332/// SemInfo.ts → TokenValue::Str(GcRef<LuaString>)
333/// (no C field) → TokenValue::None (default / unset)
334/// ```
#[derive(Clone)]
pub enum TokenValue {
    /// No semantic value (default; used for single-byte and most multi-char tokens).
    /// This is the value `next` and `lookahead` pass to `llex` before it
    /// fills in a payload.
    None,
    /// Float literal payload. C: `seminfo.r` (`lua_Number`).
    Float(f64),
    /// Integer literal payload. C: `seminfo.i` (`lua_Integer`).
    Int(i64),
    /// String/name payload. C: `seminfo.ts` (`TString *`).
    /// Cloning this variant clones the `GcRef` handle.
    Str(GcRef<LuaString>),
}
346
347// ── Token ─────────────────────────────────────────────────────────────────────
348
349// types.tsv: Token → Token; Token.token → i32 (Phase A; TODO: TokenKind enum Phase B)
350/// A single lexed token with its semantic payload.
351///
352/// `kind` is an `i32` whose value is either an ASCII byte code (for single-byte
353/// tokens like `+`, `-`, `[`) or one of the `TK_*` constants (for reserved
354/// words, multi-char symbols, and literals).
355///
356/// TODO(port): Phase B — replace `kind: i32` with a proper `TokenKind` enum
357/// covering both single-byte and named tokens (e.g. `TokenKind::Char(u8)` +
358/// named variants).
359#[derive(Clone)]
360pub struct Token {
361 pub kind: i32,
362 pub value: TokenValue,
363}
364
365impl Token {
366 /// Construct a token with no semantic value.
367 pub fn new(kind: i32) -> Self {
368 Token {
369 kind,
370 value: TokenValue::None,
371 }
372 }
373
374 /// The end-of-stream sentinel token.
375 pub fn eos() -> Self {
376 Token::new(TK_EOS)
377 }
378}
379
380// ── LexState ──────────────────────────────────────────────────────────────────
381
382// types.tsv: LexState → LexState; LexState.L removed (thread via &mut LuaState)
383/// Per-chunk lexer (and shared parser) state.
384///
385/// Corresponds to `LexState` in `llex.h`. Owns the input stream, token
386/// buffer, and current/lookahead tokens.
387///
388/// # C mapping (types.tsv)
389/// ```text
390/// LexState.current → current: i32 (charint; -1 = EOZ)
391/// LexState.linenumber → linenumber: i32
392/// LexState.lastline → lastline: i32
393/// LexState.t → t: Token (current token)
394/// LexState.lookahead → lookahead: Token (one-token lookahead)
395/// LexState.fs → fs: Option<Box<FuncState>> (parser state)
396/// LexState.L → (removed; callers pass &mut LuaState)
397/// LexState.z → z: ZIO (owned input stream)
398/// LexState.buff → buff: LexBuffer (owned token-text buffer)
399/// LexState.h → h: GcRef<LuaTable> (string-anchor table)
400/// LexState.dyd → dyd: DynData (parser dynamic data)
401/// LexState.source → source: GcRef<LuaString>
402/// LexState.envn → envn: GcRef<LuaString>
403/// ```
pub struct LexState {
    /// Current input character as returned by `ZIO::getc` (`EOZ` at end of
    /// stream).
    pub current: i32,
    /// Line counter; starts at 1 in `set_input` and is bumped by
    /// `inc_line_number`.
    pub linenumber: i32,
    /// Value of `linenumber` captured by `next` just before it loads the
    /// following token.
    pub lastline: i32,
    /// Current token.
    pub t: Token,
    /// One-token lookahead; a kind of `TK_EOS` means no lookahead is pending.
    pub lookahead: Token,
    // TODO(port): Box<FuncState> once FuncState lands in lua-parse (Phase B)
    pub fs: Option<()>,
    // PORT NOTE: C held a pointer; Rust owns the ZIO directly per types.tsv.
    pub z: ZIO,
    // PORT NOTE: C held a pointer; Rust owns the LexBuffer directly per types.tsv.
    pub buff: LexBuffer,
    // PORT NOTE: C's string-anchor table. None of the functions visible in
    // this part of the file read or write it; `new_string` uses
    // `long_str_anchor` instead.
    pub h: Option<GcRef<LuaTable>>,
    /// Per-parse-session anchor for long strings. C-Lua's `ls->h` is a Lua
    /// table that deduplicates all literal strings within a chunk (both short
    /// and long), so e.g. `local s1 <const>="..."` and `local s2 <const>="..."`
    /// with identical 50-byte payloads share one `TString` object — which is
    /// what makes `string.format("%p", s1) == string.format("%p", s2)` hold.
    /// Short strings already share identity via the global `interned_lt` pool,
    /// but long strings (>LUAI_MAXSHORTLEN = 40) are not globally interned and
    /// need this session-level map. Keyed by the string bytes; populated lazily
    /// by `new_string`.
    pub long_str_anchor: std::collections::HashMap<Vec<u8>, GcRef<LuaString>>,
    // TODO(port): DynData once parser types land in Phase B
    pub dyd: Option<()>,
    /// Chunk name; `lex_error` formats it into the message prefix.
    pub source: GcRef<LuaString>,
    /// Interned `_ENV` name, set by `set_input`.
    pub envn: GcRef<LuaString>,
    /// The active Lua version, snapshotted at lexer setup from
    /// `state.global().lua_version` (fixed for the lifetime of a parse). The
    /// error formatters (`lex_error`/`token2str`) receive only the lexer
    /// state, so they read the version here rather than threading a
    /// `&LuaState` through every syntax-error callsite. Lua 5.1 quotes the
    /// special multi-char token labels (`<eof>`, `<name>`, …) in error
    /// messages where 5.2+ leaves them bare.
    pub version: lua_types::LuaVersion,
}
440
441// ── Character-classification helpers ─────────────────────────────────────────
442//
443// These are simplified ASCII implementations for Phase A.
444// TODO(port): import from lua_vm::ctype in Phase B; the full table handles
445// the LUA_UCID (Unicode identifiers) flag and matches the C bit-table exactly.
446//
447// PORT NOTE: the C macros take `int` (not `char`) so they handle EOZ (-1) safely.
448// These Rust fns match that contract: EOZ returns false for all predicates.
449
#[inline]
fn is_digit(c: i32) -> bool {
    // ASCII '0'..='9'; EOZ (-1) and anything above 0x39 fall outside.
    (i32::from(b'0')..=i32::from(b'9')).contains(&c)
}
454
#[inline]
fn is_xdigit(c: i32) -> bool {
    // '0'-'9' | 'A'-'F' | 'a'-'f'
    matches!(c, 0x30..=0x39 | 0x41..=0x46 | 0x61..=0x66)
}
461
// ALPHABIT: ASCII letters plus the underscore.
#[inline]
fn is_lalpha(c: i32) -> bool {
    let lower = i32::from(b'a')..=i32::from(b'z');
    let upper = i32::from(b'A')..=i32::from(b'Z');
    c == i32::from(b'_') || lower.contains(&c) || upper.contains(&c)
}
469
470#[inline]
471fn is_lalnum(c: i32) -> bool {
472 is_lalpha(c) || is_digit(c)
473}
474
#[inline]
fn is_space(c: i32) -> bool {
    // Space, or the contiguous run \t \n \v \f \r (0x09..=0x0D).
    c == 0x20 || (0x09..=0x0D).contains(&c)
}
479
// PRINTBIT: printable ASCII, from space (0x20) through '~' (0x7E).
#[inline]
fn is_print(c: i32) -> bool {
    (0x20..=0x7E).contains(&c)
}
485
486#[inline]
487fn curr_is_newline(ls: &LexState) -> bool {
488 ls.current == b'\n' as i32 || ls.current == b'\r' as i32
489}
490
491// ── Low-level stream helpers ───────────────────────────────────────────────────
492
493/// Advance the lexer by one character.
494///
495/// Corresponds to the `next(ls)` macro. Named `advance` to avoid collision
496/// with Rust's iterator method.
497#[inline]
498fn advance(ls: &mut LexState) {
499 // macros.tsv: zgetc → z.getc()
500 ls.current = ls.z.getc();
501}
502
503/// Append character `c` to the token buffer, growing it if necessary.
504///
505/// On overflow calls [`lex_error`] which becomes `Err(LuaError::Syntax(...))`.
506///
507/// # C source
508/// ```c
509///
510/// // Mbuffer *b = ls->buff;
511/// // if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
512/// // size_t newsize;
513/// // if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
514/// // lexerror(ls, "lexical element too long", 0);
515/// // newsize = luaZ_sizebuffer(b) * 2;
516/// // luaZ_resizebuffer(ls->L, b, newsize);
517/// // }
518/// // b->buffer[luaZ_bufflen(b)++] = cast_char(c);
519/// // }
520/// ```
521fn save(ls: &mut LexState, state: &mut LuaState, c: i32) -> Result<(), LuaError> {
522 // macros.tsv: luaZ_bufflen → buf.len(); luaZ_sizebuffer → buf.capacity()
523 if ls.buff.len() + 1 > ls.buff.capacity() {
524 if ls.buff.capacity() >= MAX_SIZE / 2 {
525 return Err(lex_error(ls, b"lexical element too long", 0));
526 }
527 // luaZ_resizebuffer(ls->L, b, newsize);
528 // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
529 let newsize = ls.buff.capacity() * 2;
530 ls.buff.resize(state, newsize)?;
531 }
532 // macros.tsv: cast_char → x as i8 (C char is signed; Lua bytes stored as-is)
533 // PORT NOTE: we store the byte value directly; the i8 cast in C is for the
534 // C char type but the data is read back as unsigned via cast_uchar everywhere.
535 ls.buff.push_byte(c as u8);
536 Ok(())
537}
538
539/// Save the current character into the token buffer, then advance the stream.
540///
541/// Corresponds to the `save_and_next(ls)` macro. Fallible because `save`
542/// may need to grow the buffer.
543#[inline]
544fn save_and_next(ls: &mut LexState, state: &mut LuaState) -> Result<(), LuaError> {
545 let c = ls.current;
546 save(ls, state, c)?;
547 advance(ls);
548 Ok(())
549}
550
551// ── Error helpers ─────────────────────────────────────────────────────────────
552
553// l_noret → -> ! but in Rust we return LuaError (callers wrap in Err(...))
554// error_sites.tsv: luaX_lexerror → return Err(LuaError::syntax_at(ls, "msg", token))
555/// Build a syntax error, optionally annotated with the offending token text.
556///
557/// Corresponds to the static `lexerror` function in `llex.c`. In C this is
558/// `l_noret` (diverges via `luaD_throw`); in Rust it returns a `LuaError`
559/// value that callers wrap in `Err(...)`.
560///
561/// # C source
562/// ```c
563///
564/// // msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
565/// // if (token)
566/// // luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
567/// // luaD_throw(ls->L, LUA_ERRSYNTAX);
568/// // }
569/// ```
570pub fn lex_error(ls: &mut LexState, msg: &[u8], token: i32) -> LuaError {
571 const LUA_IDSIZE: usize = 60;
572 let mut buff = [0u8; LUA_IDSIZE];
573 let n = lua_vm::object::chunk_id(&mut buff[..], ls.source.as_bytes());
574 let src_part = &buff[..n];
575
576 let mut full_msg: Vec<u8> = Vec::new();
577 full_msg.extend_from_slice(src_part);
578 let _ = write!(full_msg, ":{}: ", ls.linenumber);
579 full_msg.extend_from_slice(msg);
580
581 if token != 0 {
582 let tok_text = txt_token(ls, token);
583 full_msg.extend_from_slice(b" near ");
584 full_msg.extend_from_slice(&tok_text);
585 }
586
587 LuaError::syntax_raw(&full_msg)
588}
589
590// LUAI_FUNC → pub(crate)
591// error_sites.tsv: luaX_syntaxerror → return Err(LuaError::syntax(format_args!("msg")))
592/// Report a syntax error at the current token.
593///
594/// # C source
595/// ```c
596///
597/// // lexerror(ls, msg, ls->t.token);
598/// // }
599/// ```
600pub fn syntax_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
601 let token = ls.t.kind;
602 lex_error(ls, msg, token)
603}
604
605/// Report a semantic error at the current line WITHOUT the `near <token>`
606/// suffix.
607///
608/// Mirrors upstream `luaK_semerror` (`lcode.c`), which sets
609/// `ls->t.token = 0` before calling `luaX_syntaxerror` so the `near` clause is
610/// suppressed. Used for attribute errors (`unknown attribute '<name>'`,
611/// `global variables cannot be to-be-closed`) where the offending construct is
612/// the attribute itself, not the current lookahead token.
613pub fn sem_error(ls: &mut LexState, msg: &[u8]) -> LuaError {
614 lex_error(ls, msg, 0)
615}
616
617/// Produce a human-readable representation of `token` for error messages.
618///
619/// For `TK_NAME`, `TK_STRING`, `TK_FLT`, `TK_INT`: formats the current
620/// token buffer contents as `'<text>'`. For everything else, delegates to
621/// [`token2str`].
622///
623/// # C source
624/// ```c
625///
626/// // switch (token) {
627/// // case TK_NAME: case TK_STRING:
628/// // case TK_FLT: case TK_INT:
629/// // save(ls, '\0');
630/// // return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
631/// // default:
632/// // return luaX_token2str(ls, token);
633/// // }
634/// // }
635/// ```
636///
637/// PORT NOTE: C calls `luaO_pushfstring` which pushes the string onto the
638/// Lua stack (stack-anchored temporary). Rust returns `Vec<u8>` directly
639/// since there is no stack-based string lifecycle for error formatting.
640fn txt_token(ls: &mut LexState, token: i32) -> Vec<u8> {
641 match token {
642 t if t == TK_NAME || t == TK_STRING || t == TK_FLT || t == TK_INT => {
643 let mut v: Vec<u8> = Vec::new();
644 v.push(b'\'');
645 let buff = ls.buff.as_slice();
646 let trimmed = if buff.last() == Some(&0) {
647 &buff[..buff.len() - 1]
648 } else {
649 buff
650 };
651 v.extend_from_slice(trimmed);
652 v.push(b'\'');
653 v
654 }
655 _ => token2str_raw(token, ls.version),
656 }
657}
658
659// LUAI_FUNC → pub(crate)
660/// Produce a human-readable token description (for error messages and the parser).
661///
/// Single-byte printable tokens are formatted as `'X'`; non-printable as
/// `'<\N>'`. Reserved words and multi-char symbols are formatted as `'kw'`.
/// Literal tokens (`<name>`, `<string>`, etc.) return the bare label, except
/// under Lua 5.1, where they are quoted like the other named tokens.
665///
666/// # C source
667/// ```c
668///
669/// // if (token < FIRST_RESERVED) {
670/// // if (lisprint(token))
671/// // return luaO_pushfstring(ls->L, "'%c'", token);
672/// // else
673/// // return luaO_pushfstring(ls->L, "'<\\%d>'", token);
674/// // }
675/// // else {
676/// // const char *s = luaX_tokens[token - FIRST_RESERVED];
677/// // if (token < TK_EOS)
678/// // return luaO_pushfstring(ls->L, "'%s'", s);
679/// // else
680/// // return s;
681/// // }
682/// // }
683/// ```
684///
/// PORT NOTE: Unlike the C export, nothing is pushed onto the Lua stack, so
/// the `LexState` parameter is consulted only for its snapshotted `version`.
/// The real formatting is in [`token2str_raw`].
688pub fn token2str(ls: &LexState, token: i32) -> Vec<u8> {
689 token2str_raw(token, ls.version)
690}
691
692/// Inner implementation of [`token2str`] that does not need `LexState`.
693///
694/// PORT NOTE: `version` gates the 5.1 special-token quoting. Upstream 5.1's
695/// `luaX_lexerror`/`error_expected` wrap the whole near/expected token in
696/// `LUA_QS` ('%s'), so the bare multi-char labels (`<eof>`, `<name>`, …) that
697/// `luaX_token2str` returns for `token >= TK_EOS` end up quoted. 5.2 rewrote
698/// `txtToken` to leave those bare and quote only symbols/reserved/literals, so
699/// for 5.2+ the `>= TK_EOS` arm stays unquoted. (Issue #105.)
700fn token2str_raw(token: i32, version: lua_types::LuaVersion) -> Vec<u8> {
701 if token < FIRST_RESERVED {
702 if is_print(token) {
703 vec![b'\'', token as u8, b'\'']
704 } else {
705 // PORT NOTE: uses write! to Vec<u8> to avoid String allocation for Lua data.
706 let mut v: Vec<u8> = Vec::new();
707 v.extend_from_slice(b"'<\\");
708 let _ = write!(&mut v, "{}", token);
709 v.extend_from_slice(b">'");
710 v
711 }
712 } else {
713 let idx = (token - FIRST_RESERVED) as usize;
714 let s = LUAX_TOKENS[idx];
715 if token < TK_EOS || version == lua_types::LuaVersion::V51 {
716 let mut v: Vec<u8> = Vec::with_capacity(s.len() + 2);
717 v.push(b'\'');
718 v.extend_from_slice(s);
719 v.push(b'\'');
720 v
721 } else {
722 s.to_vec()
723 }
724 }
725}
726
727// ── Public init / setup ───────────────────────────────────────────────────────
728
729// LUAI_FUNC → pub(crate)
730/// Initialise the lexer subsystem: intern all reserved words and fix them
731/// in the GC so they are never collected.
732///
733/// Must be called exactly once during VM startup via `luaX_init`.
734///
735/// # C source
736/// ```c
737///
738/// // int i;
739/// // TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */
740/// // luaC_fix(L, obj2gco(e)); /* never collect this name */
741/// // for (i=0; i<NUM_RESERVED; i++) {
742/// // TString *ts = luaS_new(L, luaX_tokens[i]);
743/// // luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */
744/// // ts->extra = cast_byte(i+1); /* reserved word */
745/// // }
746/// // }
747/// ```
748pub fn init(state: &mut LuaState) -> Result<(), LuaError> {
749 // macros.tsv: luaS_newliteral → state.intern_str(b"...")
750 // TODO(port): call state.intern_str(LUA_ENV) once LuaState has that method (Phase B)
751 let _e = intern_str_stub(state, LUA_ENV)?;
752
753 // macros.tsv: luaC_objbarrier / luaC_fix — GC fix; no-op in Phases A-C
754 // TODO(port): state.gc().fix(e) in Phase D
755
756 for i in 0..NUM_RESERVED {
757 // macros.tsv: luaS_new → state.intern_str(...)
758 // TODO(port): call state.intern_str(LUAX_TOKENS[i]) in Phase B
759 let ts = intern_str_stub(state, LUAX_TOKENS[i])?;
760
761 // TODO(port): state.gc().fix(ts.clone()) in Phase D
762
763 // macros.tsv: cast_byte → x as u8
764 // PORT NOTE: LuaString.extra uses Cell<u8> interior mutability.
765 // TODO(port): ts.set_extra((i + 1) as u8) — needs pub accessor on LuaString
766 let _ = ts; // suppress unused warning until Phase B
767 }
768
769 Ok(())
770}
771
772// LUAI_FUNC → pub(crate)
773/// Initialise `ls` for lexing a new chunk from stream `z`.
774///
775/// # C source
776/// ```c
777///
778/// // TString *source, int firstchar) {
779/// // ls->t.token = 0;
780/// // ls->L = L;
781/// // ls->current = firstchar;
782/// // ls->lookahead.token = TK_EOS; /* no look-ahead token */
783/// // ls->z = z;
784/// // ls->fs = NULL;
785/// // ls->linenumber = 1;
786/// // ls->lastline = 1;
787/// // ls->source = source;
788/// // ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */
789/// // luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);
790/// // }
791/// ```
792pub fn set_input(
793 state: &mut LuaState,
794 ls: &mut LexState,
795 z: ZIO,
796 source: GcRef<LuaString>,
797 firstchar: i32,
798) -> Result<(), LuaError> {
799 ls.t = Token::new(0);
800 ls.current = firstchar;
801 ls.lookahead = Token::eos();
802 ls.z = z;
803 ls.fs = None;
804 ls.linenumber = 1;
805 ls.lastline = 1;
806 ls.source = source;
807 ls.version = state.global().lua_version;
808 // macros.tsv: luaS_newliteral → state.intern_str(b"...")
809 // TODO(port): state.intern_str(LUA_ENV) in Phase B
810 ls.envn = intern_str_stub(state, LUA_ENV)?;
811 // macros.tsv: luaZ_resizebuffer → buf.resize(state, size)?
812 ls.buff.resize(state, LUA_MIN_BUFFER)?;
813 Ok(())
814}
815
816// LUAI_FUNC → pub(crate)
/// Create (or retrieve) a Lua string and anchor it for the duration of the
/// parse, so that repeated occurrences of the same literal within a chunk
/// share a single string object.
///
/// C-Lua anchors the string in the table `ls->h`, used as a set (the string is
/// both the key and the value). This port keeps the equivalent mapping in
/// `ls.long_str_anchor`, keyed by the string bytes.
822/// key and the value.
823///
824/// # C source
825/// ```c
826///
827/// // lua_State *L = ls->L;
828/// // TString *ts = luaS_newlstr(L, str, l);
829/// // const TValue *o = luaH_getstr(ls->h, ts);
830/// // if (!ttisnil(o)) /* string already present? */
831/// // ts = keystrval(nodefromval(o)); /* get saved copy */
832/// // else {
833/// // TValue *stv = s2v(L->top.p++); /* reserve stack space */
834/// // setsvalue(L, stv, ts); /* anchor the string */
835/// // luaH_finishset(L, ls->h, stv, o, stv); /* t[string] = string */
836/// // luaC_checkGC(L);
837/// // L->top.p--; /* remove string from stack */
838/// // }
839/// // return ts;
840/// // }
841/// ```
842pub(crate) fn new_string(
843 state: &mut LuaState,
844 ls: &mut LexState,
845 bytes: &[u8],
846) -> Result<GcRef<LuaString>, LuaError> {
847 // PORT NOTE: in C, the anchor table ls->h is a Lua table mapping the string
848 // to itself so a second occurrence of the same literal in the chunk returns
849 // the originally-created TString. We use a plain HashMap on LexState
850 // (`long_str_anchor`) for the equivalent dedup — sufficient because Phase
851 // A-C `GcRef<T>` is `Rc<T>` and identity is determined by the `Rc`
852 // allocation. Short strings already share identity via the global pool;
853 // long strings (>LUAI_MAXSHORTLEN) need this session-level map.
854 if let Some(existing) = ls.long_str_anchor.get(bytes) {
855 return Ok(existing.clone());
856 }
857 let ts = intern_str_stub(state, bytes)?;
858 ls.long_str_anchor.insert(bytes.to_vec(), ts.clone());
859 Ok(ts)
860}
861
862// ── Public advance / lookahead ─────────────────────────────────────────────────
863
864// LUAI_FUNC → pub(crate)
865/// Consume the current token; load the next one from the stream.
866///
867/// If a lookahead token was set, it becomes the current token without re-reading
868/// from the stream.
869///
870/// # C source
871/// ```c
872///
873/// // ls->lastline = ls->linenumber;
874/// // if (ls->lookahead.token != TK_EOS) {
875/// // ls->t = ls->lookahead;
876/// // ls->lookahead.token = TK_EOS;
877/// // }
878/// // else
879/// // ls->t.token = llex(ls, &ls->t.seminfo);
880/// // }
881/// ```
882pub fn next(state: &mut LuaState, ls: &mut LexState) -> Result<(), LuaError> {
883 ls.lastline = ls.linenumber;
884
885 if ls.lookahead.kind != TK_EOS {
886 // Clone to avoid borrow conflict; LuaString inside TokenValue is GcRef (Rc).
887 ls.t = ls.lookahead.clone();
888 ls.lookahead = Token::eos();
889 } else {
890 let mut val = TokenValue::None;
891 let kind = llex(state, ls, &mut val)?;
892 ls.t = Token { kind, value: val };
893 }
894 Ok(())
895}
896
897// LUAI_FUNC → pub(crate)
898/// Peek at the next token without consuming the current one.
899///
900/// The lookahead token is cached in `ls.lookahead` and returned. Only one
901/// token of lookahead is supported; calling this twice without an intervening
902/// [`next`] is a logic error (asserted in debug builds).
903///
904/// # C source
905/// ```c
906///
907/// // lua_assert(ls->lookahead.token == TK_EOS);
908/// // ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
909/// // return ls->lookahead.token;
910/// // }
911/// ```
912pub fn lookahead(state: &mut LuaState, ls: &mut LexState) -> Result<i32, LuaError> {
913 // macros.tsv: lua_assert → debug_assert!
914 debug_assert!(
915 ls.lookahead.kind == TK_EOS,
916 "luaX_lookahead: lookahead already set"
917 );
918
919 let mut val = TokenValue::None;
920 let kind = llex(state, ls, &mut val)?;
921 ls.lookahead = Token { kind, value: val };
922
923 Ok(ls.lookahead.kind)
924}
925
926// ── Private lexer helpers ──────────────────────────────────────────────────────
927
928/// If the current character equals `c`, advance and return `true`.
929///
930/// # C source
931/// ```c
932///
933/// // if (ls->current == c) { next(ls); return 1; }
934/// // else return 0;
935/// // }
936/// ```
937fn check_next1(ls: &mut LexState, c: i32) -> bool {
938 if ls.current == c {
939 advance(ls);
940 true
941 } else {
942 false
943 }
944}
945
946/// If the current character is either of the two bytes in `set`, save-and-advance
947/// and return `true`.
948///
949/// # C source
950/// ```c
951///
952/// // lua_assert(set[2] == '\0');
953/// // if (ls->current == set[0] || ls->current == set[1]) {
954/// // save_and_next(ls);
955/// // return 1;
956/// // }
957/// // else return 0;
958/// // }
959/// ```
960fn check_next2(ls: &mut LexState, state: &mut LuaState, set: &[u8; 2]) -> Result<bool, LuaError> {
961 if ls.current == set[0] as i32 || ls.current == set[1] as i32 {
962 save_and_next(ls, state)?;
963 Ok(true)
964 } else {
965 Ok(false)
966 }
967}
968
969/// Increment the line counter and consume the newline sequence.
970///
971/// Handles `\n`, `\r`, `\n\r`, and `\r\n`.
972///
973/// # C source
974/// ```c
975///
976/// // int old = ls->current;
977/// // lua_assert(currIsNewline(ls));
978/// // next(ls); /* skip '\n' or '\r' */
979/// // if (currIsNewline(ls) && ls->current != old)
980/// // next(ls); /* skip '\n\r' or '\r\n' */
981/// // if (++ls->linenumber >= MAX_INT)
982/// // lexerror(ls, "chunk has too many lines", 0);
983/// // }
984/// ```
985fn inc_line_number(ls: &mut LexState, _state: &mut LuaState) -> Result<(), LuaError> {
986 // macros.tsv: lua_assert → debug_assert!
987 debug_assert!(curr_is_newline(ls), "inc_line_number: not at a newline");
988
989 let old = ls.current;
990 advance(ls);
991
992 if curr_is_newline(ls) && ls.current != old {
993 advance(ls);
994 }
995
996 // macros.tsv: MAX_INT → i32::MAX
997 ls.linenumber += 1;
998 if ls.linenumber >= i32::MAX {
999 return Err(lex_error(ls, b"chunk has too many lines", 0));
1000 }
1001 Ok(())
1002}
1003
1004/// Scan a numeric literal (integer or float, decimal or hex).
1005///
1006/// The caller may have already read an initial dot. Accepts the pattern:
1007/// `%d(%x|%.|(Ee[+-]?))*` or `0[Xx](%x|%.|(Pp[+-]?))*`.
1008///
1009/// Returns `TK_INT` for integers, `TK_FLT` for floats.
1010///
1011/// # C source
1012/// ```c
1013///
1014/// // TValue obj;
1015/// // const char *expo = "Ee";
1016/// // int first = ls->current;
1017/// // lua_assert(lisdigit(ls->current));
1018/// // save_and_next(ls);
1019/// // if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */
1020/// // expo = "Pp";
1021/// // for (;;) {
1022/// // if (check_next2(ls, expo))
1023/// // check_next2(ls, "-+");
1024/// // else if (lisxdigit(ls->current) || ls->current == '.')
1025/// // save_and_next(ls);
1026/// // else break;
1027/// // }
1028/// // if (lislalpha(ls->current)) /* numeral touching a letter? */
1029/// // save_and_next(ls); /* force an error */
1030/// // save(ls, '\0');
1031/// // if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)
1032/// // lexerror(ls, "malformed number", TK_FLT);
1033/// // if (ttisinteger(&obj)) { seminfo->i = ivalue(&obj); return TK_INT; }
1034/// // else { seminfo->r = fltvalue(&obj); return TK_FLT; }
1035/// // }
1036/// ```
1037fn read_numeral(
1038 state: &mut LuaState,
1039 ls: &mut LexState,
1040 seminfo: &mut TokenValue,
1041) -> Result<i32, LuaError> {
1042 let mut expo: &[u8; 2] = b"Ee";
1043
1044 let first = ls.current;
1045
1046 debug_assert!(is_digit(ls.current), "read_numeral: not at a digit");
1047
1048 save_and_next(ls, state)?;
1049
1050 if first == b'0' as i32 && check_next2(ls, state, b"xX")? {
1051 expo = b"Pp";
1052 }
1053
1054 loop {
1055 if check_next2(ls, state, expo)? {
1056 check_next2(ls, state, b"-+")?;
1057 } else if is_xdigit(ls.current) || ls.current == b'.' as i32 {
1058 // save_and_next(ls);
1059 save_and_next(ls, state)?;
1060 } else {
1061 break;
1062 }
1063 }
1064
1065 if is_lalpha(ls.current) {
1066 save_and_next(ls, state)?;
1067 }
1068
1069 // In Rust, luaO_str2num will receive a byte slice; NUL is not needed.
1070 // We save 0 for parity with C, but our str2num stub ignores it.
1071 save(ls, state, 0)?;
1072
1073 // lexerror(ls, "malformed number", TK_FLT);
1074 // macros.tsv: luaZ_buffer → buf.as_mut_slice()
1075 let buf = ls.buff.as_slice();
1076 let num_bytes = if buf.last() == Some(&0) {
1077 &buf[..buf.len() - 1]
1078 } else {
1079 buf
1080 };
1081 let mut obj = lua_types::LuaValue::Nil;
1082 if lua_vm::object::str2num(num_bytes, &mut obj) == 0 {
1083 return Err(lex_error(ls, b"malformed number", TK_FLT));
1084 }
1085 match obj {
1086 lua_types::LuaValue::Int(i) => {
1087 // Lua 5.1/5.2 are float-only: `lua_Number` is the only numeric type,
1088 // so every numeric literal is parsed as a float (`lua_str2number`),
1089 // including ones written without a decimal point. A literal like
1090 // 9007199254740993 therefore loses precision exactly as in lua5.2.4
1091 // (prints `9.007199254741e+15`), rather than surviving as an i64.
1092 if is_float_only(state) {
1093 *seminfo = TokenValue::Float(i as f64);
1094 Ok(TK_FLT)
1095 } else {
1096 *seminfo = TokenValue::Int(i);
1097 Ok(TK_INT)
1098 }
1099 }
1100 lua_types::LuaValue::Float(f) => {
1101 *seminfo = TokenValue::Float(f);
1102 Ok(TK_FLT)
1103 }
1104 _ => unreachable!("str2num returned non-numeric LuaValue"),
1105 }
1106}
1107
1108/// Scan a `[=*[` or `]=*]` sequence; leave the last bracket as current char.
1109///
1110/// Returns:
1111/// - `count + 2` if well-formed (where `count` is the number of `=` signs),
1112/// - `1` if a single bracket with no `=`s and no second bracket,
1113/// - `0` if malformed (e.g. `[==` with no closing bracket).
1114///
1115/// # C source
1116/// ```c
1117///
1118/// // size_t count = 0;
1119/// // int s = ls->current;
1120/// // lua_assert(s == '[' || s == ']');
1121/// // save_and_next(ls);
1122/// // while (ls->current == '=') {
1123/// // save_and_next(ls);
1124/// // count++;
1125/// // }
1126/// // return (ls->current == s) ? count + 2
1127/// // : (count == 0) ? 1
1128/// // : 0;
1129/// // }
1130/// ```
1131fn skip_sep(state: &mut LuaState, ls: &mut LexState) -> Result<usize, LuaError> {
1132 let mut count: usize = 0;
1133 let s = ls.current;
1134 debug_assert!(
1135 s == b'[' as i32 || s == b']' as i32,
1136 "skip_sep: not at bracket"
1137 );
1138
1139 save_and_next(ls, state)?;
1140
1141 while ls.current == b'=' as i32 {
1142 save_and_next(ls, state)?;
1143 count += 1;
1144 }
1145
1146 if ls.current == s {
1147 Ok(count + 2)
1148 } else if count == 0 {
1149 Ok(1)
1150 } else {
1151 Ok(0)
1152 }
1153}
1154
1155/// Scan a long string or long comment delimited by `[=*[` … `]=*]`.
1156///
1157/// `seminfo` is `Some` when reading a string literal; `None` when skipping a
1158/// long comment. When `None`, buffer contents are discarded on each newline
1159/// to avoid wasting memory.
1160///
1161/// # C source
1162/// ```c
1163///
1164/// // int line = ls->linenumber;
1165/// // save_and_next(ls); /* skip 2nd '[' */
1166/// // if (currIsNewline(ls)) inclinenumber(ls);
1167/// // for (;;) {
1168/// // switch (ls->current) {
1169/// // case EOZ: { /* error */
1170/// // const char *what = (seminfo ? "string" : "comment");
1171/// // const char *msg = luaO_pushfstring(..., what, line);
1172/// // lexerror(ls, msg, TK_EOS);
1173/// // break;
1174/// // }
1175/// // case ']': {
1176/// // if (skip_sep(ls) == sep) {
1177/// // save_and_next(ls); /* skip 2nd ']' */
1178/// // goto endloop;
1179/// // }
1180/// // break;
1181/// // }
1182/// // case '\n': case '\r': {
1183/// // save(ls, '\n');
1184/// // inclinenumber(ls);
1185/// // if (!seminfo) luaZ_resetbuffer(ls->buff);
1186/// // break;
1187/// // }
1188/// // default: {
1189/// // if (seminfo) save_and_next(ls);
1190/// // else next(ls);
1191/// // }
1192/// // }
1193/// // } endloop:
1194/// // if (seminfo)
1195/// // seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1196/// // luaZ_bufflen(ls->buff) - 2 * sep);
1197/// // }
1198/// ```
1199fn read_long_string(
1200 state: &mut LuaState,
1201 ls: &mut LexState,
1202 seminfo: Option<&mut TokenValue>,
1203 sep: usize,
1204) -> Result<(), LuaError> {
1205 let line = ls.linenumber;
1206
1207 save_and_next(ls, state)?;
1208
1209 if curr_is_newline(ls) {
1210 inc_line_number(ls, state)?;
1211 }
1212
1213 // is_string: whether we are reading a string (true) or a comment (false)
1214 let is_string = seminfo.is_some();
1215
1216 loop {
1217 match ls.current {
1218 c if c == EOZ => {
1219 let what: &[u8] = if is_string { b"string" } else { b"comment" };
1220 // PORT NOTE: build message as Vec<u8> to avoid String allocation.
1221 let mut msg: Vec<u8> = Vec::new();
1222 msg.extend_from_slice(b"unfinished long ");
1223 msg.extend_from_slice(what);
1224 msg.extend_from_slice(b" (starting at line ");
1225 let _ = write!(&mut msg, "{}", line);
1226 msg.push(b')');
1227 return Err(lex_error(ls, &msg, TK_EOS));
1228 }
1229 c if c == b']' as i32 => {
1230 let s = skip_sep(state, ls)?;
1231 if s == sep {
1232 save_and_next(ls, state)?;
1233 break;
1234 }
1235 // else: the ']' sequence wasn't the closing delimiter; continue
1236 }
1237 c if c == b'\n' as i32 || c == b'\r' as i32 => {
1238 save(ls, state, b'\n' as i32)?;
1239 inc_line_number(ls, state)?;
1240 // macros.tsv: luaZ_resetbuffer → buf.clear()
1241 if !is_string {
1242 ls.buff.clear();
1243 }
1244 }
1245 _ => {
1246 if is_string {
1247 save_and_next(ls, state)?;
1248 } else {
1249 advance(ls);
1250 }
1251 }
1252 }
1253 }
1254
1255 // seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
1256 // luaZ_bufflen(ls->buff) - 2 * sep);
1257 if let Some(out) = seminfo {
1258 // The buffer contains: sep bytes of '[=' + content + sep bytes of '=]'
1259 // We want the content in between.
1260 // PORT NOTE: per PORTING.md §4.3, capture the slice into an owned
1261 // Vec so the immutable borrow of ls.buff is dropped before the
1262 // mutable borrow needed by new_string.
1263 let buf = ls.buff.as_slice();
1264 let content: Vec<u8> = buf[sep..buf.len() - sep].to_vec();
1265 let ts = new_string(state, ls, &content)?;
1266 *out = TokenValue::Str(ts);
1267 }
1268 Ok(())
1269}
1270
1271/// Check `c` is non-zero (truthy); if not, save the current char and raise a
1272/// string-escape error.
1273///
1274/// # C source
1275/// ```c
1276///
1277/// // if (!c) {
1278/// // if (ls->current != EOZ)
1279/// // save_and_next(ls); /* add current to buffer for error message */
1280/// // lexerror(ls, msg, TK_STRING);
1281/// // }
1282/// // }
1283/// ```
1284fn esc_check(
1285 state: &mut LuaState,
1286 ls: &mut LexState,
1287 ok: bool,
1288 msg: &[u8],
1289) -> Result<(), LuaError> {
1290 if !ok {
1291 if ls.current != EOZ {
1292 save_and_next(ls, state)?;
1293 }
1294 return Err(lex_error(ls, msg, TK_STRING));
1295 }
1296 Ok(())
1297}
1298
1299/// Save-and-advance, then verify the new current char is a hex digit; return
1300/// its numeric value (0-15).
1301///
1302/// # C source
1303/// ```c
1304///
1305/// // save_and_next(ls);
1306/// // esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
1307/// // return luaO_hexavalue(ls->current);
1308/// // }
1309/// ```
1310fn get_hexa(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1311 save_and_next(ls, state)?;
1312 esc_check(
1313 state,
1314 ls,
1315 is_xdigit(ls.current),
1316 b"hexadecimal digit expected",
1317 )?;
1318 // TODO(port): call lua_vm::object::hex_value in Phase B
1319 Ok(hex_value_stub(ls.current))
1320}
1321
1322/// Scan a `\xNN` hex escape; return the decoded byte value.
1323///
1324/// # C source
1325/// ```c
1326///
1327/// // int r = gethexa(ls);
1328/// // r = (r << 4) + gethexa(ls);
1329/// // luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */
1330/// // return r;
1331/// // }
1332/// ```
1333fn read_hex_esc(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1334 let r = get_hexa(state, ls)?;
1335 let r = (r << 4) + get_hexa(state, ls)?;
1336 // macros.tsv: luaZ_buffremove → buf.truncate_by(i)
1337 ls.buff.truncate_by(2);
1338 Ok(r)
1339}
1340
1341/// Scan a `\u{XXXXXX}` UTF-8 escape; return the Unicode codepoint.
1342///
1343/// # C source
1344/// ```c
1345///
1346/// // unsigned long r;
1347/// // int i = 4; /* chars to remove: '\', 'u', '{', first digit */
1348/// // save_and_next(ls); /* skip 'u' */
1349/// // esccheck(ls, ls->current == '{', "missing '{'");
1350/// // r = gethexa(ls); /* must have at least one digit */
1351/// // while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
1352/// // i++;
1353/// // esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
1354/// // r = (r << 4) + luaO_hexavalue(ls->current);
1355/// // }
1356/// // esccheck(ls, ls->current == '}', "missing '}'");
1357/// // next(ls); /* skip '}' */
1358/// // luaZ_buffremove(ls->buff, i);
1359/// // return r;
1360/// // }
1361/// ```
1362fn read_utf8_esc(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1363 let mut i: usize = 4;
1364
1365 save_and_next(ls, state)?;
1366
1367 esc_check(state, ls, ls.current == b'{' as i32, b"missing '{'")?;
1368
1369 let mut r = get_hexa(state, ls)?;
1370
1371 // The codepoint upper bound is version-gated and the C control flow differs
1372 // between families (`llex.c readutf8esc`):
1373 // * 5.3 (L336-340): `r = (r<<4)+digit; esccheck(r <= 0x10FFFF, ...)` —
1374 // accumulate the digit FIRST, then bound the running value at 0x10FFFF.
1375 // * 5.4 (L351) / 5.5 (L373): `esccheck(r <= (0x7FFFFFFFu >> 4), ...);
1376 // r = (r<<4)+digit` — bound BEFORE the shift, allowing up to 0x7FFFFFFF.
1377 // The order (check-before-shift vs shift-before-check) is reproduced exactly
1378 // because it also determines how many digits land in the `near '...'` buffer
1379 // snippet of the error message.
1380 let is_v53 = matches!(state.global().lua_version, lua_types::LuaVersion::V53);
1381
1382 // cast_void: discard return value
1383 loop {
1384 save_and_next(ls, state)?;
1385 if !is_xdigit(ls.current) {
1386 break;
1387 }
1388 i += 1;
1389 if is_v53 {
1390 // TODO(port): lua_vm::object::hex_value in Phase B
1391 r = (r << 4) + hex_value_stub(ls.current);
1392 esc_check(state, ls, r <= 0x10_FFFF, b"UTF-8 value too large")?;
1393 } else {
1394 esc_check(
1395 state,
1396 ls,
1397 r <= (0x7FFF_FFFFu32 >> 4),
1398 b"UTF-8 value too large",
1399 )?;
1400 // TODO(port): lua_vm::object::hex_value in Phase B
1401 r = (r << 4) + hex_value_stub(ls.current);
1402 }
1403 }
1404
1405 esc_check(state, ls, ls.current == b'}' as i32, b"missing '}'")?;
1406
1407 advance(ls);
1408
1409 ls.buff.truncate_by(i);
1410
1411 Ok(r)
1412}
1413
1414/// Scan `\u{...}` and append the UTF-8 encoding of the codepoint to the buffer.
1415///
1416/// # C source
1417/// ```c
1418///
1419/// // char buff[UTF8BUFFSZ];
1420/// // int n = luaO_utf8esc(buff, readutf8esc(ls));
1421/// // for (; n > 0; n--)
1422/// // save(ls, buff[UTF8BUFFSZ - n]);
1423/// // }
1424/// ```
1425fn utf8_esc(state: &mut LuaState, ls: &mut LexState) -> Result<(), LuaError> {
1426 let codepoint = read_utf8_esc(state, ls)?;
1427
1428 // macros.tsv: UTF8BUFFSZ → const UTF8_BUF_SZ: usize = 8
1429 // TODO(port): call lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
1430 // For Phase A, encode directly here.
1431 let encoded = utf8_encode_stub(codepoint);
1432
1433 for &b in &encoded {
1434 save(ls, state, b as i32)?;
1435 }
1436 Ok(())
1437}
1438
1439/// Scan a decimal escape `\ddd` (up to 3 digits); return the byte value.
1440///
1441/// # C source
1442/// ```c
1443///
1444/// // int i;
1445/// // int r = 0;
1446/// // for (i = 0; i < 3 && lisdigit(ls->current); i++) {
1447/// // r = 10*r + ls->current - '0';
1448/// // save_and_next(ls);
1449/// // }
1450/// // esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
1451/// // luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */
1452/// // return r;
1453/// // }
1454/// ```
1455fn read_dec_esc(state: &mut LuaState, ls: &mut LexState) -> Result<u32, LuaError> {
1456 let mut i: usize = 0;
1457 let mut r: u32 = 0;
1458
1459 while i < 3 && is_digit(ls.current) {
1460 r = 10 * r + (ls.current as u32 - b'0' as u32);
1461 save_and_next(ls, state)?;
1462 i += 1;
1463 }
1464
1465 // UCHAR_MAX = 255 = u8::MAX. Lua 5.1 spells this `escape sequence too
1466 // large` (the `decimal escape too large` wording is 5.2+). Verified against
1467 // lua5.1.5; see specs/followup/5.1-roster-syntax.md §2.
1468 let too_large_msg: &[u8] = if matches!(state.global().lua_version, lua_types::LuaVersion::V51) {
1469 b"escape sequence too large"
1470 } else {
1471 b"decimal escape too large"
1472 };
1473 esc_check(state, ls, r <= u8::MAX as u32, too_large_msg)?;
1474
1475 ls.buff.truncate_by(i);
1476 Ok(r)
1477}
1478
1479/// Scan a short (single/double-quoted) string literal.
1480///
1481/// The C function uses `goto read_save / only_save / no_save` for escape
1482/// handling. In Rust this is replaced by the `EscapeResult` enum.
1483///
1484/// # C source (see llex.c lines 382-442 for full listing)
1485fn read_string(
1486 state: &mut LuaState,
1487 ls: &mut LexState,
1488 del: i32,
1489 seminfo: &mut TokenValue,
1490) -> Result<(), LuaError> {
1491 // Encoding for what the escape sequence handler needs to do after decoding.
1492 //
1493 // read_save: advance(ls), remove '\' from buffer, save decoded byte
1494 // only_save: remove '\' from buffer, save decoded byte (no advance)
1495 // no_save: nothing (just break from the escape case)
1496 enum EscapeResult {
1497 ReadSave(i32),
1498 OnlySave(i32),
1499 NoSave,
1500 }
1501
1502 save_and_next(ls, state)?;
1503
1504 while ls.current != del {
1505 match ls.current {
1506 c if c == EOZ => {
1507 return Err(lex_error(ls, b"unfinished string", TK_EOS));
1508 }
1509 c if c == b'\n' as i32 || c == b'\r' as i32 => {
1510 return Err(lex_error(ls, b"unfinished string", TK_STRING));
1511 }
1512 c if c == b'\\' as i32 => {
1513 save_and_next(ls, state)?;
1514
1515 // Lua 5.1's lexer does NOT recognize `\x`, `\z`, or `\u`, and it
1516 // does NOT raise on an unknown escape. For any escape char outside
1517 // the known set, the 5.1 lexer silently drops the backslash and
1518 // keeps the next character verbatim (`"\x41"` → bytes `x41`,
1519 // `"\z"` → `z`, `"\q"` → `q`). Decimal escapes (`\ddd`) and the
1520 // standard letter/quote/newline escapes still work. Verified
1521 // against lua5.1.5; see specs/followup/5.1-roster-syntax.md §2.
1522 let is_v51 = matches!(state.global().lua_version, lua_types::LuaVersion::V51);
1523
1524 // Inner switch on the escape character
1525 let esc = match ls.current {
1526 c if c == b'a' as i32 => EscapeResult::ReadSave(b'\x07' as i32),
1527 c if c == b'b' as i32 => EscapeResult::ReadSave(b'\x08' as i32),
1528 c if c == b'f' as i32 => EscapeResult::ReadSave(b'\x0C' as i32),
1529 c if c == b'n' as i32 => EscapeResult::ReadSave(b'\n' as i32),
1530 c if c == b'r' as i32 => EscapeResult::ReadSave(b'\r' as i32),
1531 c if c == b't' as i32 => EscapeResult::ReadSave(b'\t' as i32),
1532 c if c == b'v' as i32 => EscapeResult::ReadSave(b'\x0B' as i32),
1533 c if c == b'x' as i32 && !is_v51 => {
1534 let decoded = read_hex_esc(state, ls)?;
1535 EscapeResult::ReadSave(decoded as i32)
1536 }
1537 c if c == b'u' as i32 && !is_v51 => {
1538 utf8_esc(state, ls)?;
1539 EscapeResult::NoSave
1540 }
1541 c if c == b'\n' as i32 || c == b'\r' as i32 => {
1542 inc_line_number(ls, state)?;
1543 EscapeResult::OnlySave(b'\n' as i32)
1544 }
1545 c if c == b'\\' as i32 || c == b'"' as i32 || c == b'\'' as i32 => {
1546 EscapeResult::ReadSave(c)
1547 }
1548 c if c == EOZ => EscapeResult::NoSave,
1549 c if c == b'z' as i32 && !is_v51 => {
1550 ls.buff.truncate_by(1);
1551 advance(ls);
1552 while is_space(ls.current) {
1553 if curr_is_newline(ls) {
1554 inc_line_number(ls, state)?;
1555 } else {
1556 advance(ls);
1557 }
1558 }
1559 EscapeResult::NoSave
1560 }
1561 c if is_v51 && !is_digit(c) => {
1562 // 5.1 unknown escape: drop the backslash, emit the char.
1563 EscapeResult::ReadSave(c)
1564 }
1565 _ => {
1566 esc_check(state, ls, is_digit(ls.current), b"invalid escape sequence")?;
1567 let decoded = read_dec_esc(state, ls)?;
1568 EscapeResult::OnlySave(decoded as i32)
1569 }
1570 };
1571
1572 // Dispatch the C goto targets as match arms.
1573 match esc {
1574 EscapeResult::ReadSave(c) => {
1575 advance(ls);
1576 ls.buff.truncate_by(1);
1577 save(ls, state, c)?;
1578 }
1579 EscapeResult::OnlySave(c) => {
1580 ls.buff.truncate_by(1);
1581 save(ls, state, c)?;
1582 }
1583 EscapeResult::NoSave => {}
1584 }
1585 }
1586 _ => {
1587 save_and_next(ls, state)?;
1588 }
1589 }
1590 }
1591
1592 save_and_next(ls, state)?;
1593
1594 // luaZ_bufflen(ls->buff) - 2);
1595 // Buffer contains: delimiter + content + delimiter; strip both delimiters.
1596 // PORT NOTE: capture into owned Vec to drop the borrow before new_string.
1597 let buf = ls.buff.as_slice();
1598 let content: Vec<u8> = if buf.len() >= 2 {
1599 buf[1..buf.len() - 1].to_vec()
1600 } else {
1601 Vec::new()
1602 };
1603 let ts = new_string(state, ls, &content)?;
1604 *seminfo = TokenValue::Str(ts);
1605 Ok(())
1606}
1607
1608/// Core lexer dispatch: consume and return the next raw token kind.
1609///
1610/// This is the heart of the lexer: a large `for`-`switch` loop that classifies
1611/// the current character and dispatches to the appropriate scanner.
1612///
1613/// # C source (see llex.c lines 445-562 for full listing)
1614/// Whether the active version is the float-only legacy family (5.1/5.2), which
1615/// lacks the 5.3 integer operators (`//`, `<<`, `>>`, and the bitwise binops).
1616fn is_float_only(state: &LuaState) -> bool {
1617 matches!(
1618 state.global().lua_version,
1619 lua_types::LuaVersion::V51 | lua_types::LuaVersion::V52
1620 )
1621}
1622
1623fn llex(
1624 state: &mut LuaState,
1625 ls: &mut LexState,
1626 seminfo: &mut TokenValue,
1627) -> Result<i32, LuaError> {
1628 // macros.tsv: luaZ_resetbuffer → buf.clear()
1629 ls.buff.clear();
1630
1631 loop {
1632 match ls.current {
1633 c if c == b'\n' as i32 || c == b'\r' as i32 => {
1634 inc_line_number(ls, state)?;
1635 // PORT NOTE: skipcomment-equivalent. luaL_loadfile in C-Lua
1636 // strips a leading '#' line (Unix shebang). Our test harness
1637 // prepends a global-setup preamble to every official test, so
1638 // the script's '#' line is not at byte zero. Apply the same
1639 // rule at any token-scan line start: treat a line whose first
1640 // character is '#' as a single-line comment. This sits in
1641 // llex's dispatch loop (not inc_line_number) so it does not
1642 // affect newlines inside long-bracket strings.
1643 if ls.current == b'#' as i32 {
1644 while !curr_is_newline(ls) && ls.current != EOZ {
1645 advance(ls);
1646 }
1647 }
1648 }
1649
1650 c if c == b' ' as i32
1651 || c == b'\x0C' as i32
1652 || c == b'\t' as i32
1653 || c == b'\x0B' as i32 =>
1654 {
1655 advance(ls);
1656 }
1657
1658 c if c == b'-' as i32 => {
1659 advance(ls);
1660 if ls.current != b'-' as i32 {
1661 return Ok(b'-' as i32);
1662 }
1663 advance(ls);
1664
1665 if ls.current == b'[' as i32 {
1666 let sep = skip_sep(state, ls)?;
1667 ls.buff.clear();
1668 if sep >= 2 {
1669 read_long_string(state, ls, None, sep)?;
1670 ls.buff.clear();
1671 continue;
1672 }
1673 }
1674 while !curr_is_newline(ls) && ls.current != EOZ {
1675 advance(ls);
1676 }
1677 // loop continues (no token emitted for comments)
1678 }
1679
1680 c if c == b'[' as i32 => {
1681 let sep = skip_sep(state, ls)?;
1682 if sep >= 2 {
1683 read_long_string(state, ls, Some(seminfo), sep)?;
1684 return Ok(TK_STRING);
1685 } else if sep == 0 {
1686 return Err(lex_error(ls, b"invalid long string delimiter", TK_STRING));
1687 }
1688 // sep == 1: plain '[', no long string
1689 return Ok(b'[' as i32);
1690 }
1691
1692 c if c == b'=' as i32 => {
1693 advance(ls);
1694 if check_next1(ls, b'=' as i32) {
1695 return Ok(TK_EQ);
1696 }
1697 return Ok(b'=' as i32);
1698 }
1699
1700 c if c == b'<' as i32 => {
1701 advance(ls);
1702 if check_next1(ls, b'=' as i32) {
1703 return Ok(TK_LE);
1704 } else if !is_float_only(state) && check_next1(ls, b'<' as i32) {
1705 // The `<<` shift operator is a Lua 5.3 addition. Under the
1706 // float-only legacy family (5.1/5.2) it does not exist: a
1707 // bare `<` is returned, so a second `<` then surfaces
1708 // upstream's "unexpected symbol near '<'".
1709 return Ok(TK_SHL);
1710 }
1711 return Ok(b'<' as i32);
1712 }
1713
1714 c if c == b'>' as i32 => {
1715 advance(ls);
1716 if check_next1(ls, b'=' as i32) {
1717 return Ok(TK_GE);
1718 } else if !is_float_only(state) && check_next1(ls, b'>' as i32) {
1719 // `>>` is a 5.3 addition; absent in 5.1/5.2.
1720 return Ok(TK_SHR);
1721 }
1722 return Ok(b'>' as i32);
1723 }
1724
1725 c if c == b'/' as i32 => {
1726 advance(ls);
1727 if !is_float_only(state) && check_next1(ls, b'/' as i32) {
1728 // Floor division `//` is a 5.3 addition; absent in 5.1/5.2,
1729 // where the second `/` becomes "unexpected symbol near '/'".
1730 return Ok(TK_IDIV);
1731 }
1732 return Ok(b'/' as i32);
1733 }
1734
1735 c if c == b'~' as i32 => {
1736 advance(ls);
1737 if check_next1(ls, b'=' as i32) {
1738 return Ok(TK_NE);
1739 }
1740 return Ok(b'~' as i32);
1741 }
1742
1743 c if c == b':' as i32 => {
1744 advance(ls);
1745 // Lua 5.1 has no `::label::` token; `::` was added with `goto` in
1746 // 5.2. Under V51 the second `:` is left for the parser, which
1747 // reports `unexpected symbol near ':'`. See
1748 // specs/followup/5.1-roster-syntax.md §2.
1749 let is_v51 = matches!(state.global().lua_version, lua_types::LuaVersion::V51);
1750 if !is_v51 && check_next1(ls, b':' as i32) {
1751 return Ok(TK_DBCOLON);
1752 }
1753 return Ok(b':' as i32);
1754 }
1755
1756 c if c == b'"' as i32 || c == b'\'' as i32 => {
1757 let del = ls.current;
1758 read_string(state, ls, del, seminfo)?;
1759 return Ok(TK_STRING);
1760 }
1761
1762 c if c == b'.' as i32 => {
1763 save_and_next(ls, state)?;
1764 if check_next1(ls, b'.' as i32) {
1765 if check_next1(ls, b'.' as i32) {
1766 return Ok(TK_DOTS);
1767 }
1768 return Ok(TK_CONCAT);
1769 } else if !is_digit(ls.current) {
1770 return Ok(b'.' as i32);
1771 } else {
1772 return read_numeral(state, ls, seminfo);
1773 }
1774 }
1775
1776 c if is_digit(c) => {
1777 return read_numeral(state, ls, seminfo);
1778 }
1779
1780 c if c == EOZ => {
1781 return Ok(TK_EOS);
1782 }
1783
1784 c => {
1785 if is_lalpha(c) {
1786 loop {
1787 save_and_next(ls, state)?;
1788 if !is_lalnum(ls.current) {
1789 break;
1790 }
1791 }
1792
1793 // PORT NOTE: copy buffer bytes to drop borrow before new_string.
1794 let content: Vec<u8> = ls.buff.as_slice().to_vec();
1795 let ts = new_string(state, ls, &content)?;
1796
1797 // PORT NOTE: canonical `lua_types::LuaString` lacks the `extra`
1798 // byte that C-Lua uses to mark reserved words. Recover the
1799 // keyword index directly from the interned bytes via the
1800 // `LUAX_TOKENS` table; the first `NUM_RESERVED` entries are
1801 // the keywords in declaration order, so token id =
1802 // `FIRST_RESERVED + index`.
1803 let reserved_token: Option<i32> = LUAX_TOKENS[..NUM_RESERVED]
1804 .iter()
1805 .position(|kw| *kw == content.as_slice())
1806 .map(|i| FIRST_RESERVED + i as i32);
1807 *seminfo = TokenValue::Str(ts);
1808
1809 if let Some(tk) = reserved_token {
1810 // Lua 5.1 has no `goto` keyword — `goto` is an ordinary
1811 // identifier (`local goto = 5` is valid). The keyword and
1812 // the `::label::` grammar were added in 5.2. So under V51
1813 // `goto` lexes as a plain name; the parser then treats
1814 // `goto done` as a name beginning an assignment, yielding
1815 // the incidental `'=' expected near 'done'` the oracle
1816 // reports. See specs/followup/5.1-roster-syntax.md §2.
1817 if tk == TK_GOTO
1818 && matches!(state.global().lua_version, lua_types::LuaVersion::V51)
1819 {
1820 return Ok(TK_NAME);
1821 }
1822 return Ok(tk);
1823 }
1824
1825 // Lua 5.5: with the upstream-default `LUA_COMPAT_GLOBAL`, the
1826 // `global` declaration word is NOT reserved — `global` stays a
1827 // valid identifier, and the parser recognizes the declaration
1828 // statement contextually (see `globalstat` in lua-parse). So
1829 // `global` always lexes as a plain name, on every version.
1830 return Ok(TK_NAME);
1831 } else {
1832 let tok = ls.current;
1833 advance(ls);
1834 return Ok(tok);
1835 }
1836 }
1837 }
1838 }
1839}
1840
1841// ── Phase A stubs for cross-crate helpers ──────────────────────────────────────
1842//
1843// The functions below stand in for cross-crate calls that cannot resolve in
1844// Phase A. They will be replaced by proper imports in Phase B.
1845
1846// TODO(port): replace with state.intern_str(bytes) once LuaState gains that
1847// method (from lua_vm::string::new_lstr wired in Phase B).
1848// TODO_ARCH(phase-b-reconcile): canonical LuaString is constructed via
1849// from_bytes; once LuaState::intern_str is wired, route through there instead.
1850fn intern_str_stub(state: &mut LuaState, bytes: &[u8]) -> Result<GcRef<LuaString>, LuaError> {
1851 state.intern_str(bytes)
1852}
1853
// TODO(port): replace with lua_vm::object::hex_value(c) in Phase B.
/// Numeric value (0–15) of the ASCII hexadecimal digit `c`.
///
/// Accepts `0-9`, `a-f` and `A-F`. Anything else — including values outside
/// the byte range such as an end-of-input marker — yields `0`.
fn hex_value_stub(c: i32) -> u32 {
    u8::try_from(c)
        .ok()
        .and_then(|byte| char::from(byte).to_digit(16))
        .unwrap_or(0)
}
1863
// TODO(port): replace with lua_vm::object::utf8_esc_encode(codepoint) in Phase B.
/// Encode `codepoint` as a Lua-extended UTF-8 byte sequence (1 to 6 bytes).
///
/// Port of `luaO_utf8esc` from lobject.c. Values below `0x80` are a single
/// byte. Larger values are written back-to-front into a scratch array, as in
/// C: continuation bytes carrying six bits each, then the lead byte. Lua
/// accepts codepoints up to `0x7FFFFFFF` (debug-asserted), so 5- and 6-byte
/// sequences — not strict UTF-8 — can be produced, as exercised by the
/// `\u{...}` cases in literals.lua.
fn utf8_encode_stub(codepoint: u32) -> Vec<u8> {
    debug_assert!(codepoint <= 0x7FFF_FFFF);
    if codepoint < 0x80 {
        return vec![codepoint as u8];
    }

    // Filled from the end, so the finished sequence is `scratch[start..]`.
    let mut scratch = [0u8; 8];
    let mut start = scratch.len();
    let mut remaining = codepoint;
    // Largest value that still fits in the lead byte's payload bits.
    let mut lead_capacity: u32 = 0x3f;

    // `codepoint >= 0x80` guarantees at least one continuation byte.
    while remaining > lead_capacity {
        start -= 1;
        scratch[start] = 0x80 | ((remaining & 0x3f) as u8);
        remaining >>= 6; // drop the bits just emitted
        lead_capacity >>= 1; // one payload bit fewer in the lead byte
    }

    // Lead byte: length marker bits followed by the leftover payload.
    start -= 1;
    scratch[start] = ((!lead_capacity << 1) | remaining) as u8;

    scratch[start..].to_vec()
}
1890
1891// ──────────────────────────────────────────────────────────────────────────────
1892// PORT STATUS
1893// source: src/llex.c (581 lines, 24 functions)
1894// src/llex.h (91 lines; merged)
1895// target_crate: lua-lex
1896// confidence: medium
1897// todos: 18
1898// port_notes: 12
1899// unsafe_blocks: 0 (must be 0 outside explicit unsafe-budget crates)
1900// notes: Logic is faithful to the C. The main structural differences:
1901// (1) LexState.L removed — state threaded via fn params;
1902// (2) save/save_and_next/inclinenumber/helpers are all fallible
1903// (Result<_, LuaError>) because lexerror is no longer noreturn;
1904// (3) goto read_save/only_save/no_save in read_string replaced
1905// by EscapeResult enum; (4) Cross-crate calls (intern_str,
1906// luaH_getstr/finishset, luaG_addinfo, luaO_str2num,
1907// luaO_hexavalue, luaO_utf8esc, luaC_fix, luaC_checkGC) are
1908// stubbed with TODO; (5) LuaError, LuaString, ZIO, LexBuffer,
1909// LuaState defined as local stubs — Phase B replaces with real
1910// imports once the crate graph is wired. Key Phase B tasks:
1911// wire import paths; move LuaString.extra accessor to pub;
1912// implement luaX_newstring anchor-table logic. Numeric
1913// literal parsing now delegates to lua_vm::object::str2num
1914// (handles hex integers with wrap-around and hex floats).
1915// ──────────────────────────────────────────────────────────────────────────────