Skip to main content

lua_stdlib/
utf8_lib.rs

1//! UTF-8 standard library for Lua 5.4.
2//!
3//! Port of `lutf8lib.c` (291 lines, 9 functions).
4//!
5//! Provides the `utf8` module with `char`, `codepoint`, `codes`, `len`,
6//! `offset`, and `charpattern`. Supports both strict (Unicode-conformant)
7//! and lax (extended UTF-8, up to `MAX_UTF = 0x7FFFFFFF`) decoding modes.
8//!
9//! Strict mode rejects surrogates (U+D800..U+DFFF) and values above U+10FFFF.
10//! Lax mode accepts any well-formed byte sequence with a value ≤ MAX_UTF.
11
12use lua_types::error::LuaError;
13use lua_types::value::LuaValue;
14use lua_types::closure::LuaClosure;
15use lua_types::{LuaType, LuaStatus};
16use crate::state_stub::{LuaState, LuaStateStubExt as _, lua_CFunction, upvalue_index, CompareOp, LuaDebug};
17
/// Largest code point defined by Unicode; the ceiling enforced in strict mode.
///
/// C: `#define MAXUNICODE 0x10FFFFu`
const MAX_UNICODE: u32 = 0x0010_FFFF;

/// Largest value representable in extended (up to six byte) UTF-8: 2^31 - 1.
///
/// C: `#define MAXUTF 0x7FFFFFFFu`
const MAX_UTF: u32 = (1 << 31) - 1;

/// Integer type used for decoded code points.
///
/// C: `typedef unsigned int utfint;` — `MAX_UTF` needs 31 bits, which a `u32`
/// always provides.
type UtfInt = u32;

/// Lua pattern that matches exactly one UTF-8 byte sequence.
///
/// C: `#define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"`
/// The pattern is 14 bytes long and contains an embedded NUL, so it has to be
/// handled as a byte string with an explicit length, never as a C string.
const UTF8_PATT: &[u8] = b"[\0-\x7F\xC2-\xFD][\x80-\xBF]*";
31
32// ── Internal helpers ───────────────────────────────────────────────────────
33
/// Resolve a possibly-negative string position against a string of `len` bytes.
///
/// * A non-negative `pos` is returned unchanged.
/// * A negative `pos` counts back from the end (`-1` is the last byte).
/// * A negative `pos` whose magnitude exceeds `len` resolves to `0`.
///
/// C: `static lua_Integer u_posrelat(lua_Integer pos, size_t len)`
fn pos_relat(pos: i64, len: usize) -> i64 {
    if pos >= 0 {
        return pos;
    }

    // `unsigned_abs` yields the magnitude without overflowing on `i64::MIN`,
    // mirroring C's `0u - (size_t)pos`.
    let distance_from_end = pos.unsigned_abs();
    if distance_from_end > len as u64 {
        return 0;
    }

    // Here `-len <= pos <= -1`, so the sum lies in `1..=len`.
    len as i64 + pos + 1
}
53
/// Tell whether `c` is a UTF-8 continuation byte, i.e. has the bit pattern
/// `10xxxxxx` (the range `0x80..=0xBF`).
///
/// C: `#define iscont(c)  (((c) & 0xC0) == 0x80)`
#[inline]
fn is_cont(c: u8) -> bool {
    matches!(c, 0x80..=0xBF)
}
61
/// Tell whether the byte at 0-based index `pos` of `s` is a UTF-8
/// continuation byte (`10xxxxxx`).
///
/// Any position outside the slice — negative, or at/after `s.len()` — is
/// reported as "not a continuation byte".
///
/// C: `#define iscontp(p)  iscont(*(p))` with `p = s + pos`. The C code relies
/// on the string's trailing NUL (never a continuation byte) to stop scans at
/// the end; the explicit bounds handling here plays that role.
#[inline]
fn is_cont_at(s: &[u8], pos: i64) -> bool {
    if pos < 0 {
        false
    } else {
        match s.get(pos as usize) {
            Some(&byte) => (byte & 0xC0) == 0x80,
            None => false,
        }
    }
}
75
76/// Decode one UTF-8 sequence from the start of `s`.
77///
78/// Returns `None` if the byte sequence is invalid.
79/// Returns `Some((remaining_slice, codepoint))` on success.
80///
81/// When `strict` is `true`, surrogates and values above `MAX_UNICODE` are
82/// rejected. When `false`, any value ≤ `MAX_UTF` is accepted (extended UTF-8).
83///
84/// C: `static const char *utf8_decode(const char *s, utfint *val, int strict)`
85fn utf8_decode(s: &[u8], strict: bool) -> Option<(&[u8], UtfInt)> {
86    // C: static const utfint limits[] = {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
87    // LIMITS[count] is the minimum value for a sequence with `count` continuation bytes.
88    // LIMITS[0] = u32::MAX forces an error when a non-ASCII byte has no continuation bytes.
89    const LIMITS: [UtfInt; 6] = [u32::MAX, 0x80, 0x800, 0x10000, 0x200000, 0x4000000];
90
91    if s.is_empty() {
92        return None;
93    }
94
95    // C: unsigned int c = (unsigned char)s[0];
96    let mut c = s[0] as u32;
97    let res: UtfInt;
98    let advance: usize;
99
100    if c < 0x80 {
101        // ASCII fast path — no continuation bytes needed.
102        res = c;
103        advance = 1;
104    } else {
105        // C: int count = 0; utfint res = 0;
106        let mut count: usize = 0;
107        let mut r: UtfInt = 0;
108
109        // C: for (; c & 0x40; c <<= 1) { unsigned int cc = (unsigned char)s[++count]; ... }
110        // The C for-loop runs the body first, then applies `c <<= 1` as the update.
111        while c & 0x40 != 0 {
112            // C: unsigned int cc = (unsigned char)s[++count];
113            count += 1;
114            if count >= s.len() {
115                return None; // string too short for the indicated sequence length
116            }
117            let cc = s[count] as u32;
118
119            // C: if (!iscont(cc)) return NULL;
120            if (cc & 0xC0) != 0x80 {
121                return None; // expected continuation byte, got something else
122            }
123
124            // C: res = (res << 6) | (cc & 0x3F);
125            r = (r << 6) | (cc & 0x3F);
126
127            // C for-loop update: c <<= 1
128            c <<= 1;
129        }
130
131        // C: res |= ((utfint)(c & 0x7F) << (count * 5));
132        r |= (c & 0x7F) << (count as u32 * 5);
133
134        // C: if (count > 5 || res > MAXUTF || res < limits[count]) return NULL;
135        if count > 5 || r > MAX_UTF || r < LIMITS[count] {
136            return None; // invalid (overlong, too large, or excess continuation bytes)
137        }
138
139        res = r;
140        // C: s += count; return s + 1; → total bytes consumed = count + 1
141        advance = count + 1;
142        if advance > s.len() {
143            return None;
144        }
145    }
146
147    // C: if (strict) { if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu)) return NULL; }
148    if strict && (res > MAX_UNICODE || (0xD800 <= res && res <= 0xDFFF)) {
149        return None; // surrogate or out-of-Unicode-range value in strict mode
150    }
151
152    Some((&s[advance..], res))
153}
154
155/// Encode a codepoint (≤ `MAX_UTF`) as extended UTF-8 bytes.
156///
157/// Mirrors `luaO_utf8esc` from `lobject.c`, which fills a fixed buffer backwards.
158/// This Rust version builds the bytes naturally and returns a `Vec<u8>`.
159///
160/// C: `int luaO_utf8esc(char *buff, unsigned long x)` (lobject.c)
161fn encode_utf8_codepoint(code: u32) -> Vec<u8> {
162    debug_assert!(code <= MAX_UTF);
163
164    // C: if (x < 0x80) buff[UTF8BUFFSZ - 1] = cast_char(x);
165    if code < 0x80 {
166        return vec![code as u8];
167    }
168
169    let mut x = code;
170    // C: unsigned int mfb = 0x3f;  — maximum value that fits in the first byte
171    let mut mfb: u32 = 0x3F;
172    // Continuation bytes built in reverse, then reversed at the end.
173    let mut bytes_rev: Vec<u8> = Vec::with_capacity(6);
174
175    // C: do { buff[UTF8BUFFSZ - (n++)] = cast_char(0x80 | (x & 0x3f)); x >>= 6; mfb >>= 1; }
176    //    while (x > mfb);
177    loop {
178        bytes_rev.push(0x80 | (x & 0x3F) as u8);
179        x >>= 6;
180        mfb >>= 1;
181        if x <= mfb {
182            break;
183        }
184    }
185
186    // C: buff[UTF8BUFFSZ - n] = cast_char((~mfb << 1) | x);
187    // wrapping_shl avoids a Rust debug-mode overflow panic on `!mfb << 1`
188    // (e.g., !0x1Fu32 = 0xFFFF_FFE0; << 1 = 0xFFFF_FFC0; as u8 = 0xC0).
189    let leading = ((!mfb).wrapping_shl(1) as u8) | (x as u8);
190
191    let mut result = Vec::with_capacity(bytes_rev.len() + 1);
192    result.push(leading);
193    for &b in bytes_rev.iter().rev() {
194        result.push(b);
195    }
196    result
197}
198
199// ── Library functions ──────────────────────────────────────────────────────
200
201/// `utf8.len(s [, i [, j [, lax]]])` → integer | (nil, integer)
202///
203/// Returns the number of UTF-8 characters that start in the byte range `[i,j]`
204/// of string `s` (1-based, defaulting to the whole string).
205/// On a malformed sequence, returns `(nil, position)` where `position` is the
206/// 1-based byte offset of the first bad byte.
207///
208/// C: `static int utflen(lua_State *L)`
209fn utf_len(state: &mut LuaState) -> Result<usize, LuaError> {
210    // C: const char *s = luaL_checklstring(L, 1, &len);
211    // Clone to avoid holding a borrow across subsequent mutable state calls.
212    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
213    let len = s.len();
214
215    // C: lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
216    // TODO(port): opt_arg_integer(narg, default) not yet in LuaState API; adjust in Phase B.
217    let raw_posi: i64 = state.opt_arg_integer(2, 1)?;
218    let mut posi: i64 = pos_relat(raw_posi, len);
219
220    // C: lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
221    // TODO(port): opt_arg_integer API (second call site).
222    let raw_posj: i64 = state.opt_arg_integer(3, -1)?;
223    let mut posj: i64 = pos_relat(raw_posj, len);
224
225    // C: int lax = lua_toboolean(L, 4);
226    // TODO(port): to_boolean(n) method not yet confirmed in LuaState API.
227    let lax: bool = state.to_boolean(4);
228
229    // C: luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, ...);
230    // Note: C short-circuits, so --posi only executes when 1 <= posi.
231    if posi < 1 {
232        return Err(LuaError::arg_error(2, "initial position out of bounds"));
233    }
234    posi -= 1; // 1-based → 0-based
235    if posi > len as i64 {
236        return Err(LuaError::arg_error(2, "initial position out of bounds"));
237    }
238
239    // C: luaL_argcheck(L, --posj < (lua_Integer)len, 3, ...);
240    posj -= 1; // 1-based → 0-based (always decremented, no short-circuit)
241    if posj >= len as i64 {
242        return Err(LuaError::arg_error(3, "final position out of bounds"));
243    }
244
245    let mut n: i64 = 0;
246
247    // C: while (posi <= posj) { const char *s1 = utf8_decode(s + posi, NULL, !lax); ... }
248    while posi <= posj {
249        match utf8_decode(&s[posi as usize..], !lax) {
250            None => {
251                // C: luaL_pushfail(L); lua_pushinteger(L, posi + 1); return 2;
252                state.push(LuaValue::Nil); // luaL_pushfail
253                state.push(LuaValue::Int(posi + 1)); // 1-based position of failure
254                return Ok(2);
255            }
256            Some((remaining, _)) => {
257                // C: posi = s1 - s;  (s1 points past the decoded bytes)
258                posi = (len - remaining.len()) as i64;
259                n += 1;
260            }
261        }
262    }
263
264    // C: lua_pushinteger(L, n); return 1;
265    state.push(LuaValue::Int(n));
266    Ok(1)
267}
268
269/// `utf8.codepoint(s [, i [, j [, lax]]])` → integer, ...
270///
271/// Returns the codepoints (as integers) for all characters starting in `s[i..j]`.
272///
273/// C: `static int codepoint(lua_State *L)`
274fn codepoint(state: &mut LuaState) -> Result<usize, LuaError> {
275    // C: const char *s = luaL_checklstring(L, 1, &len);
276    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
277    let len = s.len();
278
279    // C: lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
280    // TODO(port): opt_arg_integer API (codepoint start position).
281    let raw_posi: i64 = state.opt_arg_integer(2, 1)?;
282    let posi: i64 = pos_relat(raw_posi, len);
283
284    // C: lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
285    // Default for the end position is posi (1-based), giving a single character.
286    // TODO(port): opt_arg_integer API (codepoint end position).
287    let raw_pose: i64 = state.opt_arg_integer(3, posi)?;
288    let pose: i64 = pos_relat(raw_pose, len);
289
290    // C: int lax = lua_toboolean(L, 4);
291    // TODO(port): to_boolean API (codepoint lax mode).
292    let lax: bool = state.to_boolean(4);
293
294    // C: luaL_argcheck(L, posi >= 1, 2, "out of bounds");
295    if posi < 1 {
296        return Err(LuaError::arg_error(2, "out of bounds"));
297    }
298
299    // C: luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds");
300    if pose > len as i64 {
301        return Err(LuaError::arg_error(3, "out of bounds"));
302    }
303
304    // C: if (posi > pose) return 0;
305    if posi > pose {
306        return Ok(0); // empty interval: no values
307    }
308
309    // C: if (pose - posi >= INT_MAX) return luaL_error(L, "string slice too long");
310    if pose - posi >= i32::MAX as i64 {
311        return Err(LuaError::runtime(format_args!("string slice too long")));
312    }
313
314    // C: n = (int)(pose - posi) + 1; luaL_checkstack(L, n, "string slice too long");
315    let n_max = (pose - posi + 1) as i32;
316    state.ensure_stack(n_max, "string slice too long")?;
317
318    // C: se = s + pose; for (s += posi - 1; s < se;) { ... }
319    // 0-based: start at (posi - 1), stop before byte index `pose`.
320    let mut pos: usize = (posi - 1) as usize; // 0-based start
321    let end: usize = pose as usize; // 0-based exclusive end
322    let mut count: usize = 0;
323
324    while pos < end {
325        // C: s = utf8_decode(s, &code, !lax); if (s == NULL) return luaL_error(L, MSGInvalid);
326        match utf8_decode(&s[pos..], !lax) {
327            None => return Err(LuaError::runtime(format_args!("invalid UTF-8 code"))),
328            Some((remaining, code)) => {
329                // C: lua_pushinteger(L, code); n++;
330                state.push(LuaValue::Int(code as i64));
331                count += 1;
332                pos = len - remaining.len(); // advance by decoded character width
333            }
334        }
335    }
336
337    Ok(count)
338}
339
340/// Encode the codepoint at stack argument `arg` and return the UTF-8 bytes.
341///
342/// C: `static void pushutfchar(lua_State *L, int arg)` — restructured to return
343/// `Vec<u8>` directly rather than pushing to the stack, avoiding the push/pop
344/// dance that `luaL_Buffer` required.
345///
346/// PORT NOTE: C's `pushutfchar` called `lua_pushfstring(L, "%U", code)` to encode
347/// and push in one step. Here the encoding is extracted so `utf_char` can build
348/// the concatenated result without intermediate stack operations.
349fn get_utf_char_bytes(state: &mut LuaState, arg: i32) -> Result<Vec<u8>, LuaError> {
350    // C: lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
351    let code = state.check_arg_integer(arg)? as u64;
352
353    // C: luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
354    if code > MAX_UTF as u64 {
355        return Err(LuaError::arg_error(arg, "value out of range"));
356    }
357
358    Ok(encode_utf8_codepoint(code as u32))
359}
360
361/// `utf8.char(n1, n2, ...)` → string
362///
363/// Returns a string formed by the UTF-8 encoding of the given codepoints.
364///
365/// C: `static int utfchar(lua_State *L)`
366fn utf_char(state: &mut LuaState) -> Result<usize, LuaError> {
367    // C: int n = lua_gettop(L);
368    // TODO(port): stack_top() / arg_count() API on LuaState not yet confirmed.
369    let n: i32 = state.stack_top() as i32;
370
371    if n == 1 {
372        // C: pushutfchar(L, 1);  — optimized single-character path
373        let bytes = get_utf_char_bytes(state, 1)?;
374        let s = state.intern_str(&bytes)?;
375        state.push(LuaValue::Str(s));
376    } else {
377        // C: luaL_Buffer b; luaL_buffinit(L, &b);
378        //    for (i = 1; i <= n; i++) { pushutfchar(L, i); luaL_addvalue(&b); }
379        //    luaL_pushresult(&b);
380        // PORT NOTE: luaL_Buffer replaced by Vec<u8>; codepoints are encoded
381        // directly into the accumulator without intermediate stack push/pop.
382        let mut buf: Vec<u8> = Vec::new();
383        for i in 1..=n {
384            buf.extend_from_slice(&get_utf_char_bytes(state, i)?);
385        }
386        let s = state.intern_str(&buf)?;
387        state.push(LuaValue::Str(s));
388    }
389
390    Ok(1)
391}
392
393/// `utf8.offset(s, n [, i])` → integer | nil
394///
395/// Returns the byte offset where the n-th character (counting from position `i`)
396/// starts. Negative `n` counts from the end. `n == 0` returns the start of the
397/// character that contains position `i`.
398/// Returns `nil` if the character cannot be found.
399///
400/// C: `static int byteoffset(lua_State *L)`
401fn byte_offset(state: &mut LuaState) -> Result<usize, LuaError> {
402    // C: const char *s = luaL_checklstring(L, 1, &len);
403    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
404    let len = s.len();
405
406    // C: lua_Integer n = luaL_checkinteger(L, 2);
407    let n: i64 = state.check_arg_integer(2)?;
408
409    // C: lua_Integer posi = (n >= 0) ? 1 : len + 1;
410    let default_posi: i64 = if n >= 0 { 1 } else { len as i64 + 1 };
411
412    // C: posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
413    // TODO(port): opt_arg_integer API (byte_offset position argument).
414    let raw_posi: i64 = state.opt_arg_integer(3, default_posi)?;
415    let posi_1based: i64 = pos_relat(raw_posi, len);
416
417    // C: luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, "position out of bounds");
418    if posi_1based < 1 {
419        return Err(LuaError::arg_error(3, "position out of bounds"));
420    }
421    let mut posi: i64 = posi_1based - 1; // 1-based → 0-based
422    if posi > len as i64 {
423        return Err(LuaError::arg_error(3, "position out of bounds"));
424    }
425
426    // `count` is a mutable copy of `n`; driven to 0 when the target character is found.
427    let mut count = n;
428
429    if count == 0 {
430        // C: while (posi > 0 && iscontp(s + posi)) posi--;
431        // Scan backward to find the start of the character containing `posi`.
432        while posi > 0 && is_cont_at(&s, posi) {
433            posi -= 1;
434        }
435        // count remains 0
436    } else {
437        // C: if (iscontp(s + posi)) return luaL_error(L, "initial position is a continuation byte");
438        if is_cont_at(&s, posi) {
439            return Err(LuaError::runtime(format_args!(
440                "initial position is a continuation byte"
441            )));
442        }
443
444        if count < 0 {
445            // C: while (n < 0 && posi > 0) {
446            //      do { posi--; } while (posi > 0 && iscontp(s + posi));
447            //      n++;
448            //    }
449            while count < 0 && posi > 0 {
450                // do-while: always decrements at least once, then skips back over
451                // any continuation bytes to land on a leading byte.
452                loop {
453                    posi -= 1;
454                    if posi == 0 || !is_cont_at(&s, posi) {
455                        break;
456                    }
457                }
458                count += 1;
459            }
460        } else {
461            // C: n--;
462            //    while (n > 0 && posi < (lua_Integer)len) {
463            //      do { posi++; } while (iscontp(s + posi));  /* cannot pass '\0' */
464            //      n--;
465            //    }
466            count -= 1; // do not move for the 1st character
467            while count > 0 && posi < len as i64 {
468                // C relies on the NUL terminator to stop the inner do-while.
469                // Rust uses an explicit bounds check instead.
470                loop {
471                    posi += 1;
472                    if !is_cont_at(&s, posi) {
473                        break;
474                    }
475                }
476                count -= 1;
477            }
478        }
479    }
480
481    // C: if (n == 0) lua_pushinteger(L, posi + 1); else luaL_pushfail(L);
482    if count == 0 {
483        state.push(LuaValue::Int(posi + 1)); // 0-based → 1-based
484    } else {
485        state.push(LuaValue::Nil); // luaL_pushfail: character not found
486    }
487    Ok(1)
488}
489
490/// Internal iterator body shared by `iter_aux_strict` and `iter_aux_lax`.
491///
492/// Stack on entry (from the generic for): (1) string, (2) current byte position
493/// (0-based; initially pushed as 0 by `iter_codes`).
494///
495/// Advances past any leading continuation bytes, decodes the next character,
496/// and returns `(next_1based_pos, codepoint)`.  Returns nothing (0) when the
497/// string is exhausted.
498///
499/// C: `static int iter_aux(lua_State *L, int strict)`
500fn iter_aux(state: &mut LuaState, strict: bool) -> Result<usize, LuaError> {
501    // C: const char *s = luaL_checklstring(L, 1, &len);
502    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
503    let len = s.len();
504
505    // C: lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
506    // TODO(port): to_integer(n) exact return type (i64/Option<i64>) not yet confirmed;
507    // treating as i64 cast to u64 for unsigned byte-index arithmetic.
508    let mut n: u64 = state.to_integer(2).unwrap_or(0) as u64;
509
510    // C: if (n < len) { while (iscontp(s + n)) n++; }
511    if (n as usize) < len {
512        while (n as usize) < len && is_cont(s[n as usize]) {
513            n += 1;
514        }
515    }
516
517    // C: if (n >= len) return 0;
518    if (n as usize) >= len {
519        return Ok(0); // no more codepoints
520    }
521
522    // C: const char *next = utf8_decode(s + n, &code, strict);
523    //    if (next == NULL || iscontp(next)) return luaL_error(L, MSGInvalid);
524    match utf8_decode(&s[n as usize..], strict) {
525        None => Err(LuaError::runtime(format_args!("invalid UTF-8 code"))),
526        Some((remaining, code)) => {
527            let next_pos = len - remaining.len(); // 0-based index of the next character
528            // C: iscontp(next) — an unexpected continuation byte immediately after a
529            // valid sequence indicates a malformed input stream.
530            if next_pos < len && is_cont(s[next_pos]) {
531                return Err(LuaError::runtime(format_args!("invalid UTF-8 code")));
532            }
533            // C: lua_pushinteger(L, n + 1); lua_pushinteger(L, code); return 2;
534            state.push(LuaValue::Int((n + 1) as i64)); // 1-based position for next iteration
535            state.push(LuaValue::Int(code as i64));
536            Ok(2)
537        }
538    }
539}
540
541/// Strict iterator body: rejects surrogates and values > MAX_UNICODE.
542///
543/// C: `static int iter_auxstrict(lua_State *L)`
544fn iter_aux_strict(state: &mut LuaState) -> Result<usize, LuaError> {
545    iter_aux(state, true)
546}
547
548/// Lax iterator body: accepts extended UTF-8 up to MAX_UTF.
549///
550/// C: `static int iter_auxlax(lua_State *L)`
551fn iter_aux_lax(state: &mut LuaState) -> Result<usize, LuaError> {
552    iter_aux(state, false)
553}
554
555/// `utf8.codes(s [, lax])` → function, string, integer
556///
557/// Returns the iterator triple `(f, s, 0)` for use in a generic for loop.
558/// Each call to `f(s, pos)` returns the next `(pos, codepoint)` pair.
559///
560/// C: `static int iter_codes(lua_State *L)`
561fn iter_codes(state: &mut LuaState) -> Result<usize, LuaError> {
562    // C: int lax = lua_toboolean(L, 2);
563    // TODO(port): to_boolean API (iter_codes lax mode).
564    let lax: bool = state.to_boolean(2);
565
566    // C: const char *s = luaL_checkstring(L, 1);
567    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
568
569    // C: luaL_argcheck(L, !iscontp(s), 1, MSGInvalid);
570    // The very first byte of the string must not be a continuation byte.
571    if s.first().map_or(false, |&b| is_cont(b)) {
572        return Err(LuaError::arg_error(1, "invalid UTF-8 code"));
573    }
574
575    // C: lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
576    // TODO(phase-b): LuaClosure::LightC in lua-types is fn() -> i32; needs widening to the real lua_CFunction signature. Stub via push_c_function until then.
577    let iter_fn: fn(&mut LuaState) -> Result<usize, LuaError> =
578        if lax { iter_aux_lax } else { iter_aux_strict };
579    state.push_c_function(iter_fn)?;
580
581    // C: lua_pushvalue(L, 1);  — push the string argument as the loop invariant
582    // TODO(port): push_value_at(idx) not yet confirmed in LuaState API.
583    state.push_value_at(1)?;
584
585    // C: lua_pushinteger(L, 0);  — initial control variable (byte position 0)
586    state.push(LuaValue::Int(0));
587
588    Ok(3)
589}
590
591// ── Library registration ───────────────────────────────────────────────────
592
593/// Function registration table for the `utf8` library.
594///
595/// C: `static const luaL_Reg funcs[]`
596/// "charpattern" is intentionally absent here; it is a string value and is
597/// registered separately inside `open_utf8` via `lua_setfield`.
598pub const FUNCS: &[(&[u8], fn(&mut LuaState) -> Result<usize, LuaError>)] = &[
599    (b"offset", byte_offset),
600    (b"codepoint", codepoint),
601    (b"char", utf_char),
602    (b"len", utf_len),
603    (b"codes", iter_codes),
604];
605
606/// Open the `utf8` library.
607///
608/// Registers all functions from `FUNCS` into a new table, then sets
609/// `utf8.charpattern` to the byte-string pattern matching one UTF-8 sequence.
610///
611/// C: `LUAMOD_API int luaopen_utf8(lua_State *L)`
612pub fn open_utf8(state: &mut LuaState) -> Result<usize, LuaError> {
613    // C: luaL_newlib(L, funcs);
614    // TODO(port): new_lib(funcs) API on LuaState not yet confirmed; expected to
615    // create a new table and register all (name, fn) pairs from FUNCS.
616    state.new_lib(FUNCS)?;
617
618    // C: lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
619    let patt = state.intern_str(UTF8_PATT)?;
620    state.push(LuaValue::Str(patt));
621
622    // C: lua_setfield(L, -2, "charpattern");
623    // TODO(port): set_field(table_idx, field_name) API on LuaState not yet confirmed.
624    state.set_field(-2, b"charpattern")?;
625
626    Ok(1)
627}
628
629// ──────────────────────────────────────────────────────────────────────────
630// PORT STATUS
631//   source:        src/lutf8lib.c  (291 lines, 9 functions)
632//   target_crate:  lua-stdlib
633//   confidence:    medium
634//   todos:         13
635//   port_notes:    2
636//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
637//   notes:         Core UTF-8 logic (utf8_decode, encode_utf8_codepoint,
638//                  pos_relat, is_cont_at) is a faithful translation and should
639//                  be correct. All 13 TODOs are unresolved LuaState API names:
640//                  opt_arg_integer, to_boolean, stack_top, push_value_at,
641//                  new_lib, set_field, and to_integer — Phase B reconciles
642//                  these against the actual method signatures. No unsafe
643//                  blocks; NUL-terminator reliance in C replaced by Rust
644//                  bounds checks throughout.
645// ──────────────────────────────────────────────────────────────────────────