Skip to main content

lua_stdlib/
utf8_lib.rs

//! UTF-8 standard library for Lua 5.4.
//!
//! Port of `lutf8lib.c` (291 lines, 9 functions).
//!
//! Provides the `utf8` module with `char`, `codepoint`, `codes`, `len`,
//! `offset`, and `charpattern`. Supports both strict (Unicode-conformant)
//! and lax (extended UTF-8, up to `MAX_UTF = 0x7FFFFFFF`) decoding modes.
//!
//! Strict mode rejects surrogates (U+D800..U+DFFF) and values above U+10FFFF.
//! Lax mode accepts any well-formed byte sequence with a value ≤ MAX_UTF.
11
12use lua_types::error::LuaError;
13use lua_types::value::LuaValue;
14use lua_types::closure::LuaClosure;
15use lua_types::{LuaType, LuaStatus};
16use crate::state_stub::{LuaState, LuaStateStubExt as _, lua_CFunction, upvalue_index, CompareOp, LuaDebug};
17
/// Integer type used for decoded values; `MAX_UTF` needs 31 bits, so `u32` suffices.
type UtfInt = u32;

/// Largest value accepted by strict decoding (U+10FFFF).
const MAX_UNICODE: UtfInt = 0x0010_FFFF;

/// Largest value accepted by lax (extended UTF-8) decoding.
const MAX_UTF: UtfInt = 0x7FFF_FFFF;

/// Lua pattern that matches one UTF-8 byte sequence; exposed as `utf8.charpattern`.
/// The literal is 14 bytes long and contains an embedded NUL byte.
const UTF8_PATT: &[u8] = b"[\x00-\x7F\xC2-\xFD][\x80-\xBF]*";
27
28// ── Internal helpers ───────────────────────────────────────────────────────
29
/// Convert a possibly negative 1-based string position into an absolute one.
///
/// Non-negative positions are returned unchanged. A negative position counts
/// back from the end of a string of `len` bytes (`-1` is the last byte); when
/// it reaches past the start of the string the result is `0`.
fn pos_relat(pos: i64, len: usize) -> i64 {
    if pos >= 0 {
        return pos;
    }

    // `unsigned_abs` yields the magnitude even for `i64::MIN`.
    let distance_from_end = pos.unsigned_abs();
    if distance_from_end > len as u64 {
        return 0;
    }

    len as i64 + pos + 1
}
45
/// Report whether `c` is a UTF-8 continuation byte, i.e. has the bit
/// pattern `10xxxxxx` (`0x80..=0xBF`).
#[inline]
fn is_cont(c: u8) -> bool {
    matches!(c, 0x80..=0xBF)
}
52
53/// Return `true` if the byte at 0-based index `pos` in `s` is a continuation
54/// byte, treating out-of-bounds positions as non-continuation.
55///
56/// C strings carry a NUL terminator that is never a continuation byte;
57/// the bounds-check here replaces that guarantee.
58#[inline]
59fn is_cont_at(s: &[u8], pos: i64) -> bool {
60    if pos < 0 {
61        return false;
62    }
63    s.get(pos as usize).map_or(false, |&b| is_cont(b))
64}
65
66/// Decode one UTF-8 sequence from the start of `s`.
67///
68/// Returns `None` if the byte sequence is invalid.
69/// Returns `Some((remaining_slice, codepoint))` on success.
70///
71/// When `strict` is `true`, surrogates and values above `MAX_UNICODE` are
72/// rejected. When `false`, any value ≤ `MAX_UTF` is accepted (extended UTF-8).
73///
74fn utf8_decode(s: &[u8], strict: bool) -> Option<(&[u8], UtfInt)> {
75    // LIMITS[count] is the minimum value for a sequence with `count` continuation bytes.
76    // LIMITS[0] = u32::MAX forces an error when a non-ASCII byte has no continuation bytes.
77    const LIMITS: [UtfInt; 6] = [u32::MAX, 0x80, 0x800, 0x10000, 0x200000, 0x4000000];
78
79    if s.is_empty() {
80        return None;
81    }
82
83    let mut c = s[0] as u32;
84    let res: UtfInt;
85    let advance: usize;
86
87    if c < 0x80 {
88        // ASCII fast path — no continuation bytes needed.
89        res = c;
90        advance = 1;
91    } else {
92        let mut count: usize = 0;
93        let mut r: UtfInt = 0;
94
95        // The C for-loop runs the body first, then applies `c <<= 1` as the update.
96        while c & 0x40 != 0 {
97            count += 1;
98            if count >= s.len() {
99                return None; // string too short for the indicated sequence length
100            }
101            let cc = s[count] as u32;
102
103            if (cc & 0xC0) != 0x80 {
104                return None; // expected continuation byte, got something else
105            }
106
107            r = (r << 6) | (cc & 0x3F);
108
109            // C for-loop update: c <<= 1
110            c <<= 1;
111        }
112
113        r |= (c & 0x7F) << (count as u32 * 5);
114
115        if count > 5 || r > MAX_UTF || r < LIMITS[count] {
116            return None; // invalid (overlong, too large, or excess continuation bytes)
117        }
118
119        res = r;
120        advance = count + 1;
121        if advance > s.len() {
122            return None;
123        }
124    }
125
126    if strict && (res > MAX_UNICODE || (0xD800 <= res && res <= 0xDFFF)) {
127        return None; // surrogate or out-of-Unicode-range value in strict mode
128    }
129
130    Some((&s[advance..], res))
131}
132
133/// Encode a codepoint (≤ `MAX_UTF`) as extended UTF-8 bytes.
134///
135/// Mirrors `luaO_utf8esc` from `lobject.c`, which fills a fixed buffer backwards.
136/// This Rust version builds the bytes naturally and returns a `Vec<u8>`.
137///
138fn encode_utf8_codepoint(code: u32) -> Vec<u8> {
139    debug_assert!(code <= MAX_UTF);
140
141    if code < 0x80 {
142        return vec![code as u8];
143    }
144
145    let mut x = code;
146    let mut mfb: u32 = 0x3F;
147    // Continuation bytes built in reverse, then reversed at the end.
148    let mut bytes_rev: Vec<u8> = Vec::with_capacity(6);
149
150    //    while (x > mfb);
151    loop {
152        bytes_rev.push(0x80 | (x & 0x3F) as u8);
153        x >>= 6;
154        mfb >>= 1;
155        if x <= mfb {
156            break;
157        }
158    }
159
160    // wrapping_shl avoids a Rust debug-mode overflow panic on `!mfb << 1`
161    // (e.g., !0x1Fu32 = 0xFFFF_FFE0; << 1 = 0xFFFF_FFC0; as u8 = 0xC0).
162    let leading = ((!mfb).wrapping_shl(1) as u8) | (x as u8);
163
164    let mut result = Vec::with_capacity(bytes_rev.len() + 1);
165    result.push(leading);
166    for &b in bytes_rev.iter().rev() {
167        result.push(b);
168    }
169    result
170}
171
172// ── Library functions ──────────────────────────────────────────────────────
173
174/// `utf8.len(s [, i [, j [, lax]]])` → integer | (nil, integer)
175///
176/// Returns the number of UTF-8 characters that start in the byte range `[i,j]`
177/// of string `s` (1-based, defaulting to the whole string).
178/// On a malformed sequence, returns `(nil, position)` where `position` is the
179/// 1-based byte offset of the first bad byte.
180///
181fn utf_len(state: &mut LuaState) -> Result<usize, LuaError> {
182    // Clone to avoid holding a borrow across subsequent mutable state calls.
183    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
184    let len = s.len();
185
186    // TODO(port): opt_arg_integer(narg, default) not yet in LuaState API; adjust in Phase B.
187    let raw_posi: i64 = state.opt_arg_integer(2, 1)?;
188    let mut posi: i64 = pos_relat(raw_posi, len);
189
190    // TODO(port): opt_arg_integer API (second call site).
191    let raw_posj: i64 = state.opt_arg_integer(3, -1)?;
192    let mut posj: i64 = pos_relat(raw_posj, len);
193
194    // TODO(port): to_boolean(n) method not yet confirmed in LuaState API.
195    let lax: bool = state.to_boolean(4);
196
197    // Note: C short-circuits, so --posi only executes when 1 <= posi.
198    if posi < 1 {
199        return Err(LuaError::arg_error(2, "initial position out of bounds"));
200    }
201    posi -= 1; // 1-based → 0-based
202    if posi > len as i64 {
203        return Err(LuaError::arg_error(2, "initial position out of bounds"));
204    }
205
206    posj -= 1; // 1-based → 0-based (always decremented, no short-circuit)
207    if posj >= len as i64 {
208        return Err(LuaError::arg_error(3, "final position out of bounds"));
209    }
210
211    let mut n: i64 = 0;
212
213    while posi <= posj {
214        match utf8_decode(&s[posi as usize..], !lax) {
215            None => {
216                state.push(LuaValue::Nil); // luaL_pushfail
217                state.push(LuaValue::Int(posi + 1)); // 1-based position of failure
218                return Ok(2);
219            }
220            Some((remaining, _)) => {
221                posi = (len - remaining.len()) as i64;
222                n += 1;
223            }
224        }
225    }
226
227    state.push(LuaValue::Int(n));
228    Ok(1)
229}
230
231/// `utf8.codepoint(s [, i [, j [, lax]]])` → integer, ...
232///
233/// Returns the codepoints (as integers) for all characters starting in `s[i..j]`.
234///
235fn codepoint(state: &mut LuaState) -> Result<usize, LuaError> {
236    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
237    let len = s.len();
238
239    // TODO(port): opt_arg_integer API (codepoint start position).
240    let raw_posi: i64 = state.opt_arg_integer(2, 1)?;
241    let posi: i64 = pos_relat(raw_posi, len);
242
243    // Default for the end position is posi (1-based), giving a single character.
244    // TODO(port): opt_arg_integer API (codepoint end position).
245    let raw_pose: i64 = state.opt_arg_integer(3, posi)?;
246    let pose: i64 = pos_relat(raw_pose, len);
247
248    // TODO(port): to_boolean API (codepoint lax mode).
249    let lax: bool = state.to_boolean(4);
250
251    if posi < 1 {
252        return Err(LuaError::arg_error(2, "out of bounds"));
253    }
254
255    if pose > len as i64 {
256        return Err(LuaError::arg_error(3, "out of bounds"));
257    }
258
259    if posi > pose {
260        return Ok(0); // empty interval: no values
261    }
262
263    if pose - posi >= i32::MAX as i64 {
264        return Err(LuaError::runtime(format_args!("string slice too long")));
265    }
266
267    let n_max = (pose - posi + 1) as i32;
268    state.ensure_stack(n_max, "string slice too long")?;
269
270    // 0-based: start at (posi - 1), stop before byte index `pose`.
271    let mut pos: usize = (posi - 1) as usize; // 0-based start
272    let end: usize = pose as usize; // 0-based exclusive end
273    let mut count: usize = 0;
274
275    while pos < end {
276        match utf8_decode(&s[pos..], !lax) {
277            None => return Err(LuaError::runtime(format_args!("invalid UTF-8 code"))),
278            Some((remaining, code)) => {
279                state.push(LuaValue::Int(code as i64));
280                count += 1;
281                pos = len - remaining.len(); // advance by decoded character width
282            }
283        }
284    }
285
286    Ok(count)
287}
288
289/// Encode the codepoint at stack argument `arg` and return the UTF-8 bytes.
290///
291/// `Vec<u8>` directly rather than pushing to the stack, avoiding the push/pop
292/// dance that `luaL_Buffer` required.
293///
294/// PORT NOTE: C's `pushutfchar` called `lua_pushfstring(L, "%U", code)` to encode
295/// and push in one step. Here the encoding is extracted so `utf_char` can build
296/// the concatenated result without intermediate stack operations.
297fn get_utf_char_bytes(state: &mut LuaState, arg: i32) -> Result<Vec<u8>, LuaError> {
298    let code = state.check_arg_integer(arg)? as u64;
299
300    if code > MAX_UTF as u64 {
301        return Err(LuaError::arg_error(arg, "value out of range"));
302    }
303
304    Ok(encode_utf8_codepoint(code as u32))
305}
306
307/// `utf8.char(n1, n2, ...)` → string
308///
309/// Returns a string formed by the UTF-8 encoding of the given codepoints.
310///
311fn utf_char(state: &mut LuaState) -> Result<usize, LuaError> {
312    // TODO(port): stack_top() / arg_count() API on LuaState not yet confirmed.
313    let n: i32 = state.stack_top() as i32;
314
315    if n == 1 {
316        let bytes = get_utf_char_bytes(state, 1)?;
317        let s = state.intern_str(&bytes)?;
318        state.push(LuaValue::Str(s));
319    } else {
320        //    for (i = 1; i <= n; i++) { pushutfchar(L, i); luaL_addvalue(&b); }
321        //    luaL_pushresult(&b);
322        // PORT NOTE: luaL_Buffer replaced by Vec<u8>; codepoints are encoded
323        // directly into the accumulator without intermediate stack push/pop.
324        let mut buf: Vec<u8> = Vec::new();
325        for i in 1..=n {
326            buf.extend_from_slice(&get_utf_char_bytes(state, i)?);
327        }
328        let s = state.intern_str(&buf)?;
329        state.push(LuaValue::Str(s));
330    }
331
332    Ok(1)
333}
334
335/// `utf8.offset(s, n [, i])` → integer | nil
336///
337/// Returns the byte offset where the n-th character (counting from position `i`)
338/// starts. Negative `n` counts from the end. `n == 0` returns the start of the
339/// character that contains position `i`.
340/// Returns `nil` if the character cannot be found.
341///
342fn byte_offset(state: &mut LuaState) -> Result<usize, LuaError> {
343    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
344    let len = s.len();
345
346    let n: i64 = state.check_arg_integer(2)?;
347
348    let default_posi: i64 = if n >= 0 { 1 } else { len as i64 + 1 };
349
350    // TODO(port): opt_arg_integer API (byte_offset position argument).
351    let raw_posi: i64 = state.opt_arg_integer(3, default_posi)?;
352    let posi_1based: i64 = pos_relat(raw_posi, len);
353
354    if posi_1based < 1 {
355        return Err(LuaError::arg_error(3, "position out of bounds"));
356    }
357    let mut posi: i64 = posi_1based - 1; // 1-based → 0-based
358    if posi > len as i64 {
359        return Err(LuaError::arg_error(3, "position out of bounds"));
360    }
361
362    // `count` is a mutable copy of `n`; driven to 0 when the target character is found.
363    let mut count = n;
364
365    if count == 0 {
366        // Scan backward to find the start of the character containing `posi`.
367        while posi > 0 && is_cont_at(&s, posi) {
368            posi -= 1;
369        }
370        // count remains 0
371    } else {
372        if is_cont_at(&s, posi) {
373            return Err(LuaError::runtime(format_args!(
374                "initial position is a continuation byte"
375            )));
376        }
377
378        if count < 0 {
379            //      do { posi--; } while (posi > 0 && iscontp(s + posi));
380            //      n++;
381            //    }
382            while count < 0 && posi > 0 {
383                // do-while: always decrements at least once, then skips back over
384                // any continuation bytes to land on a leading byte.
385                loop {
386                    posi -= 1;
387                    if posi == 0 || !is_cont_at(&s, posi) {
388                        break;
389                    }
390                }
391                count += 1;
392            }
393        } else {
394            //    while (n > 0 && posi < (lua_Integer)len) {
395            //      do { posi++; } while (iscontp(s + posi));  /* cannot pass '\0' */
396            //      n--;
397            //    }
398            count -= 1; // do not move for the 1st character
399            while count > 0 && posi < len as i64 {
400                // C relies on the NUL terminator to stop the inner do-while.
401                // Rust uses an explicit bounds check instead.
402                loop {
403                    posi += 1;
404                    if !is_cont_at(&s, posi) {
405                        break;
406                    }
407                }
408                count -= 1;
409            }
410        }
411    }
412
413    if count == 0 {
414        state.push(LuaValue::Int(posi + 1)); // 0-based → 1-based
415    } else {
416        state.push(LuaValue::Nil); // luaL_pushfail: character not found
417    }
418    Ok(1)
419}
420
421/// Internal iterator body shared by `iter_aux_strict` and `iter_aux_lax`.
422///
423/// Stack on entry (from the generic for): (1) string, (2) current byte position
424/// (0-based; initially pushed as 0 by `iter_codes`).
425///
426/// Advances past any leading continuation bytes, decodes the next character,
427/// and returns `(next_1based_pos, codepoint)`.  Returns nothing (0) when the
428/// string is exhausted.
429///
430fn iter_aux(state: &mut LuaState, strict: bool) -> Result<usize, LuaError> {
431    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
432    let len = s.len();
433
434    // TODO(port): to_integer(n) exact return type (i64/Option<i64>) not yet confirmed;
435    // treating as i64 cast to u64 for unsigned byte-index arithmetic.
436    let mut n: u64 = state.to_integer(2).unwrap_or(0) as u64;
437
438    if (n as usize) < len {
439        while (n as usize) < len && is_cont(s[n as usize]) {
440            n += 1;
441        }
442    }
443
444    if (n as usize) >= len {
445        return Ok(0); // no more codepoints
446    }
447
448    //    if (next == NULL || iscontp(next)) return luaL_error(L, MSGInvalid);
449    match utf8_decode(&s[n as usize..], strict) {
450        None => Err(LuaError::runtime(format_args!("invalid UTF-8 code"))),
451        Some((remaining, code)) => {
452            let next_pos = len - remaining.len(); // 0-based index of the next character
453            // valid sequence indicates a malformed input stream.
454            if next_pos < len && is_cont(s[next_pos]) {
455                return Err(LuaError::runtime(format_args!("invalid UTF-8 code")));
456            }
457            state.push(LuaValue::Int((n + 1) as i64)); // 1-based position for next iteration
458            state.push(LuaValue::Int(code as i64));
459            Ok(2)
460        }
461    }
462}
463
464/// Strict iterator body: rejects surrogates and values > MAX_UNICODE.
465///
466fn iter_aux_strict(state: &mut LuaState) -> Result<usize, LuaError> {
467    iter_aux(state, true)
468}
469
470/// Lax iterator body: accepts extended UTF-8 up to MAX_UTF.
471///
472fn iter_aux_lax(state: &mut LuaState) -> Result<usize, LuaError> {
473    iter_aux(state, false)
474}
475
476/// `utf8.codes(s [, lax])` → function, string, integer
477///
478/// Returns the iterator triple `(f, s, 0)` for use in a generic for loop.
479/// Each call to `f(s, pos)` returns the next `(pos, codepoint)` pair.
480///
481fn iter_codes(state: &mut LuaState) -> Result<usize, LuaError> {
482    // TODO(port): to_boolean API (iter_codes lax mode).
483    let lax: bool = state.to_boolean(2);
484
485    let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
486
487    // The very first byte of the string must not be a continuation byte.
488    if s.first().map_or(false, |&b| is_cont(b)) {
489        return Err(LuaError::arg_error(1, "invalid UTF-8 code"));
490    }
491
492    // TODO(phase-b): LuaClosure::LightC in lua-types is fn() -> i32; needs widening to the real lua_CFunction signature. Stub via push_c_function until then.
493    let iter_fn: fn(&mut LuaState) -> Result<usize, LuaError> =
494        if lax { iter_aux_lax } else { iter_aux_strict };
495    state.push_c_function(iter_fn)?;
496
497    // TODO(port): push_value_at(idx) not yet confirmed in LuaState API.
498    state.push_value_at(1)?;
499
500    state.push(LuaValue::Int(0));
501
502    Ok(3)
503}
504
505// ── Library registration ───────────────────────────────────────────────────
506
507/// Function registration table for the `utf8` library.
508///
509/// "charpattern" is intentionally absent here; it is a string value and is
510/// registered separately inside `open_utf8` via `lua_setfield`.
511pub const FUNCS: &[(&[u8], fn(&mut LuaState) -> Result<usize, LuaError>)] = &[
512    (b"offset", byte_offset),
513    (b"codepoint", codepoint),
514    (b"char", utf_char),
515    (b"len", utf_len),
516    (b"codes", iter_codes),
517];
518
519/// Open the `utf8` library.
520///
521/// Registers all functions from `FUNCS` into a new table, then sets
522/// `utf8.charpattern` to the byte-string pattern matching one UTF-8 sequence.
523///
524pub fn open_utf8(state: &mut LuaState) -> Result<usize, LuaError> {
525    // TODO(port): new_lib(funcs) API on LuaState not yet confirmed; expected to
526    // create a new table and register all (name, fn) pairs from FUNCS.
527    state.new_lib(FUNCS)?;
528
529    let patt = state.intern_str(UTF8_PATT)?;
530    state.push(LuaValue::Str(patt));
531
532    // TODO(port): set_field(table_idx, field_name) API on LuaState not yet confirmed.
533    state.set_field(-2, b"charpattern")?;
534
535    Ok(1)
536}
537
538// ──────────────────────────────────────────────────────────────────────────
539// PORT STATUS
540//   source:        src/lutf8lib.c  (291 lines, 9 functions)
541//   target_crate:  lua-stdlib
542//   confidence:    medium
543//   todos:         13
544//   port_notes:    2
545//   unsafe_blocks: 0   (must be 0 outside explicit unsafe-budget crates)
546//   notes:         Core UTF-8 logic (utf8_decode, encode_utf8_codepoint,
547//                  pos_relat, is_cont_at) is a faithful translation and should
548//                  be correct. All 13 TODOs are unresolved LuaState API names:
549//                  opt_arg_integer, to_boolean, stack_top, push_value_at,
550//                  new_lib, set_field, and to_integer — Phase B reconciles
551//                  these against the actual method signatures. No unsafe
552//                  blocks; NUL-terminator reliance in C replaced by Rust
553//                  bounds checks throughout.
554// ──────────────────────────────────────────────────────────────────────────