lua_stdlib/utf8_lib.rs
1//! UTF-8 standard library for Lua 5.4.
2//!
3//! Port of `lutf8lib.c` (291 lines, 9 functions).
4//!
5//! Provides the `utf8` module with `char`, `codepoint`, `codes`, `len`,
6//! `offset`, and `charpattern`. Supports both strict (Unicode-conformant)
7//! and lax (extended UTF-8, up to `MAX_UTF = 0x7FFFFFFF`) decoding modes.
8//!
9//! Strict mode rejects surrogates (U+D800..U+DFFF) and values above U+10FFFF.
10//! Lax mode accepts any well-formed byte sequence with a value ≤ MAX_UTF.
11
12use lua_types::error::LuaError;
13use lua_types::value::LuaValue;
14use crate::state_stub::{LuaState, LuaStateStubExt as _};
15
/// Integer type for decoded values; `u32` covers the 31 bits `MAX_UTF` needs.
type UtfInt = u32;

/// Largest code point accepted in strict (Unicode-conformant) mode.
const MAX_UNICODE: UtfInt = 0x10_FFFF;

/// Largest value accepted in lax (extended UTF-8) mode.
const MAX_UTF: UtfInt = 0x7FFF_FFFF;

/// Lua pattern that matches exactly one UTF-8 byte sequence.
///
/// The pattern is 14 bytes long and its first character class starts with an
/// embedded NUL byte, which is why it is kept as a byte string.
const UTF8_PATT: &[u8] = b"[\0-\x7F\xC2-\xFD][\x80-\xBF]*";
25
26// ── Internal helpers ───────────────────────────────────────────────────────
27
/// Resolve a possibly-negative Lua string position against a string of `len` bytes.
///
/// * Non-negative positions are returned unchanged (no clamping to `len`).
/// * Negative positions count back from the end, so `-1` maps to `len`.
/// * Negative positions whose magnitude exceeds `len` map to `0`.
fn pos_relat(pos: i64, len: usize) -> i64 {
    if pos >= 0 {
        return pos;
    }
    // `unsigned_abs` also handles `i64::MIN`, whose magnitude does not fit in `i64`.
    match pos.unsigned_abs() {
        magnitude if magnitude > len as u64 => 0,
        _ => len as i64 + pos + 1,
    }
}
43
/// Tell whether `c` is a UTF-8 continuation byte, i.e. has the bit pattern
/// `10xxxxxx` (`0x80..=0xBF`).
#[inline]
fn is_cont(c: u8) -> bool {
    // The two most significant bits must be exactly `10`.
    c >> 6 == 0b10
}
50
51/// Return `true` if the byte at 0-based index `pos` in `s` is a continuation
52/// byte, treating out-of-bounds positions as non-continuation.
53///
54/// C strings carry a NUL terminator that is never a continuation byte;
55/// the bounds-check here replaces that guarantee.
56#[inline]
57fn is_cont_at(s: &[u8], pos: i64) -> bool {
58 if pos < 0 {
59 return false;
60 }
61 s.get(pos as usize).map_or(false, |&b| is_cont(b))
62}
63
64/// Decode one UTF-8 sequence from the start of `s`.
65///
66/// Returns `None` if the byte sequence is invalid.
67/// Returns `Some((remaining_slice, codepoint))` on success.
68///
69/// When `strict` is `true`, surrogates and values above `MAX_UNICODE` are
70/// rejected. When `false`, any value ≤ `MAX_UTF` is accepted (extended UTF-8).
71///
72fn utf8_decode(s: &[u8], strict: bool) -> Option<(&[u8], UtfInt)> {
73 // LIMITS[count] is the minimum value for a sequence with `count` continuation bytes.
74 // LIMITS[0] = u32::MAX forces an error when a non-ASCII byte has no continuation bytes.
75 const LIMITS: [UtfInt; 6] = [u32::MAX, 0x80, 0x800, 0x10000, 0x200000, 0x4000000];
76
77 if s.is_empty() {
78 return None;
79 }
80
81 let mut c = s[0] as u32;
82 let res: UtfInt;
83 let advance: usize;
84
85 if c < 0x80 {
86 // ASCII fast path — no continuation bytes needed.
87 res = c;
88 advance = 1;
89 } else {
90 let mut count: usize = 0;
91 let mut r: UtfInt = 0;
92
93 // The C for-loop runs the body first, then applies `c <<= 1` as the update.
94 while c & 0x40 != 0 {
95 count += 1;
96 if count >= s.len() {
97 return None; // string too short for the indicated sequence length
98 }
99 let cc = s[count] as u32;
100
101 if (cc & 0xC0) != 0x80 {
102 return None; // expected continuation byte, got something else
103 }
104
105 r = (r << 6) | (cc & 0x3F);
106
107 // C for-loop update: c <<= 1
108 c <<= 1;
109 }
110
111 r |= (c & 0x7F) << (count as u32 * 5);
112
113 if count > 5 || r > MAX_UTF || r < LIMITS[count] {
114 return None; // invalid (overlong, too large, or excess continuation bytes)
115 }
116
117 res = r;
118 advance = count + 1;
119 if advance > s.len() {
120 return None;
121 }
122 }
123
124 if strict && (res > MAX_UNICODE || (0xD800 <= res && res <= 0xDFFF)) {
125 return None; // surrogate or out-of-Unicode-range value in strict mode
126 }
127
128 Some((&s[advance..], res))
129}
130
131/// Encode a codepoint (≤ `MAX_UTF`) as extended UTF-8 bytes.
132///
133/// Mirrors `luaO_utf8esc` from `lobject.c`, which fills a fixed buffer backwards.
134/// This Rust version builds the bytes naturally and returns a `Vec<u8>`.
135///
136fn encode_utf8_codepoint(code: u32) -> Vec<u8> {
137 debug_assert!(code <= MAX_UTF);
138
139 if code < 0x80 {
140 return vec![code as u8];
141 }
142
143 let mut x = code;
144 let mut mfb: u32 = 0x3F;
145 // Continuation bytes built in reverse, then reversed at the end.
146 let mut bytes_rev: Vec<u8> = Vec::with_capacity(6);
147
148 // while (x > mfb);
149 loop {
150 bytes_rev.push(0x80 | (x & 0x3F) as u8);
151 x >>= 6;
152 mfb >>= 1;
153 if x <= mfb {
154 break;
155 }
156 }
157
158 // wrapping_shl avoids a Rust debug-mode overflow panic on `!mfb << 1`
159 // (e.g., !0x1Fu32 = 0xFFFF_FFE0; << 1 = 0xFFFF_FFC0; as u8 = 0xC0).
160 let leading = ((!mfb).wrapping_shl(1) as u8) | (x as u8);
161
162 let mut result = Vec::with_capacity(bytes_rev.len() + 1);
163 result.push(leading);
164 for &b in bytes_rev.iter().rev() {
165 result.push(b);
166 }
167 result
168}
169
170// ── Library functions ──────────────────────────────────────────────────────
171
172/// `utf8.len(s [, i [, j [, lax]]])` → integer | (nil, integer)
173///
174/// Returns the number of UTF-8 characters that start in the byte range `[i,j]`
175/// of string `s` (1-based, defaulting to the whole string).
176/// On a malformed sequence, returns `(nil, position)` where `position` is the
177/// 1-based byte offset of the first bad byte.
178///
179fn utf_len(state: &mut LuaState) -> Result<usize, LuaError> {
180 // Clone to avoid holding a borrow across subsequent mutable state calls.
181 let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
182 let len = s.len();
183
184 let raw_posi: i64 = state.opt_arg_integer(2, 1)?;
185 let mut posi: i64 = pos_relat(raw_posi, len);
186
187 let raw_posj: i64 = state.opt_arg_integer(3, -1)?;
188 let mut posj: i64 = pos_relat(raw_posj, len);
189
190 let lax: bool = state.to_boolean(4);
191
192 // Note: C short-circuits, so --posi only executes when 1 <= posi.
193 if posi < 1 {
194 return Err(LuaError::arg_error(2, "initial position out of bounds"));
195 }
196 posi -= 1; // 1-based → 0-based
197 if posi > len as i64 {
198 return Err(LuaError::arg_error(2, "initial position out of bounds"));
199 }
200
201 posj -= 1; // 1-based → 0-based (always decremented, no short-circuit)
202 if posj >= len as i64 {
203 return Err(LuaError::arg_error(3, "final position out of bounds"));
204 }
205
206 let mut n: i64 = 0;
207
208 while posi <= posj {
209 match utf8_decode(&s[posi as usize..], !lax) {
210 None => {
211 state.push(LuaValue::Nil); // luaL_pushfail
212 state.push(LuaValue::Int(posi + 1)); // 1-based position of failure
213 return Ok(2);
214 }
215 Some((remaining, _)) => {
216 posi = (len - remaining.len()) as i64;
217 n += 1;
218 }
219 }
220 }
221
222 state.push(LuaValue::Int(n));
223 Ok(1)
224}
225
226/// `utf8.codepoint(s [, i [, j [, lax]]])` → integer, ...
227///
228/// Returns the codepoints (as integers) for all characters starting in `s[i..j]`.
229///
230fn codepoint(state: &mut LuaState) -> Result<usize, LuaError> {
231 let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
232 let len = s.len();
233
234 let raw_posi: i64 = state.opt_arg_integer(2, 1)?;
235 let posi: i64 = pos_relat(raw_posi, len);
236
237 // Default for the end position is posi (1-based), giving a single character.
238 let raw_pose: i64 = state.opt_arg_integer(3, posi)?;
239 let pose: i64 = pos_relat(raw_pose, len);
240
241 let lax: bool = state.to_boolean(4);
242
243 if posi < 1 {
244 return Err(LuaError::arg_error(2, "out of bounds"));
245 }
246
247 if pose > len as i64 {
248 return Err(LuaError::arg_error(3, "out of bounds"));
249 }
250
251 if posi > pose {
252 return Ok(0); // empty interval: no values
253 }
254
255 if pose - posi >= i32::MAX as i64 {
256 return Err(LuaError::runtime(format_args!("string slice too long")));
257 }
258
259 let n_max = (pose - posi + 1) as i32;
260 state.ensure_stack(n_max, "string slice too long")?;
261
262 // 0-based: start at (posi - 1), stop before byte index `pose`.
263 let mut pos: usize = (posi - 1) as usize; // 0-based start
264 let end: usize = pose as usize; // 0-based exclusive end
265 let mut count: usize = 0;
266
267 while pos < end {
268 match utf8_decode(&s[pos..], !lax) {
269 None => return Err(LuaError::runtime(format_args!("invalid UTF-8 code"))),
270 Some((remaining, code)) => {
271 state.push(LuaValue::Int(code as i64));
272 count += 1;
273 pos = len - remaining.len(); // advance by decoded character width
274 }
275 }
276 }
277
278 Ok(count)
279}
280
281/// Encode the codepoint at stack argument `arg` and return the UTF-8 bytes.
282///
283/// `Vec<u8>` directly rather than pushing to the stack, avoiding the push/pop
284/// dance that `luaL_Buffer` required.
285///
286/// PORT NOTE: C's `pushutfchar` called `lua_pushfstring(L, "%U", code)` to encode
287/// and push in one step. Here the encoding is extracted so `utf_char` can build
288/// the concatenated result without intermediate stack operations.
289fn get_utf_char_bytes(state: &mut LuaState, arg: i32) -> Result<Vec<u8>, LuaError> {
290 let code = state.check_arg_integer(arg)? as u64;
291
292 if code > MAX_UTF as u64 {
293 return Err(LuaError::arg_error(arg, "value out of range"));
294 }
295
296 Ok(encode_utf8_codepoint(code as u32))
297}
298
299/// `utf8.char(n1, n2, ...)` → string
300///
301/// Returns a string formed by the UTF-8 encoding of the given codepoints.
302///
303fn utf_char(state: &mut LuaState) -> Result<usize, LuaError> {
304 let n: i32 = state.stack_top() as i32;
305
306 if n == 1 {
307 let bytes = get_utf_char_bytes(state, 1)?;
308 let s = state.intern_str(&bytes)?;
309 state.push(LuaValue::Str(s));
310 } else {
311 // for (i = 1; i <= n; i++) { pushutfchar(L, i); luaL_addvalue(&b); }
312 // luaL_pushresult(&b);
313 // PORT NOTE: luaL_Buffer replaced by Vec<u8>; codepoints are encoded
314 // directly into the accumulator without intermediate stack push/pop.
315 let mut buf: Vec<u8> = Vec::new();
316 for i in 1..=n {
317 buf.extend_from_slice(&get_utf_char_bytes(state, i)?);
318 }
319 let s = state.intern_str(&buf)?;
320 state.push(LuaValue::Str(s));
321 }
322
323 Ok(1)
324}
325
326/// `utf8.offset(s, n [, i])` → integer | nil
327///
328/// Returns the byte offset where the n-th character (counting from position `i`)
329/// starts. Negative `n` counts from the end. `n == 0` returns the start of the
330/// character that contains position `i`.
331/// Returns `nil` if the character cannot be found.
332///
333fn byte_offset(state: &mut LuaState) -> Result<usize, LuaError> {
334 let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
335 let len = s.len();
336
337 let n: i64 = state.check_arg_integer(2)?;
338
339 let default_posi: i64 = if n >= 0 { 1 } else { len as i64 + 1 };
340
341 let raw_posi: i64 = state.opt_arg_integer(3, default_posi)?;
342 let posi_1based: i64 = pos_relat(raw_posi, len);
343
344 if posi_1based < 1 {
345 return Err(LuaError::arg_error(3, "position out of bounds"));
346 }
347 let mut posi: i64 = posi_1based - 1; // 1-based → 0-based
348 if posi > len as i64 {
349 return Err(LuaError::arg_error(3, "position out of bounds"));
350 }
351
352 // `count` is a mutable copy of `n`; driven to 0 when the target character is found.
353 let mut count = n;
354
355 if count == 0 {
356 // Scan backward to find the start of the character containing `posi`.
357 while posi > 0 && is_cont_at(&s, posi) {
358 posi -= 1;
359 }
360 // count remains 0
361 } else {
362 if is_cont_at(&s, posi) {
363 return Err(LuaError::runtime(format_args!(
364 "initial position is a continuation byte"
365 )));
366 }
367
368 if count < 0 {
369 // do { posi--; } while (posi > 0 && iscontp(s + posi));
370 // n++;
371 // }
372 while count < 0 && posi > 0 {
373 // do-while: always decrements at least once, then skips back over
374 // any continuation bytes to land on a leading byte.
375 loop {
376 posi -= 1;
377 if posi == 0 || !is_cont_at(&s, posi) {
378 break;
379 }
380 }
381 count += 1;
382 }
383 } else {
384 // while (n > 0 && posi < (lua_Integer)len) {
385 // do { posi++; } while (iscontp(s + posi)); /* cannot pass '\0' */
386 // n--;
387 // }
388 count -= 1; // do not move for the 1st character
389 while count > 0 && posi < len as i64 {
390 // C relies on the NUL terminator to stop the inner do-while.
391 // Rust uses an explicit bounds check instead.
392 loop {
393 posi += 1;
394 if !is_cont_at(&s, posi) {
395 break;
396 }
397 }
398 count -= 1;
399 }
400 }
401 }
402
403 if count == 0 {
404 state.push(LuaValue::Int(posi + 1)); // 0-based → 1-based
405 } else {
406 state.push(LuaValue::Nil); // luaL_pushfail: character not found
407 }
408 Ok(1)
409}
410
411/// Internal iterator body shared by `iter_aux_strict` and `iter_aux_lax`.
412///
413/// Stack on entry (from the generic for): (1) string, (2) current byte position
414/// (0-based; initially pushed as 0 by `iter_codes`).
415///
416/// Advances past any leading continuation bytes, decodes the next character,
417/// and returns `(next_1based_pos, codepoint)`. Returns nothing (0) when the
418/// string is exhausted.
419///
420fn iter_aux(state: &mut LuaState, strict: bool) -> Result<usize, LuaError> {
421 let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
422 let len = s.len();
423
424 let mut n: u64 = state.to_integer(2).unwrap_or(0) as u64;
425
426 if (n as usize) < len {
427 while (n as usize) < len && is_cont(s[n as usize]) {
428 n += 1;
429 }
430 }
431
432 if (n as usize) >= len {
433 return Ok(0); // no more codepoints
434 }
435
436 // if (next == NULL || iscontp(next)) return luaL_error(L, MSGInvalid);
437 match utf8_decode(&s[n as usize..], strict) {
438 None => Err(LuaError::runtime(format_args!("invalid UTF-8 code"))),
439 Some((remaining, code)) => {
440 let next_pos = len - remaining.len(); // 0-based index of the next character
441 // valid sequence indicates a malformed input stream.
442 if next_pos < len && is_cont(s[next_pos]) {
443 return Err(LuaError::runtime(format_args!("invalid UTF-8 code")));
444 }
445 state.push(LuaValue::Int((n + 1) as i64)); // 1-based position for next iteration
446 state.push(LuaValue::Int(code as i64));
447 Ok(2)
448 }
449 }
450}
451
452/// Strict iterator body: rejects surrogates and values > MAX_UNICODE.
453///
454fn iter_aux_strict(state: &mut LuaState) -> Result<usize, LuaError> {
455 iter_aux(state, true)
456}
457
458/// Lax iterator body: accepts extended UTF-8 up to MAX_UTF.
459///
460fn iter_aux_lax(state: &mut LuaState) -> Result<usize, LuaError> {
461 iter_aux(state, false)
462}
463
464/// `utf8.codes(s [, lax])` → function, string, integer
465///
466/// Returns the iterator triple `(f, s, 0)` for use in a generic for loop.
467/// Each call to `f(s, pos)` returns the next `(pos, codepoint)` pair.
468///
469fn iter_codes(state: &mut LuaState) -> Result<usize, LuaError> {
470 let lax: bool = state.to_boolean(2);
471
472 let s: Vec<u8> = state.check_arg_string(1)?.to_vec();
473
474 if s.first().map_or(false, |&b| is_cont(b)) {
475 return Err(LuaError::arg_error(1, "invalid UTF-8 code"));
476 }
477
478 let iter_fn: fn(&mut LuaState) -> Result<usize, LuaError> =
479 if lax { iter_aux_lax } else { iter_aux_strict };
480 state.push_c_function(iter_fn)?;
481
482 state.push_value_at(1)?;
483
484 state.push(LuaValue::Int(0));
485
486 Ok(3)
487}
488
489// ── Library registration ───────────────────────────────────────────────────
490
491/// Function registration table for the `utf8` library.
492///
493/// "charpattern" is intentionally absent here; it is a string value and is
494/// registered separately inside `open_utf8` via `lua_setfield`.
495pub const FUNCS: &[(&[u8], fn(&mut LuaState) -> Result<usize, LuaError>)] = &[
496 (b"offset", byte_offset),
497 (b"codepoint", codepoint),
498 (b"char", utf_char),
499 (b"len", utf_len),
500 (b"codes", iter_codes),
501];
502
503/// Open the `utf8` library.
504///
505/// Registers all functions from `FUNCS` into a new table, then sets
506/// `utf8.charpattern` to the byte-string pattern matching one UTF-8 sequence.
507///
508pub fn open_utf8(state: &mut LuaState) -> Result<usize, LuaError> {
509 state.new_lib(FUNCS)?;
510
511 let patt = state.intern_str(UTF8_PATT)?;
512 state.push(LuaValue::Str(patt));
513
514 state.set_field(-2, b"charpattern")?;
515
516 Ok(1)
517}
518
519// ──────────────────────────────────────────────────────────────────────────
520// PORT STATUS
521// source: src/lutf8lib.c (291 lines, 9 functions)
522// target_crate: lua-stdlib
523// confidence: high
524// todos: 0
525// unsafe_blocks: 0 (must be 0 outside explicit unsafe-budget crates)
526// notes: Core UTF-8 logic (utf8_decode, encode_utf8_codepoint,
527// pos_relat, is_cont_at) is a faithful translation. LuaState
528// API names reconciled against state_stub overrides. No unsafe
529// blocks; NUL-terminator reliance in C replaced by Rust bounds
530// checks throughout.
531// ──────────────────────────────────────────────────────────────────────────