luars 0.17.0

A library implementing the Lua 5.5 runtime in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
// UTF-8 library
// Implements: char, charpattern, codes, codepoint, len, offset

use crate::lib_registry::LibraryModule;
use crate::lua_value::LuaValue;
use crate::lua_vm::LuaResult;
use crate::lua_vm::LuaState;

/// Builds the `utf8` library module.
///
/// Registers the five library functions (`len`, `char`, `codes`, `codepoint`,
/// `offset`) and attaches the `charpattern` constant as a binary value.
pub fn create_utf8_lib() -> LibraryModule {
    // Byte-level Lua pattern that matches exactly one UTF-8 byte sequence.
    // Spelled out as a string it reads: "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
    const CHAR_PATTERN: &[u8] = &[
        b'[', 0x00, b'-', 0x7F, 0xC2, b'-', 0xFD, b']', b'[', 0x80, b'-', 0xBF, b']', b'*',
    ];

    let functions = crate::lib_module!("utf8", {
        "len" => utf8_len,
        "char" => utf8_char,
        "codes" => utf8_codes,
        "codepoint" => utf8_codepoint,
        "offset" => utf8_offset,
    });

    // The pattern contains bytes that are not valid UTF-8 on their own, so it is
    // created through `create_binary` rather than as a text string.
    functions.with_value("charpattern", |vm| vm.create_binary(CHAR_PATTERN.to_vec()))
}

/// Helper: translate a relative string position (negative means back from end)
/// Matches C Lua's u_posrelat
///
/// * `pos` - position as given by the script; non-negative values are returned
///   unchanged, negative values count back from the end (`-1` is the last byte).
/// * `len` - length of the subject string in bytes.
///
/// Returns the translated position, or `0` when a negative `pos` reaches back
/// past the start of the string.
#[inline]
fn u_posrelat(pos: i64, len: usize) -> i64 {
    if pos >= 0 {
        return pos;
    }
    // `unsigned_abs` is defined for every i64, including i64::MIN, whereas
    // negating i64::MIN overflows (a panic in debug builds).
    let back = pos.unsigned_abs();
    if back > len as u64 {
        0
    } else {
        // `back <= len` here, so the sum lies in 1..=len and cannot overflow.
        len as i64 + pos + 1
    }
}

/// Returns true when `b` is a UTF-8 continuation byte (bit pattern `10xxxxxx`).
#[inline]
fn iscont(b: u8) -> bool {
    (0x80..=0xBF).contains(&b)
}

/// Largest valid Unicode code point.
const MAXUNICODE: u32 = 0x10FFFF;
/// Largest value representable by the extended (up to six byte) encoding.
const MAXUTF: u32 = 0x7FFFFFFF;

/// Decode one UTF-8 sequence from the start of `s`.
///
/// Returns `(codepoint, byte_length)` on success. Every failure (empty input,
/// bad lead byte, missing or malformed continuation bytes, overlong encoding,
/// value above `MAXUTF`) yields the same `"invalid UTF-8 code"` message.
/// When `strict` is true, surrogates and values above `MAXUNICODE` are
/// rejected as well.
fn decode_utf8(s: &[u8], strict: bool) -> Result<(u32, usize), String> {
    // Smallest value that legitimately needs N continuation bytes; anything
    // below it is an overlong encoding. Slot 0 is never consulted.
    const MIN_VALUE: [u32; 6] = [u32::MAX, 0x80, 0x800, 0x10000, 0x200000, 0x4000000];

    let invalid = || -> Result<(u32, usize), String> { Err("invalid UTF-8 code".to_string()) };

    let Some((&lead, tail)) = s.split_first() else {
        return invalid();
    };
    if lead.is_ascii() {
        return Ok((u32::from(lead), 1));
    }

    // The number of continuation bytes equals the run of 1-bits that follows
    // the top bit of the lead byte; shifting that run to the top lets
    // `leading_ones` count it directly.
    let extra = (lead << 1).leading_ones() as usize;
    if !(1..=5).contains(&extra) {
        return invalid();
    }
    let Some(continuation) = tail.get(..extra) else {
        return invalid();
    };

    // Payload bits of the lead byte, then six bits from each continuation byte.
    let mut code = u32::from(lead & (0x7Fu8 >> extra));
    for &byte in continuation {
        if !iscont(byte) {
            return invalid();
        }
        code = (code << 6) | u32::from(byte & 0x3F);
    }

    let overlong_or_too_big = code < MIN_VALUE[extra] || code > MAXUTF;
    let rejected_by_strict = strict && (code > MAXUNICODE || (0xD800..=0xDFFF).contains(&code));
    if overlong_or_too_big || rejected_by_strict {
        return invalid();
    }
    Ok((code, extra + 1))
}

/// Encode a codepoint into extended UTF-8 bytes (supports up to 0x7FFFFFFF)
///
/// The sequence is produced back to front: continuation bytes are peeled off
/// the low end of `x` until what remains fits in the lead byte, and the lead
/// byte is written last.
fn encode_utf8_extended(x: u32) -> Vec<u8> {
    if x < 0x80 {
        return vec![x as u8];
    }

    // Eight bytes is enough for any u32 (at most six continuation bytes + lead).
    let mut scratch = [0u8; 8];
    let mut start = scratch.len();
    let mut remaining = x;
    // Largest value that still fits next to the length marker in the lead byte.
    let mut lead_capacity: u32 = 0x3f;

    loop {
        start -= 1;
        scratch[start] = 0x80 | (remaining & 0x3f) as u8;
        remaining >>= 6;
        // Every extra continuation byte costs one payload bit in the lead byte.
        lead_capacity >>= 1;
        if remaining <= lead_capacity {
            break;
        }
    }

    // Lead byte: length marker bits on top, leftover payload bits below.
    start -= 1;
    scratch[start] = ((!lead_capacity << 1) | remaining) as u8;

    scratch[start..].to_vec()
}

/// utf8.len(s [, i [, j [, lax]]]) - Counts the UTF-8 characters that start
/// between byte positions `i` and `j` (both inclusive, defaults 1 and -1).
///
/// Pushes the count on success. If an invalid byte sequence is met, pushes
/// nil plus the 1-based position of the first invalid byte instead.
/// Raises an error when argument 1 is not a string or a position is out of bounds.
fn utf8_len(l: &mut LuaState) -> LuaResult<usize> {
    let Some(subject) = l.get_arg(1) else {
        return Err(l.error("bad argument #1 to 'len' (string expected)".to_string()));
    };
    let Some(bytes) = subject.as_bytes() else {
        return Err(l.error("bad argument #1 to 'len' (string expected)".to_string()));
    };

    let byte_len = bytes.len();
    let limit = byte_len as i64;
    let strict = !l.get_arg(4).and_then(|v| v.as_bool()).unwrap_or(false);

    // Resolve the (possibly negative) script positions like C Lua's u_posrelat.
    let first = u_posrelat(l.get_arg(2).and_then(|v| v.as_integer()).unwrap_or(1), byte_len);
    let last = u_posrelat(l.get_arg(3).and_then(|v| v.as_integer()).unwrap_or(-1), byte_len);

    // The start may point one past the last byte; that simply yields a count of 0.
    if first < 1 || first - 1 > limit {
        return Err(
            l.error("bad argument #2 to 'len' (initial position out of bounds)".to_string())
        );
    }
    let stop = last - 1;
    if stop >= limit {
        return Err(l.error("bad argument #3 to 'len' (final position out of bounds)".to_string()));
    }

    // Both bounds stay signed: `stop` is -1 for an empty range, and converting
    // that to usize would turn it into a huge positive index.
    let mut cursor = first - 1;
    let mut count: i64 = 0;
    while cursor <= stop {
        let offset = cursor as usize;
        if offset >= byte_len {
            break;
        }
        let Ok((_, width)) = decode_utf8(&bytes[offset..], strict) else {
            // Decoding failed: report nil and the offending 1-based position.
            l.push_value(LuaValue::nil())?;
            l.push_value(LuaValue::integer(offset as i64 + 1))?;
            return Ok(2);
        };
        cursor += width as i64;
        count += 1;
    }

    l.push_value(LuaValue::integer(count))?;
    Ok(1)
}

/// utf8.char(...) - Concatenates the UTF-8 encodings of all integer arguments
/// into a single string and pushes it.
///
/// Each argument must be an integer in `0..=MAXUTF` (0x7FFFFFFF). Surrogates
/// and values above 0x10FFFF are written with the extended encoding.
/// Raises an error for a non-integer argument or a value outside that range.
fn utf8_char(l: &mut LuaState) -> LuaResult<usize> {
    let args = l.get_args();

    let mut result_bytes: Vec<u8> = Vec::new();
    for arg in args {
        let Some(code) = arg.as_integer() else {
            return Err(l.error("bad argument to 'char' (number expected)".to_string()));
        };
        // Range-check while the value is still an i64. Narrowing to u32 first
        // would drop the high bits, so e.g. 0x1_0000_0041 would wrongly pass
        // the check and be encoded as 'A'.
        if !(0..=i64::from(MAXUTF)).contains(&code) {
            return Err(l.error("bad argument to 'char' (value out of range)".to_string()));
        }
        // Lossless: the check above guarantees 0 <= code <= 0x7FFFFFFF.
        let code = code as u32;
        if let Some(ch) = char::from_u32(code) {
            let mut buf = [0u8; 4];
            let s = ch.encode_utf8(&mut buf);
            result_bytes.extend_from_slice(s.as_bytes());
        } else {
            // Extended encoding for surrogates and values > 0x10FFFF
            result_bytes.extend_from_slice(&encode_utf8_extended(code));
        }
    }

    let val = l.create_bytes(&result_bytes)?;
    l.push_value(val)?;
    Ok(1)
}

/// utf8.codes(s [, lax]) - Returns an iterator for UTF-8 characters
///
/// Pushes three values for a generic `for`: the iterator function, a state
/// table, and nil. The iterator yields `(position, codepoint)` pairs, where
/// `position` is the 1-based byte offset of each character.
///
/// When `lax` is true the iterator accepts surrogates and values above
/// 0x10FFFF; otherwise such sequences raise "invalid UTF-8 code".
/// Raises an error when argument 1 is not a string.
fn utf8_codes(l: &mut LuaState) -> LuaResult<usize> {
    let s_value = l
        .get_arg(1)
        .ok_or_else(|| l.error("bad argument #1 to 'codes' (string expected)".to_string()))?;
    if s_value.as_bytes().is_none() {
        return Err(l.error("bad argument #1 to 'codes' (string expected)".to_string()));
    }

    // Read the flag the same way utf8.len and utf8.codepoint read theirs.
    let lax = l.get_arg(2).and_then(|v| v.as_bool()).unwrap_or(false);

    // State table: [1] = subject string, [2] = 0-based byte offset of the next character
    let state_table = l.create_table(2, 0)?;
    let string_key = LuaValue::integer(1);
    let position_key = LuaValue::integer(2);

    if let Some(table) = state_table.as_table_mut() {
        table.raw_set(&string_key, s_value);
        table.raw_set(&position_key, LuaValue::integer(0));
    }

    // Strictness is selected once, by choosing which iterator function to hand out.
    let iterator = if lax {
        LuaValue::cfunction(utf8_codes_iterator_lax)
    } else {
        LuaValue::cfunction(utf8_codes_iterator)
    };
    l.push_value(iterator)?;
    l.push_value(state_table)?;
    l.push_value(LuaValue::nil())?;
    Ok(3)
}

/// Iterator function for utf8.codes (strict: rejects surrogates and values > 0x10FFFF)
fn utf8_codes_iterator(l: &mut LuaState) -> LuaResult<usize> {
    utf8_codes_step(l, true)
}

/// Iterator function for utf8.codes(s, true) (lax: accepts values up to 0x7FFFFFFF)
fn utf8_codes_iterator_lax(l: &mut LuaState) -> LuaResult<usize> {
    utf8_codes_step(l, false)
}

/// Shared body of both utf8.codes iterators.
///
/// Reads the subject string and the current byte offset from the state table
/// (argument 1), decodes one character there, stores the offset of the next
/// character back into the table, and pushes `(1-based position, codepoint)`.
/// Pushes a single nil once the offset reaches the end of the string.
/// Raises an error if the state table is malformed or the bytes do not decode.
fn utf8_codes_step(l: &mut LuaState, strict: bool) -> LuaResult<usize> {
    let t_value = l
        .get_arg(1)
        .ok_or_else(|| l.error("utf8.codes iterator: invalid state".to_string()))?;

    let string_key = 1;
    let position_key = 2;

    // Extract string and position from state table
    let Some(table) = t_value.as_table() else {
        return Err(l.error("utf8.codes iterator: invalid state".to_string()));
    };

    let Some(s_val) = table.raw_geti(string_key) else {
        return Err(l.error("utf8.codes iterator: string not found".to_string()));
    };

    // Accept both string and binary
    let Some(bytes) = s_val.as_bytes() else {
        return Err(l.error("utf8.codes iterator: invalid string".to_string()));
    };

    // A missing or non-integer offset restarts from the beginning; a negative
    // one wraps to a huge usize and therefore ends the iteration below.
    let pos = table
        .raw_geti(position_key)
        .and_then(|v| v.as_integer())
        .unwrap_or(0) as usize;

    if pos >= bytes.len() {
        l.push_value(LuaValue::nil())?;
        return Ok(1);
    }

    // Decode next UTF-8 character using decode_utf8
    let remaining = &bytes[pos..];
    match decode_utf8(remaining, strict) {
        Ok((code_point, char_len)) => {
            // Update position in the state table
            l.raw_seti(
                &t_value,
                position_key,
                LuaValue::integer((pos + char_len) as i64),
            );

            l.push_value(LuaValue::integer((pos + 1) as i64))?; // 1-based position
            l.push_value(LuaValue::integer(code_point as i64))?;
            Ok(2)
        }
        Err(e) => Err(l.error(e)),
    }
}

/// utf8.codepoint(s [, i [, j [, lax]]]) - Pushes the code point of every
/// character that starts between byte positions `i` and `j` (inclusive).
///
/// `i` defaults to 1 and `j` defaults to `i`; both are resolved with
/// u_posrelat as in C Lua's codepoint(). Returns the number of values pushed
/// (zero for an empty interval). Raises an error when argument 1 is not a
/// string, a position is out of bounds, or a byte sequence fails to decode.
fn utf8_codepoint(l: &mut LuaState) -> LuaResult<usize> {
    let Some(subject) = l.get_arg(1) else {
        return Err(l.error("bad argument #1 to 'codepoint' (string expected)".to_string()));
    };
    // String and binary values both expose their bytes here.
    let Some(bytes) = subject.as_bytes() else {
        return Err(l.error("bad argument #1 to 'codepoint' (string expected)".to_string()));
    };
    let byte_len = bytes.len();

    let first = u_posrelat(l.get_arg(2).and_then(|v| v.as_integer()).unwrap_or(1), byte_len);
    // The default for `j` is the already-resolved `i`, so a bare call yields one character.
    let last = u_posrelat(l.get_arg(3).and_then(|v| v.as_integer()).unwrap_or(first), byte_len);
    let strict = !l.get_arg(4).and_then(|v| v.as_bool()).unwrap_or(false);

    if first < 1 {
        return Err(l.error("bad argument #2 to 'codepoint' (out of bounds)".to_string()));
    }
    if last > byte_len as i64 {
        return Err(l.error("bad argument #3 to 'codepoint' (out of bounds)".to_string()));
    }
    if first > last {
        // Empty interval: nothing to push.
        return Ok(0);
    }

    // `last` is 1-based inclusive, which equals the 0-based exclusive end.
    let end = last as usize;
    let mut cursor = (first - 1) as usize;
    let mut pushed = 0;

    while cursor < end {
        match decode_utf8(&bytes[cursor..], strict) {
            Ok((code, width)) => {
                l.push_value(LuaValue::integer(code as i64))?;
                pushed += 1;
                cursor += width;
            }
            Err(message) => return Err(l.error(message)),
        }
    }

    Ok(pushed)
}

/// utf8.offset(s, n [, i]) - Returns byte position where n-th character
/// counting from position 'i' starts and ends; 0 means character at 'i'.
/// Follows C Lua 5.5's byteoffset() exactly.
///
/// Arguments (read from the Lua stack):
/// * `s` - subject string (argument 1); must expose bytes via `as_bytes`.
/// * `n` - character count (argument 2); negative counts backwards,
///   zero selects the character containing position `i`.
/// * `i` - starting byte position (argument 3); defaults to 1 when `n >= 0`
///   and to `len + 1` otherwise. Negative values are resolved by `u_posrelat`.
///
/// Pushes two integers (1-based start and end byte positions of the
/// character) and returns 2, or pushes a single nil and returns 1 when the
/// requested character does not exist inside the string.
///
/// Raises an error when an argument has the wrong type, when `i` is out of
/// bounds, or when the selected position is a continuation byte.
fn utf8_offset(l: &mut LuaState) -> LuaResult<usize> {
    let s_value = l
        .get_arg(1)
        .ok_or_else(|| l.error("bad argument #1 to 'offset' (string expected)".to_string()))?;
    let Some(bytes) = s_value.as_bytes() else {
        return Err(l.error("bad argument #1 to 'offset' (string expected)".to_string()));
    };

    let n_value = l
        .get_arg(2)
        .ok_or_else(|| l.error("bad argument #2 to 'offset' (number expected)".to_string()))?;
    let Some(n) = n_value.as_integer() else {
        return Err(l.error("bad argument #2 to 'offset' (number expected)".to_string()));
    };

    let len = bytes.len();

    // Default i: if n >= 0 then 1 else len+1
    let default_i = if n >= 0 { 1i64 } else { len as i64 + 1 };
    let i_raw = l
        .get_arg(3)
        .and_then(|v| v.as_integer())
        .unwrap_or(default_i);
    let mut posi = u_posrelat(i_raw, len);

    // luaL_argcheck: 1 <= posi && --posi <= len
    // The block expression decrements posi as a side effect, so from here on
    // posi is a 0-based byte index; it may equal len (one past the last byte).
    if posi < 1 || {
        posi -= 1;
        posi
    } > len as i64
    {
        return Err(l.error("bad argument #3 to 'offset' (position out of bounds)".to_string()));
    }

    // Shadow n as mutable: it is counted towards zero while walking characters.
    let mut n = n;

    if n == 0 {
        // Find beginning of current byte sequence
        while posi > 0 && (posi as usize) < len && iscont(bytes[posi as usize]) {
            posi -= 1;
        }
    } else {
        // For a non-zero n the starting position must be a character boundary.
        if (posi as usize) < len && iscont(bytes[posi as usize]) {
            return Err(l.error("initial position is a continuation byte".to_string()));
        }
        if n < 0 {
            // Move backwards one character per iteration until n reaches 0
            // or the start of the string is hit.
            while n < 0 && posi > 0 {
                // Find beginning of previous character
                loop {
                    posi -= 1;
                    if posi <= 0 || !iscont(bytes[posi as usize]) {
                        break;
                    }
                }
                n += 1;
            }
        } else {
            n -= 1; // do not move for 1st character
            // Move forwards one character per iteration until n reaches 0
            // or the end of the string is hit.
            while n > 0 && (posi as usize) < len {
                // Find beginning of next character
                loop {
                    posi += 1;
                    if (posi as usize) >= len || !iscont(bytes[posi as usize]) {
                        break;
                    }
                }
                n -= 1;
            }
        }
    }

    // A leftover count means the walk ran off the string before finishing.
    if n != 0 {
        // Did not find given character - return nil
        l.push_value(LuaValue::nil())?;
        return Ok(1);
    }

    // Push initial position (1-based)
    l.push_value(LuaValue::integer(posi + 1))?;

    // Find end position of this character
    let pos_usize = posi as usize;
    if pos_usize < len && (bytes[pos_usize] & 0x80) != 0 {
        // Multi-byte character
        // Reachable for n == 0 when the backward scan stopped at index 0 on a
        // continuation byte.
        if iscont(bytes[pos_usize]) {
            return Err(l.error("initial position is a continuation byte".to_string()));
        }
        // Extend the end over every continuation byte that follows the lead byte.
        let mut end_pos = posi;
        while (end_pos as usize + 1) < len && iscont(bytes[end_pos as usize + 1]) {
            end_pos += 1;
        }
        // Push final position (1-based)
        l.push_value(LuaValue::integer(end_pos + 1))?;
    } else {
        // One-byte character (or position == len+1): final position is the initial one
        l.push_value(LuaValue::integer(posi + 1))?;
    }

    Ok(2)
}