use crate::runtime::Value;
use crate::vm::builtins::{arg_error, raise_str};
use crate::vm::error::LuaError;
use crate::vm::exec::Vm;
pub(crate) fn open_utf8(vm: &mut Vm) {
let t = vm.heap.new_table();
let set = |vm: &mut Vm, name: &str, f| {
let fv = vm.native(f);
let k = Value::Str(vm.heap.intern(name.as_bytes()));
unsafe { t.as_mut() }
.set(&mut vm.heap, k, fv)
.expect("valid key");
};
set(vm, "char", u_char);
set(vm, "codepoint", u_codepoint);
set(vm, "len", u_len);
set(vm, "offset", u_offset);
set(vm, "codes", u_codes);
let k = Value::Str(vm.heap.intern(b"charpattern"));
let v = Value::Str(vm.heap.intern(b"[\x00-\x7F\xC2-\xFD][\x80-\xBF]*"));
unsafe { t.as_mut() }
.set(&mut vm.heap, k, v)
.expect("valid key");
vm.set_global("utf8", Value::Table(t))
.expect("stdlib registration");
vm.barrier_back_table(t);
}
const MAX_UNICODE: u32 = 0x10_FFFF;
fn decode(s: &[u8], i: usize, strict: bool) -> Option<(u32, usize)> {
let c = *s.get(i)?;
if c < 0x80 {
return Some((c as u32, i + 1));
}
if c < 0xC0 {
return None; }
let count = match c {
0xC0..=0xDF => 1,
0xE0..=0xEF => 2,
0xF0..=0xF7 => 3,
0xF8..=0xFB => 4,
0xFC..=0xFD => 5,
_ => return None,
};
let mut cp = (c as u32) & (0x7F >> count);
for k in 1..=count {
let cc = *s.get(i + k)?;
if cc & 0xC0 != 0x80 {
return None;
}
cp = (cp << 6) | (cc as u32 & 0x3F);
}
const LIMITS: [u32; 6] = [0x80, 0x800, 0x10000, 0x200000, 0x4000000, 0x8000_0000];
if count >= 6 || cp < LIMITS[count - 1] {
return None;
}
if strict && (cp > MAX_UNICODE || (0xD800..=0xDFFF).contains(&cp)) {
return None;
}
Some((cp, i + 1 + count))
}
fn encode(out: &mut Vec<u8>, x: u32) {
if x < 0x80 {
out.push(x as u8);
return;
}
let mut cont = [0u8; 6];
let mut n = 0;
let mut mfb: u32 = 0x3f;
let mut x = x;
loop {
cont[n] = 0x80 | (x & 0x3f) as u8;
n += 1;
x >>= 6;
mfb >>= 1;
if x <= mfb {
break;
}
}
out.push(((!mfb << 1) | x) as u8);
out.extend(cont[..n].iter().rev());
}
fn check_strv(vm: &mut Vm, fs: u32, nargs: u32, i: u32, who: &str) -> Result<Vec<u8>, LuaError> {
match vm.nat_arg(fs, nargs, i) {
Value::Str(s) => Ok(s.as_bytes().to_vec()),
v => Err(arg_error(
vm,
i + 1,
who,
&format!("string expected, got {}", v.type_name()),
)),
}
}
fn posrelat(pos: i64, len: usize) -> i64 {
if pos >= 0 {
pos
} else if (-pos) as usize > len {
0
} else {
len as i64 + pos + 1
}
}
fn u_char(vm: &mut Vm, fs: u32, nargs: u32) -> Result<u32, LuaError> {
let mut out = Vec::new();
let cap: i64 = if vm.version() <= crate::version::LuaVersion::Lua53 {
0x10FFFF
} else {
0x7FFF_FFFF
};
for i in 0..nargs {
let c = vm.int_from(vm.nat_arg(fs, nargs, i), "use as a code point")?;
if !(0..=cap).contains(&c) {
return Err(arg_error(vm, i + 1, "char", "value out of range"));
}
encode(&mut out, c as u32);
}
let s = Value::Str(vm.heap.intern(&out));
Ok(vm.nat_return(fs, &[s]))
}
fn u_codepoint(vm: &mut Vm, fs: u32, nargs: u32) -> Result<u32, LuaError> {
let s = check_strv(vm, fs, nargs, 0, "codepoint")?;
let i = if nargs >= 2 {
posrelat(
vm.int_from(vm.nat_arg(fs, nargs, 1), "use as an index")?,
s.len(),
)
} else {
1
};
let j = if nargs >= 3 && !vm.nat_arg(fs, nargs, 2).is_nil() {
posrelat(
vm.int_from(vm.nat_arg(fs, nargs, 2), "use as an index")?,
s.len(),
)
} else {
i
};
let lax = vm.nat_arg(fs, nargs, 3).truthy();
let oob = if vm.version() <= crate::version::LuaVersion::Lua53 {
"out of range"
} else {
"out of bounds"
};
if i < 1 {
return Err(arg_error(vm, 2, "codepoint", oob));
}
if j > s.len() as i64 {
return Err(arg_error(vm, 3, "codepoint", oob));
}
let mut out = Vec::new();
let mut pos = (i - 1) as usize;
while pos < j as usize {
let Some((cp, next)) = decode(&s, pos, !lax) else {
return Err(raise_str(vm, "invalid UTF-8 code"));
};
out.push(Value::Int(cp as i64));
pos = next;
}
Ok(vm.nat_return(fs, &out))
}
fn u_len(vm: &mut Vm, fs: u32, nargs: u32) -> Result<u32, LuaError> {
let s = check_strv(vm, fs, nargs, 0, "len")?;
let i = if nargs >= 2 && !vm.nat_arg(fs, nargs, 1).is_nil() {
posrelat(
vm.int_from(vm.nat_arg(fs, nargs, 1), "use as an index")?,
s.len(),
)
} else {
1
};
let j = if nargs >= 3 && !vm.nat_arg(fs, nargs, 2).is_nil() {
posrelat(
vm.int_from(vm.nat_arg(fs, nargs, 2), "use as an index")?,
s.len(),
)
} else {
-1_i64 + s.len() as i64 + 1
};
let lax = vm.nat_arg(fs, nargs, 3).truthy();
if !(i >= 1 && i - 1 <= s.len() as i64) {
return Err(arg_error(vm, 2, "len", "initial position out of bounds"));
}
if j > s.len() as i64 {
return Err(arg_error(vm, 3, "len", "final position out of bounds"));
}
let mut n: i64 = 0;
let mut pos = (i - 1) as usize;
let end = j as usize;
while pos < end {
match decode(&s, pos, !lax) {
Some((_, next)) => {
n += 1;
pos = next;
}
None => {
return Ok(vm.nat_return(fs, &[Value::Nil, Value::Int(pos as i64 + 1)]));
}
}
}
Ok(vm.nat_return(fs, &[Value::Int(n)]))
}
fn is_cont(b: u8) -> bool {
b & 0xC0 == 0x80
}
fn u_offset(vm: &mut Vm, fs: u32, nargs: u32) -> Result<u32, LuaError> {
let s = check_strv(vm, fs, nargs, 0, "offset")?;
let n = vm.int_from(vm.nat_arg(fs, nargs, 1), "use as an index")?;
let len = s.len();
let default_i = if n >= 0 { 1 } else { len as i64 + 1 };
let i = if nargs >= 3 {
posrelat(
vm.int_from(vm.nat_arg(fs, nargs, 2), "use as an index")?,
len,
)
} else {
default_i
};
if !(1..=len as i64 + 1).contains(&i) {
let suffix = if vm.version() <= crate::version::LuaVersion::Lua53 {
"position out of range"
} else {
"position out of bounds"
};
return Err(arg_error(vm, 3, "offset", suffix));
}
let mut pos = (i - 1) as usize;
let mut n = n;
if n == 0 {
while pos > 0 && is_cont(s[pos]) {
pos -= 1;
}
} else {
if pos < len && is_cont(s[pos]) {
return Err(raise_str(vm, "initial position is a continuation byte"));
}
if n < 0 {
while n < 0 && pos > 0 {
loop {
pos -= 1;
if !(pos > 0 && is_cont(s[pos])) {
break;
}
}
n += 1;
}
} else {
n -= 1; while n > 0 && pos < len {
loop {
pos += 1;
if !(pos < len && is_cont(s[pos])) {
break;
}
}
n -= 1;
}
}
}
if n != 0 {
return Ok(vm.nat_return(fs, &[Value::Nil]));
}
let start = pos as i64 + 1;
if pos < len && s[pos] & 0x80 != 0 {
if is_cont(s[pos]) {
return Err(raise_str(vm, "initial position is a continuation byte"));
}
while pos + 1 < len && is_cont(s[pos + 1]) {
pos += 1;
}
}
Ok(vm.nat_return(fs, &[Value::Int(start), Value::Int(pos as i64 + 1)]))
}
fn codes_iter(vm: &mut Vm, fs: u32, nargs: u32) -> Result<u32, LuaError> {
let Value::Str(s) = vm.nat_arg(fs, nargs, 0) else {
return Err(raise_str(vm, "bad iterator state for 'codes'"));
};
let prev = match vm.nat_arg(fs, nargs, 1) {
Value::Int(i) => i,
_ => 0,
};
let lax = vm.nat_upval(fs, 0).truthy();
let bytes = s.as_bytes().to_vec();
if prev < 0 || prev as usize > bytes.len() {
return Ok(vm.nat_return(fs, &[Value::Nil]));
}
let mut pos = prev as usize;
if pos > 0 {
let Some((_, next)) = decode(&bytes, pos - 1, !lax) else {
return Err(raise_str(vm, "invalid UTF-8 code"));
};
pos = next;
}
if pos >= bytes.len() {
return Ok(vm.nat_return(fs, &[Value::Nil]));
}
let Some((cp, _)) = decode(&bytes, pos, !lax) else {
return Err(raise_str(vm, "invalid UTF-8 code"));
};
Ok(vm.nat_return(fs, &[Value::Int(pos as i64 + 1), Value::Int(cp as i64)]))
}
fn u_codes(vm: &mut Vm, fs: u32, nargs: u32) -> Result<u32, LuaError> {
let s = vm.nat_arg(fs, nargs, 0);
let Value::Str(_) = s else {
return Err(arg_error(
vm,
1,
"codes",
&format!("string expected, got {}", s.type_name()),
));
};
let lax = Value::Bool(vm.nat_arg(fs, nargs, 1).truthy());
let it = vm.native_with(codes_iter, Box::new([lax]));
Ok(vm.nat_return(fs, &[it, s, Value::Int(0)]))
}