print "testing UTF-8 library"
local function checkerror (msg, f, ...)
local s, err = pcall(f, ...)
assert(not s and string.find(err, msg))
end
local function len (s)
return #string.gsub(s, "[\x80-\xBF]", "")
end
local justone = "^" .. utf8.charpattern .. "$"
assert(not utf8.offset("alo", 5))
assert(not utf8.offset("alo", -4))
local function check (s, t, nonstrict)
local l = utf8.len(s, 1, -1, nonstrict)
assert(#t == l and len(s) == l)
assert(utf8.char(table.unpack(t)) == s)
assert(utf8.offset(s, 0) == 1)
local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
assert(#t == #t1)
for i = 1, #t do assert(t[i] == t1[i]) end
for i = 1, l do local pi = utf8.offset(s, i) local pi1 = utf8.offset(s, 2, pi) assert(string.find(string.sub(s, pi, pi1 - 1), justone))
assert(utf8.offset(s, -1, pi1) == pi)
assert(utf8.offset(s, i - l - 1) == pi)
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
for j = pi, pi1 - 1 do
assert(utf8.offset(s, 0, j) == pi)
end
for j = pi + 1, pi1 - 1 do
assert(not utf8.len(s, j))
end
assert(utf8.len(s, pi, pi, nonstrict) == 1)
assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
assert(utf8.len(s, 1, pi, nonstrict) == i)
end
local i = 0
for p, c in utf8.codes(s, nonstrict) do
i = i + 1
assert(c == t[i] and p == utf8.offset(s, i))
assert(utf8.codepoint(s, p, p, nonstrict) == c)
end
assert(i == #t)
i = 0
for c in string.gmatch(s, utf8.charpattern) do
i = i + 1
assert(c == utf8.char(t[i]))
end
assert(i == #t)
for i = 1, l do
assert(utf8.offset(s, i) == utf8.offset(s, i - l - 1, #s + 1))
end
end
do local function check (s, p)
local a, b = utf8.len(s)
assert(not a and b == p)
end
check("abc\xE3def", 4)
check("汉字\x80", #("汉字") + 1)
check("\xF4\x9F\xBF", 1)
check("\xF4\x9F\xBF\xBF", 1)
end
do
local function errorcodes (s)
checkerror("invalid UTF%-8 code",
function ()
for c in utf8.codes(s) do assert(c) end
end)
end
errorcodes("ab\xff")
end
checkerror("position out of range", utf8.offset, "abc", 1, 5)
checkerror("position out of range", utf8.offset, "abc", 1, -4)
checkerror("position out of range", utf8.offset, "", 1, 2)
checkerror("position out of range", utf8.offset, "", 1, -1)
checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
checkerror("continuation byte", utf8.offset, "\x80", 1)
checkerror("out of string", utf8.len, "abc", 0, 2)
checkerror("out of string", utf8.len, "abc", 1, 4)
local s = "hello World"
local t = {string.byte(s, 1, -1)}
for i = 1, utf8.len(s) do assert(t[i] == string.byte(s, i)) end
check(s, t)
check("汉字/漢字", {27721, 23383, 47, 28450, 23383,})
do
local s = "áéí\128"
local t = {utf8.codepoint(s,1,#s - 1)}
assert(#t == 3 and t[1] == 225 and t[2] == 233 and t[3] == 237)
checkerror("invalid UTF%-8 code", utf8.codepoint, s, 1, #s)
checkerror("out of range", utf8.codepoint, s, #s + 1)
t = {utf8.codepoint(s, 4, 3)}
assert(#t == 0)
checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
end
assert(utf8.char() == "")
assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
checkerror("value out of range", utf8.char, -1)
local function invalid (s)
checkerror("invalid UTF%-8 code", utf8.codepoint, s)
assert(not utf8.len(s))
end
invalid("\xF4\x9F\xBF\xBF")
invalid("\xC0\x80") invalid("\xC1\xBF") invalid("\xE0\x9F\xBF") invalid("\xF0\x8F\xBF\xBF")
invalid("\x80") invalid("\xBF") invalid("\xFE") invalid("\xFF")
check("", {})
s = "\0 \x7F\z
\xC2\x80 \xDF\xBF\z
\xE0\xA0\x80 \xEF\xBF\xBF\z
\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF"
s = string.gsub(s, " ", "")
check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
x = "日本語a-4\0éó"
check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})
check("𣲷𠜎𠱓𡁻𠵼ab𠺢",
{0x23CB7, 0x2070E, 0x20C53, 0x2107B, 0x20D7C, 0x61, 0x62, 0x20EA2,})
check("𨳊𩶘𦧺𨳒𥄫𤓓\xF4\x8F\xBF\xBF",
{0x28CCA, 0x29D98, 0x269FA, 0x28CD2, 0x2512B, 0x244D3, 0x10ffff})
local i = 0
for p, c in string.gmatch(x, "()(" .. utf8.charpattern .. ")") do
i = i + 1
assert(utf8.offset(x, i) == p)
assert(utf8.len(x, p) == utf8.len(x) - i + 1)
assert(utf8.len(c) == 1)
for j = 1, #c - 1 do
assert(utf8.offset(x, 0, p + j - 1) == p)
end
end
return 'OK'