-- pasta_lua 0.2.3

local unicode = require "luacheck.unicode"
local utils = require "luacheck.utils"

-- Table holding this module's public functions.
local decoder = {}

-- Cache the string library functions used below in locals.
local sbyte, sfind, sgsub, ssub = string.byte, string.find, string.gsub, string.sub

-- `LatinChars` and `UnicodeChars` objects represent source strings
-- and provide Unicode-aware access to them with a common interface.
-- Source bytes should not be accessed directly.
-- Provided methods are:
-- `Chars:get_codepoint(index)`: returns codepoint at given index as integer or nil if index is out of range.
-- `Chars:get_substring(from, to)`: returns substring of original bytes corresponding to characters from `from` to `to`.
-- `Chars:get_printable_substring(from, to)`: like get_substring but escapes non-printable characters.
-- `Chars:get_length()`: returns total number of characters.
-- `Chars:find(pattern, from)`: `string.find` but `from` is in characters. Return values are still in bytes.

-- `LatinChars` is an optimized special case for latin1 strings.
-- Every byte of the source counts as exactly one character, so character
-- indexes and byte indexes are the same thing for this class.
local LatinChars = utils.class()

-- Initializer: stores the source string `bytes` unchanged in `self._bytes`.
-- NOTE(review): presumably run by the `utils.class()` machinery when
-- `LatinChars(bytes)` is called - confirm against luacheck.utils.
function LatinChars:__init(bytes)
   self._bytes = bytes
end

-- Returns the numeric value of the byte at position `index`.
-- For positions outside the string `string.byte` yields no value,
-- which callers observe as nil.
function LatinChars:get_codepoint(index)
   local source = self._bytes
   return sbyte(source, index)
end

-- Returns the raw bytes for characters `from` through `to` (inclusive).
-- Characters are single bytes here, so this is a plain `string.sub`.
function LatinChars:get_substring(from, to)
   local source = self._bytes
   return ssub(source, from, to)
end

-- Renders the first byte of the string `byte` as a `\xXX` escape
-- with two uppercase hexadecimal digits.
local function hexadecimal_escaper(byte)
   return string.format("\\x%02X", sbyte(byte))
end

-- Like `get_substring`, but every byte outside the printable ASCII range
-- 32..126 is replaced with its `\xXX` escape.
function LatinChars:get_printable_substring(from, to)
   local raw = ssub(self._bytes, from, to)
   -- Keep only the first result of gsub (the string), dropping the match count.
   local escaped = sgsub(raw, "[^\32-\126]", hexadecimal_escaper)
   return escaped
end

-- Returns the number of characters, which for latin1 equals the byte count.
function LatinChars:get_length()
   local source = self._bytes
   return #source
end

-- `string.find` over the source, starting at character `from`.
-- All results of `string.find` are passed through unchanged.
function LatinChars:find(pattern, from)
   local source = self._bytes
   return sfind(source, pattern, from)
end

-- Decodes `bytes` as UTF8. Returns arrays of codepoints as integers and their byte offsets.
-- Byte offsets have one extra item pointing to one byte past the end of `bytes`.
-- On decoding error returns nothing. Decoding errors are:
--  * a continuation byte (0x80..0xBF) where a lead byte is expected;
--  * a lead byte 0xF8 or above (sequences longer than four bytes);
--  * a missing or malformed continuation byte, including truncated input;
--  * an overlong encoding (a codepoint stored in more bytes than necessary);
--  * an encoded UTF-16 surrogate (U+D800..U+DFFF);
--  * a codepoint above U+10FFFF.
local function get_codepoints_and_byte_offsets(bytes)
   local codepoints = {}
   local byte_offsets = {}

   local byte_index = 1
   local codepoint_index = 1

   while true do
      -- Recorded before the end-of-input check so that the final entry
      -- points one byte past the end of `bytes`.
      byte_offsets[codepoint_index] = byte_index

      local lead = sbyte(bytes, byte_index)

      if not lead then
         -- End of input: everything decoded cleanly.
         return codepoints, byte_offsets
      end

      byte_index = byte_index + 1

      local codepoint = lead

      if lead >= 0x80 then
         -- Not ASCII. The lead byte determines how many continuation bytes follow,
         -- the payload bits it contributes, and the smallest codepoint that
         -- legitimately needs a sequence of that length.
         local continuation_count, min_codepoint

         if lead < 0xC0 then
            -- Continuation byte without a lead byte.
            return
         elseif lead < 0xE0 then
            -- Two bytes.
            continuation_count, min_codepoint, codepoint = 1, 0x80, lead - 0xC0
         elseif lead < 0xF0 then
            -- Three bytes.
            continuation_count, min_codepoint, codepoint = 2, 0x800, lead - 0xE0
         elseif lead < 0xF8 then
            -- Four bytes.
            continuation_count, min_codepoint, codepoint = 3, 0x10000, lead - 0xF0
         else
            return
         end

         for _ = 1, continuation_count do
            -- A missing byte is treated as 0, which fails the range check below.
            local cont = (sbyte(bytes, byte_index) or 0) - 0x80

            if cont < 0 or cont >= 0x40 then
               return
            end

            byte_index = byte_index + 1
            codepoint = codepoint * 0x40 + cont
         end

         if codepoint < min_codepoint then
            -- Overlong encoding, e.g. "\192\128" for U+0000.
            return
         end

         if codepoint > 0x10FFFF then
            return
         end

         if codepoint >= 0xD800 and codepoint <= 0xDFFF then
            -- Surrogates are not valid in UTF8.
            return
         end
      end

      codepoints[codepoint_index] = codepoint
      codepoint_index = codepoint_index + 1
   end
end

-- `UnicodeChars` is the general case for non-latin1 strings.
-- Assumes UTF8, on decoding error falls back to latin1.
-- (The fallback itself is performed by `decoder.decode`, which only constructs
-- a `UnicodeChars` when decoding succeeded.)
local UnicodeChars = utils.class()

-- Initializer: stores the decoded representation of a source string.
-- `bytes` is the original source string.
-- `codepoints` is the array of decoded codepoints, one per character.
-- `byte_offsets` is the array mapping a character index to the byte index where
-- that character starts; it has one extra trailing entry pointing one byte past
-- the end of `bytes`.
-- NOTE(review): presumably run by the `utils.class()` machinery when
-- `UnicodeChars(...)` is called - confirm against luacheck.utils.
function UnicodeChars:__init(bytes, codepoints, byte_offsets)
   self._bytes = bytes
   self._codepoints = codepoints
   self._byte_offsets = byte_offsets
end

-- Returns the codepoint of the character at `index`,
-- or nil when `index` is outside the decoded range.
function UnicodeChars:get_codepoint(index)
   local codepoints = self._codepoints
   return codepoints[index]
end

-- Returns the original bytes covering characters `from` through `to` (inclusive).
function UnicodeChars:get_substring(from, to)
   local offsets = self._byte_offsets
   local first_byte = offsets[from]
   -- The last byte of character `to` sits right before the first byte of `to + 1`.
   local last_byte = offsets[to + 1] - 1
   return ssub(self._bytes, first_byte, last_byte)
end

-- Like `get_substring`, but characters for which `unicode.is_printable` is false
-- are replaced with an escape: `\u{X}` for codepoints above 255, `\xXX` otherwise.
function UnicodeChars:get_printable_substring(from, to)
   -- Speed does not matter here: this runs only when reporting a syntax error.
   local codepoints = self._codepoints
   local pieces = {}

   for position = from, to do
      local codepoint = codepoints[position]
      local piece

      if unicode.is_printable(codepoint) then
         -- Printable characters are copied through as their original bytes.
         piece = self:get_substring(position, position)
      elseif codepoint > 255 then
         piece = ("\\u{%X}"):format(codepoint)
      else
         piece = ("\\x%02X"):format(codepoint)
      end

      pieces[#pieces + 1] = piece
   end

   return table.concat(pieces)
end

-- Returns the number of decoded characters (codepoints), not bytes.
function UnicodeChars:get_length()
   local codepoints = self._codepoints
   return #codepoints
end

-- `string.find` over the source bytes, starting at character `from`.
-- The character index is translated to a byte index first; the results of
-- `string.find` are passed through unchanged, so they are byte positions.
function UnicodeChars:find(pattern, from)
   local start_byte = self._byte_offsets[from]
   return sfind(self._bytes, pattern, start_byte)
end

-- Wraps the source string `bytes` in a character-access object.
-- Returns a `UnicodeChars` when `bytes` contains non-ASCII bytes and decodes
-- as UTF8, and a `LatinChars` in every other case.
function decoder.decode(bytes)
   -- Pure ASCII needs no decoding. LatinChars isn't much faster than
   -- UnicodeChars but is noticeably more memory efficient.
   local has_high_byte = sfind(bytes, "[\128-\255]")

   if not has_high_byte then
      return LatinChars(bytes)
   end

   local codepoints, byte_offsets = get_codepoints_and_byte_offsets(bytes)

   if not codepoints then
      -- Decoding as UTF8 failed: treat the source as latin1.
      return LatinChars(bytes)
   end

   return UnicodeChars(bytes, codepoints, byte_offsets)
end

return decoder