#include "arf.h"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// UTF-8 encoding of U+FEFF (byte order mark). arf_has_arf_magic() tests for
// these bytes as a prefix, and arf_c_str_arf() writes them first.
static const uint8_t utf8_bom[] = { 0xef, 0xbb, 0xbf };
// UTF-8 encoding of U+FFFD (replacement character). arf_c_str_arf() writes
// this in place of each invalid input byte in the first (lossy) portion.
static const uint8_t utf8_replacement[] = { 0xef, 0xbf, 0xbd };
// Number of bytes one escaped byte occupies in the second portion: a NUL
// followed by the original byte with its high bit cleared (see
// arf_c_str_arf()).
static const size_t sizeof_escaped_byte = 2;
// Scan `len` bytes starting at `ptr` and return a pointer to the first byte
// that does not begin an accepted UTF-8 sequence, or `ptr + len` if every
// byte is accepted.
//
// Rejected in addition to structurally malformed or truncated sequences:
// overlong encodings, surrogates (U+D800..U+DFFF), U+FFFE and U+FFFF, and
// anything above U+10FFFF. NUL bytes are accepted like any other ASCII byte.
static const uint8_t *find_invalid_utf8(const uint8_t *ptr, size_t len) {
    const uint8_t *const end = ptr + len;

    for (; ptr != end; ) {
        const uint8_t lead = ptr[0];
        const size_t avail = (size_t)(end - ptr);
        size_t width;

        if (lead < 0x80) {
            // ASCII.
            width = 1;
        } else if ((lead & 0xe0) == 0xc0) {
            // Two-byte sequence; 0xc0 and 0xc1 would be overlong.
            if (avail < 2 || lead == 0xc0 || lead == 0xc1) {
                return ptr;
            }
            width = 2;
        } else if ((lead & 0xf0) == 0xe0) {
            // Three-byte sequence.
            if (avail < 3) {
                return ptr;
            }
            const uint8_t second = ptr[1];
            const uint8_t third = ptr[2];
            // Overlong: 0xe0 followed by 0x80..0x9f.
            if (lead == 0xe0 && second >= 0x80 && second <= 0x9f) {
                return ptr;
            }
            // Surrogates: 0xed followed by 0xa0..0xbf.
            if (lead == 0xed && second >= 0xa0 && second <= 0xbf) {
                return ptr;
            }
            // U+FFFE and U+FFFF.
            if (lead == 0xef && second == 0xbf &&
                (third == 0xbe || third == 0xbf)) {
                return ptr;
            }
            width = 3;
        } else if ((lead & 0xf8) == 0xf0) {
            // Four-byte sequence.
            if (avail < 4 || lead > 0xf4) {
                return ptr;
            }
            const uint8_t second = ptr[1];
            // Overlong: 0xf0 followed by 0x80..0x8f.
            if (lead == 0xf0 && second >= 0x80 && second <= 0x8f) {
                return ptr;
            }
            // Beyond U+10FFFF: 0xf4 followed by anything above 0x8f.
            if (lead == 0xf4 && second > 0x8f) {
                return ptr;
            }
            width = 4;
        } else {
            // Stray continuation byte or a byte that never appears in UTF-8.
            return ptr;
        }

        // Every trailing byte must be a continuation byte (10xxxxxx).
        for (size_t k = 1; k < width; k++) {
            if ((ptr[k] & 0xc0) != 0x80) {
                return ptr;
            }
        }
        ptr += width;
    }
    return ptr;
}
// Return true if the NUL-terminated string `c_str` is accepted in its
// entirety by find_invalid_utf8(), i.e. it contains no invalid byte before
// its terminator.
bool arf_is_valid_c_str(const char *c_str) {
    const uint8_t *const begin = (const uint8_t *)c_str;
    const uint8_t *const end = begin + strlen(c_str);

    const uint8_t *const first_bad =
        find_invalid_utf8(begin, (size_t)(end - begin));
    return first_bad == end;
}
// Return true if the `len` bytes at `ptr` begin with the UTF-8 byte order
// mark that prefixes every ARF string.
bool arf_has_arf_magic(const uint8_t *ptr, size_t len) {
    // Too short to hold the prefix at all; don't read from `ptr`.
    if (len < sizeof(utf8_bom)) {
        return false;
    }
    return memcmp(ptr, utf8_bom, sizeof(utf8_bom)) == 0;
}
// Return true if the `len` bytes at `ptr` form a well-formed ARF string.
//
// A well-formed ARF string consists of, in order:
//   1. the UTF-8 byte order mark;
//   2. a "lossy" portion containing no NUL bytes;
//   3. a single NUL separator;
//   4. an "escaped" portion that mirrors the lossy portion byte for byte,
//      except that wherever the escaped portion holds a NUL followed by a
//      byte below 0x80, the lossy portion holds a U+FFFD replacement
//      character.
// Everything after the byte order mark must be accepted by
// find_invalid_utf8(), at least one escaped byte must be present, and the
// escaped portion must end exactly at `ptr + len`.
//
// The buffer is not assumed to be NUL-terminated: no byte at or beyond
// `ptr + len` is read.
bool arf_is_valid_arf(const uint8_t *ptr, size_t len) {
    if (!arf_has_arf_magic(ptr, len)) {
        return false;
    }
    // Safe: the magic check guarantees len >= sizeof(utf8_bom).
    ptr += sizeof(utf8_bom);
    len -= sizeof(utf8_bom);

    if (find_invalid_utf8(ptr, len) != ptr + len) {
        return false;
    }

    // Locate the separator with a bounded search; strlen() would run off the
    // end of the buffer when no NUL is present.
    const uint8_t *separator = memchr(ptr, '\0', len);
    if (separator == NULL) {
        return false;
    }
    const size_t first_len = (size_t)(separator - ptr);

    bool any_invalid_bytes = false;
    size_t i = 0;              // cursor in the lossy portion
    size_t j = first_len + 1;  // cursor in the escaped portion
    while (i != first_len) {
        // The escaped portion ended before the lossy portion did.
        if (j >= len) {
            return false;
        }
        if (ptr[j] == 0) {
            // An escape is a NUL plus one byte with the high bit clear.
            if (len - j < sizeof_escaped_byte || ptr[j + 1] > INT8_MAX) {
                return false;
            }
            // It must line up with a replacement character on the lossy side.
            if (first_len - i < sizeof(utf8_replacement) ||
                memcmp(ptr + i, utf8_replacement,
                       sizeof(utf8_replacement)) != 0)
            {
                return false;
            }
            i += sizeof(utf8_replacement);
            j += sizeof_escaped_byte;
            any_invalid_bytes = true;
        } else {
            // Ordinary bytes must be identical in both portions.
            if (ptr[i] != ptr[j]) {
                return false;
            }
            i += 1;
            j += 1;
        }
    }

    // Reject trailing bytes in the escaped portion: they have no counterpart
    // in the lossy portion, and a dangling NUL there would make the decoders
    // (arf_sizeof_arf_c_str(), arf_arf_c_str()) step past the end.
    if (j != len) {
        return false;
    }

    // A string with nothing escaped has no reason to be an ARF string.
    return any_invalid_bytes;
}
// Compute how many bytes arf_c_str_arf() writes for the first `c_str_len`
// bytes of `c_str`: the byte order mark, the lossy portion, the NUL
// separator, and the escaped portion. No trailing NUL is counted.
//
// Returns SIZE_MAX if the total does not fit in a size_t.
static size_t arf_sizeof_c_str_arf_impl(const char *c_str, size_t c_str_len) {
    const uint8_t *const bytes = (const uint8_t *)c_str;
    size_t len = sizeof(utf8_bom) + 1;  // byte order mark + separator

    size_t i = 0;
    while (i != c_str_len) {
        const uint8_t *found = find_invalid_utf8(bytes + i, c_str_len - i);
        const size_t valid_len = (size_t)(found - (bytes + i));

        // A valid run is copied verbatim into both portions. Add it twice
        // with overflow checking rather than doubling it first, so that the
        // doubling itself cannot wrap unnoticed.
        if (__builtin_add_overflow(len, valid_len, &len) ||
            __builtin_add_overflow(len, valid_len, &len)) {
            return SIZE_MAX;
        }
        i += valid_len;
        if (i == c_str_len) {
            break;
        }

        // One invalid byte: a replacement character in the lossy portion
        // plus one escaped byte in the escaped portion.
        const size_t more = sizeof(utf8_replacement) + sizeof_escaped_byte;
        if (__builtin_add_overflow(len, more, &len)) {
            return SIZE_MAX;
        }
        i += 1;
    }
    return len;
}
// Classify the NUL-terminated string `c_str` and report a size through `len`.
//
// If every byte is accepted by find_invalid_utf8(), stores strlen(c_str) in
// `*len` and returns true. Otherwise stores the size of the ARF encoding
// (as computed by arf_sizeof_c_str_arf_impl()) in `*len` and returns false.
bool arf_categorize_c_str(const char *c_str, size_t *restrict len) {
    const size_t byte_count = strlen(c_str);
    const uint8_t *const bytes = (const uint8_t *)c_str;
    const bool needs_arf =
        find_invalid_utf8(bytes, byte_count) != bytes + byte_count;

    // Fully valid input is the expected case.
    if (__builtin_expect(needs_arf, false)) {
        *len = arf_sizeof_c_str_arf_impl(c_str, byte_count);
        return false;
    }

    *len = byte_count;
    return true;
}
// Return the number of bytes arf_c_str_arf() writes for the NUL-terminated
// string `c_str`, as computed by arf_sizeof_c_str_arf_impl().
size_t arf_sizeof_c_str_arf(const char *c_str) {
    const size_t byte_count = strlen(c_str);
    return arf_sizeof_c_str_arf_impl(c_str, byte_count);
}
// Encode the NUL-terminated string `c_str` as an ARF string at `ptr`.
//
// Output layout: the UTF-8 byte order mark; a lossy copy of the input in
// which each invalid byte is replaced by U+FFFD; a NUL separator; and an
// escaped copy in which each invalid byte is written as a NUL followed by
// the byte with its high bit cleared. No trailing NUL is written.
//
// `ptr` must have room for the number of bytes reported by
// arf_sizeof_c_str_arf() for the same input.
void arf_c_str_arf(const char *c_str, uint8_t *ptr) {
    const uint8_t *const src_begin = (const uint8_t *)c_str;
    const uint8_t *const src_end = src_begin + strlen(c_str);
    uint8_t *out = ptr;

    memcpy(out, utf8_bom, sizeof(utf8_bom));
    out += sizeof(utf8_bom);

    // First pass: lossy portion.
    const uint8_t *cursor = src_begin;
    while (cursor != src_end) {
        const uint8_t *bad =
            find_invalid_utf8(cursor, (size_t)(src_end - cursor));
        const size_t run = (size_t)(bad - cursor);
        memcpy(out, cursor, run);
        out += run;
        cursor = bad;
        if (cursor == src_end) {
            break;
        }
        // Stand-in for the single invalid byte at `cursor`.
        memcpy(out, utf8_replacement, sizeof(utf8_replacement));
        out += sizeof(utf8_replacement);
        cursor += 1;
    }

    // Separator between the two portions.
    *out++ = '\0';

    // Second pass: escaped portion.
    cursor = src_begin;
    while (cursor != src_end) {
        const uint8_t *bad =
            find_invalid_utf8(cursor, (size_t)(src_end - cursor));
        const size_t run = (size_t)(bad - cursor);
        memcpy(out, cursor, run);
        out += run;
        cursor = bad;
        if (cursor == src_end) {
            break;
        }
        // Escape: NUL marker, then the low seven bits of the invalid byte.
        *out++ = '\0';
        *out++ = (uint8_t)(*cursor & INT8_MAX);
        cursor += 1;
    }
}
// Return the number of bytes arf_arf_c_str() writes when decoding the ARF
// string of `len` bytes at `ptr`, including the terminating NUL.
//
// The input must satisfy arf_is_valid_arf(); this is only checked by
// assert().
size_t arf_sizeof_arf_c_str(const uint8_t *ptr, size_t len) {
    assert(arf_is_valid_arf(ptr, len));

    const uint8_t *const end = ptr + len;

    // Skip the byte order mark, the lossy portion, and the separator.
    const uint8_t *cursor = ptr + sizeof(utf8_bom);
    cursor += strlen((const char *)cursor) + 1;

    // Start at one to account for the terminating NUL of the result.
    size_t c_str_len = 1;
    for (; cursor != end; c_str_len++) {
        const bool is_escape = (*cursor == '\0');
        cursor++;
        if (is_escape) {
            // An escape occupies one extra byte but decodes to one byte.
            cursor++;
        }
    }
    return c_str_len;
}
// Decode the ARF string of `len` bytes at `ptr` into `c_str` as a
// NUL-terminated string, reversing arf_c_str_arf(): each escaped byte
// (a NUL followed by a byte) becomes that byte with its high bit set, and
// every other byte of the escaped portion is copied unchanged.
//
// The input must satisfy arf_is_valid_arf(); this is only checked by
// assert(). `c_str` must have room for arf_sizeof_arf_c_str(ptr, len) bytes.
void arf_arf_c_str(const uint8_t *ptr, size_t len, char *__restrict__ c_str) {
    assert(arf_is_valid_arf(ptr, len));

    const uint8_t *const end = ptr + len;

    // Skip the byte order mark, the lossy portion, and the separator.
    const uint8_t *cursor = ptr + sizeof(utf8_bom);
    cursor += strlen((const char *)cursor) + 1;

    size_t written = 0;
    while (cursor != end) {
        uint8_t byte = *cursor++;
        if (byte == '\0') {
            // Escape marker: restore the high bit of the following byte.
            byte = (uint8_t)(*cursor++ | (uint8_t)INT8_MIN);
        }
        c_str[written++] = (char)byte;
    }
    c_str[written] = '\0';
}