arf-strings 0.4.0

Encoding and decoding for ARF strings
Documentation
//! ARF library for converting to and from ARFs.

#include "arf.h"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

// UTF-8 encoding of U+FEFF (the byte-order mark), which marks the beginning
// of an ARF string.
static const uint8_t utf8_bom[] = { 0xef, 0xbb, 0xbf };

// UTF-8 encoding of U+FFFD (the replacement character), which is used in the
// lossy portion of an ARF string to stand in for each invalid byte.
static const uint8_t utf8_replacement[] = { 0xef, 0xbf, 0xbd };

// The number of bytes in a byte escape sequence: a NUL byte followed by the
// escaped byte with its high bit cleared.
static const size_t sizeof_escaped_byte = 2;

// Scan the `len` bytes starting at `ptr` and return the address of the first
// byte that does not begin a well-formed UTF-8 sequence. If every byte belongs
// to a well-formed sequence, the result is `ptr + len`.
//
// Besides structurally malformed input (stray continuation bytes, truncated
// sequences, lead bytes 0xf8 and above), this rejects overlong encodings,
// UTF-16 surrogates (U+D800..U+DFFF), U+FFFE and U+FFFF, and anything above
// U+10FFFF.
static const uint8_t *find_invalid_utf8(const uint8_t *ptr, size_t len) {
    const uint8_t *const limit = ptr + len;

    while (ptr != limit) {
        const uint8_t lead = ptr[0];
        const size_t remaining = (size_t)(limit - ptr);
        size_t width;

        if (lead <= 0x7f) {
            // ASCII.
            width = 1;
        } else if (lead >= 0xc0 && lead <= 0xdf) {
            // Two-byte sequence.
            if (remaining < 2) {
                return ptr;
            }
            // 0xc0 and 0xc1 can only introduce overlong encodings.
            if (lead == 0xc0 || lead == 0xc1) {
                return ptr;
            }
            if ((ptr[1] & 0xc0) != 0x80) {
                return ptr;
            }
            width = 2;
        } else if (lead >= 0xe0 && lead <= 0xef) {
            // Three-byte sequence.
            if (remaining < 3) {
                return ptr;
            }
            const uint8_t b1 = ptr[1];
            const uint8_t b2 = ptr[2];

            // Overlong encoding of a code point below U+0800.
            if (lead == 0xe0 && b1 >= 0x80 && b1 <= 0x9f) {
                return ptr;
            }
            // UTF-16 surrogate range U+D800..U+DFFF.
            if (lead == 0xed && b1 >= 0xa0 && b1 <= 0xbf) {
                return ptr;
            }
            // U+FFFE and U+FFFF.
            if (lead == 0xef && b1 == 0xbf && (b2 == 0xbe || b2 == 0xbf)) {
                return ptr;
            }
            if ((b1 & 0xc0) != 0x80 || (b2 & 0xc0) != 0x80) {
                return ptr;
            }
            width = 3;
        } else if (lead >= 0xf0 && lead <= 0xf7) {
            // Four-byte sequence.
            if (remaining < 4) {
                return ptr;
            }
            const uint8_t b1 = ptr[1];
            const uint8_t b2 = ptr[2];
            const uint8_t b3 = ptr[3];

            // Lead bytes above 0xf4 always encode values past U+10FFFF.
            if (lead > 0xf4) {
                return ptr;
            }
            // Overlong encoding of a code point below U+10000.
            if (lead == 0xf0 && b1 >= 0x80 && b1 <= 0x8f) {
                return ptr;
            }
            // Values past U+10FFFF.
            if (lead == 0xf4 && b1 > 0x8f) {
                return ptr;
            }
            if ((b1 & 0xc0) != 0x80 ||
                (b2 & 0xc0) != 0x80 ||
                (b3 & 0xc0) != 0x80)
            {
                return ptr;
            }
            width = 4;
        } else {
            // A stray continuation byte, or a lead byte of 0xf8 and above.
            return ptr;
        }

        ptr += width;
    }

    return ptr;
}

// Report whether the NUL-terminated string `c_str` consists entirely of valid
// UTF-8, as judged by `find_invalid_utf8`.
bool arf_is_valid_c_str(const char *c_str) {
    const uint8_t *const begin = (const uint8_t *)c_str;
    const uint8_t *const end = begin + strlen(c_str);

    // The scan stops at the first bad byte; reaching `end` means there was
    // none.
    const uint8_t *const stop =
        find_invalid_utf8(begin, (size_t)(end - begin));
    return stop == end;
}

// Report whether the `len` bytes at `ptr` start with the UTF-8 BOM that
// introduces every ARF string.
bool arf_has_arf_magic(const uint8_t *ptr, size_t len) {
    // Too short to hold the BOM at all.
    if (len < sizeof(utf8_bom)) {
        return false;
    }

    return memcmp(utf8_bom, ptr, sizeof(utf8_bom)) == 0;
}

// Check whether the `len` bytes at `ptr` form a well-formed ARF string, laid
// out as:
//
//     BOM, lossy portion, NUL, NUL-escaped portion
//
// Returns true only when all of the following hold:
//  - the buffer starts with the UTF-8 BOM;
//  - everything after the BOM is valid UTF-8;
//  - a NUL separator is present within the buffer;
//  - walking the two portions in lockstep, every byte escape in the
//    NUL-escaped portion lines up with a U+FFFD in the lossy portion and
//    every other byte is identical in both;
//  - the NUL-escaped portion ends exactly where the lossy portion does;
//  - at least one byte escape is present.
//
// The buffer is treated as length-delimited: nothing outside
// `ptr[0 .. len)` is ever read.
bool arf_is_valid_arf(const uint8_t *ptr, size_t len) {
    // ARF strings begin with fixed bytes.
    if (!arf_has_arf_magic(ptr, len)) {
        return false;
    }
    ptr += sizeof(utf8_bom);
    len -= sizeof(utf8_bom);

    // ARF strings are valid UTF-8.
    if (find_invalid_utf8(ptr, len) != ptr + len) {
        return false;
    }

    // ARF strings contain a NUL byte separating the lossy portion from the
    // NUL-escaped portion. The input is not guaranteed to be NUL-terminated,
    // so search with a bounded memchr rather than strlen.
    const uint8_t *separator = memchr(ptr, '\0', len);
    if (separator == NULL) {
        return false;
    }
    size_t first_len = (size_t)(separator - ptr);

    // Check that the lossy portion translates to the NUL-escaped portion.
    bool any_invalid_bytes = false;
    size_t i = 0;
    size_t j = first_len + 1;
    while (i != first_len) {
        // The NUL-escaped portion must not run out before the lossy portion.
        if (j >= len) {
            return false;
        }

        if (ptr[j] == 0) {
            // Check the NUL-escaped encoding: the byte after the NUL must be
            // present and must have its high bit clear.
            if (len - j < sizeof_escaped_byte || ptr[j + 1] > INT8_MAX) {
                return false;
            }

            // Check that the lossy portion contains a replacement character
            // at the corresponding position.
            if (first_len - i < sizeof(utf8_replacement) ||
                memcmp(ptr + i, utf8_replacement,
                       sizeof(utf8_replacement)) != 0)
            {
                return false;
            }

            i += sizeof(utf8_replacement);
            j += sizeof_escaped_byte;
            any_invalid_bytes = true;
        } else {
            // Check that the bytes match.
            if (ptr[i] != ptr[j]) {
                return false;
            }

            i += 1;
            j += 1;
        }
    }

    // The NUL-escaped portion must not continue past the point where the
    // lossy portion ended; trailing bytes have no counterpart.
    if (j != len) {
        return false;
    }

    // If there weren't any invalid bytes, we shouldn't have an ARF string.
    if (!any_invalid_bytes) {
        return false;
    }

    // Arf!
    return true;
}

/// Like `arf_sizeof_c_str_arf`, but has `strlen(c_str)` passed in so that it
/// doesn't need to be recomputed.
///
/// Returns the number of bytes in the ARF encoding of `c_str`: the BOM, the
/// lossy portion, the NUL separator, and the NUL-escaped portion. Returns
/// `SIZE_MAX` if that total does not fit in a `size_t`.
static size_t arf_sizeof_c_str_arf_impl(const char *c_str, size_t c_str_len) {
    // Start with the length of the fixed-length parts of an ARF string.
    size_t len = sizeof(utf8_bom) + 1;

    // Add the size of both the lossy portion and the NUL-escaped portion.
    for (size_t i = 0; i != c_str_len; ) {
        const uint8_t *start = (const uint8_t *)c_str + i;
        const uint8_t *found = find_invalid_utf8(start, c_str_len - i);

        // Valid UTF-8 bytes are copied verbatim into each of the two
        // portions. Add the run length twice with overflow checking instead
        // of doubling it first, so the doubling itself cannot wrap around.
        size_t valid_len = (size_t)(found - start);
        if (__builtin_add_overflow(len, valid_len, &len) ||
            __builtin_add_overflow(len, valid_len, &len))
        {
            return SIZE_MAX;
        }

        i += valid_len;
        if (i == c_str_len) {
            break;
        }

        // An invalid byte becomes a replacement character in the lossy
        // portion and a byte escape in the NUL-escaped portion.
        size_t more = sizeof(utf8_replacement) + sizeof_escaped_byte;
        if (__builtin_add_overflow(len, more, &len)) {
            return SIZE_MAX;
        }

        i += 1;
    }

    return len;
}

// Classify the NUL-terminated string `c_str` and report a size through `len`.
//
// If `c_str` is entirely valid UTF-8, stores `strlen(c_str)` in `*len` and
// returns true. Otherwise stores the size computed by
// `arf_sizeof_c_str_arf_impl` for its ARF encoding in `*len` and returns
// false.
bool arf_categorize_c_str(const char *c_str, size_t *restrict len) {
    const uint8_t *const bytes = (const uint8_t *)c_str;
    const size_t byte_count = strlen(c_str);
    const bool all_valid =
        find_invalid_utf8(bytes, byte_count) == bytes + byte_count;

    // The branch hint marks valid UTF-8 as the expected case.
    if (!__builtin_expect(all_valid, true)) {
        *len = arf_sizeof_c_str_arf_impl(c_str, byte_count);
        return false;
    }

    *len = byte_count;
    return true;
}

// Return the size computed by `arf_sizeof_c_str_arf_impl` for the
// NUL-terminated string `c_str`, measuring its length with `strlen` first.
size_t arf_sizeof_c_str_arf(const char *c_str) {
    const size_t c_str_len = strlen(c_str);

    return arf_sizeof_c_str_arf_impl(c_str, c_str_len);
}

// Encode the NUL-terminated string `c_str` as an ARF string into the buffer
// at `ptr`.
//
// The output is, in order: the UTF-8 BOM; the lossy portion, in which each
// invalid byte of `c_str` is replaced with U+FFFD; a NUL separator; and the
// NUL-escaped portion, in which each invalid byte is written as a NUL
// followed by that byte with its high bit cleared. No terminating NUL is
// written after the NUL-escaped portion, and no bounds checking is done on
// the output buffer.
void arf_c_str_arf(const char *c_str, uint8_t *ptr) {
    const uint8_t *const src = (const uint8_t *)c_str;
    const uint8_t *const src_end = src + strlen(c_str);
    uint8_t *out = ptr;

    // Leading magic.
    memcpy(out, utf8_bom, sizeof(utf8_bom));
    out += sizeof(utf8_bom);

    // First pass: the lossy (replacement-encoded) portion.
    const uint8_t *cursor = src;
    while (cursor != src_end) {
        const uint8_t *bad =
            find_invalid_utf8(cursor, (size_t)(src_end - cursor));

        // Pass the run of valid UTF-8 through unchanged.
        const size_t run = (size_t)(bad - cursor);
        memcpy(out, cursor, run);
        out += run;
        cursor = bad;

        if (cursor == src_end) {
            break;
        }

        // Substitute a replacement character for the invalid byte.
        memcpy(out, utf8_replacement, sizeof(utf8_replacement));
        out += sizeof(utf8_replacement);
        cursor += 1;
    }

    // Separator between the two portions.
    *out++ = '\0';

    // Second pass: the NUL-escaped (lossless) portion.
    cursor = src;
    while (cursor != src_end) {
        const uint8_t *bad =
            find_invalid_utf8(cursor, (size_t)(src_end - cursor));

        // Pass the run of valid UTF-8 through unchanged.
        const size_t run = (size_t)(bad - cursor);
        memcpy(out, cursor, run);
        out += run;
        cursor = bad;

        if (cursor == src_end) {
            break;
        }

        // Escape the invalid byte: a NUL, then the byte with its high bit
        // cleared.
        *out++ = '\0';
        *out++ = *cursor & INT8_MAX;
        cursor += 1;
    }
}

// Return the number of bytes needed to hold the C string decoded from the ARF
// string in the `len` bytes at `ptr`, including the terminating NUL.
//
// The input must be a valid ARF string; this is only checked by `assert`.
size_t arf_sizeof_arf_c_str(const uint8_t *ptr, size_t len) {
    assert(arf_is_valid_arf(ptr, len));

    const uint8_t *const end = ptr + len;

    // Skip the BOM, then the lossy portion and its NUL separator, to reach
    // the NUL-escaped portion, which is the non-lossy one.
    const uint8_t *lossy = ptr + sizeof(utf8_bom);
    const uint8_t *cursor = lossy + strlen((const char *)lossy) + 1;

    // Start at one to account for the terminating NUL.
    size_t c_str_len = 1;
    while (cursor != end) {
        // A byte escape occupies two input bytes but yields one output byte.
        cursor += (*cursor == '\0') ? 2 : 1;
        c_str_len += 1;
    }

    return c_str_len;
}

// Decode the ARF string in the `len` bytes at `ptr` into the buffer at
// `c_str` as a NUL-terminated C string, restoring each escaped byte by
// setting its high bit again.
//
// The input must be a valid ARF string; this is only checked by `assert`.
// No bounds checking is done on the output buffer.
void arf_arf_c_str(const uint8_t *ptr, size_t len, char *__restrict__ c_str) {
    assert(arf_is_valid_arf(ptr, len));

    const uint8_t *const end = ptr + len;

    // Skip the BOM, then the lossy portion and its NUL separator, to reach
    // the NUL-escaped portion, which is the non-lossy one.
    const uint8_t *src = ptr + sizeof(utf8_bom);
    src += strlen((const char *)src) + 1;

    // Copy the string data, inverting any escaped bytes.
    char *dst = c_str;
    while (src != end) {
        uint8_t b = *src;
        src += 1;
        if (b == '\0') {
            // The byte after the NUL is the original with its high bit
            // cleared.
            b = *src | (uint8_t)INT8_MIN;
            src += 1;
        }
        *dst = (char)b;
        dst += 1;
    }

    // Append the terminating NUL.
    *dst = '\0';
}