#include "absl/strings/internal/utf8.h"
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/base/port.h"
#include "absl/strings/string_view.h"
namespace {
using ::absl::strings_internal::kMaxEncodedUTF8Size;
using ::absl::strings_internal::ShiftState;
using ::absl::strings_internal::WideToUtf8;
using ::testing::StartsWith;
using ::testing::TestParamInfo;
using ::testing::TestWithParam;
using ::testing::ValuesIn;
#if !defined(__cpp_char8_t)
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++2a-compat"
#endif
// Checks EncodeUTF8Char against the bytes the compiler emits for the same code
// point in a u8 string literal, and checks that nothing past the reported
// length is touched.
TEST(EncodeUTF8Char, BasicFunction) {
  using ::absl::strings_internal::EncodeUTF8Char;

  const std::pair<char32_t, std::string> kCases[] = {
      {0x0030, u8"\u0030"},
      {0x00A3, u8"\u00A3"},
      {0x00010000, u8"\U00010000"},
      {0x0000FFFF, u8"\U0000FFFF"},
      {0x0010FFFD, u8"\U0010FFFD"}};

  constexpr int kBufSize = 7;
  for (const auto& entry : kCases) {
    const char32_t code_point = entry.first;
    const std::string& expected = entry.second;

    // Encode into two buffers pre-filled with different bytes. A trailing
    // position that still holds its fill byte in BOTH buffers was not written.
    char zero_filled[kBufSize] = {};
    char ones_filled[kBufSize];
    std::memset(ones_filled, 0xFF, sizeof ones_filled);

    char* const zero_end = zero_filled + EncodeUTF8Char(zero_filled, code_point);
    char* const ones_end = ones_filled + EncodeUTF8Char(ones_filled, code_point);

    // Trim untouched trailing bytes to find how much was really written.
    int apparent_length = kBufSize;
    while (apparent_length > 0 &&
           zero_filled[apparent_length - 1] == '\x00' &&
           ones_filled[apparent_length - 1] == '\xFF') {
      --apparent_length;
    }

    EXPECT_EQ(apparent_length, zero_end - zero_filled);
    EXPECT_EQ(apparent_length, ones_end - ones_filled);
    EXPECT_EQ(apparent_length, expected.length());
    EXPECT_EQ(std::string(zero_filled, apparent_length), expected);
    EXPECT_EQ(std::string(ones_filled, apparent_length), expected);
  }

  // Values that are not valid code points must still report a length within
  // the documented maximum.
  char above_max[32] = "Don't Tread On Me";
  EXPECT_LE(EncodeUTF8Char(above_max, 0x00110000),
            absl::strings_internal::kMaxEncodedUTF8Size);

  char negative[32] = "Negative is invalid but sane";
  EXPECT_LE(EncodeUTF8Char(negative, -1),
            absl::strings_internal::kMaxEncodedUTF8Size);
}
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif
// One scenario for the parameterized WideToUtf8 test: a single wide character,
// the shift state before the call, and everything expected after it.
struct WideToUtf8TestCase {
  // Case label; also used as the generated test-name suffix.
  std::string description;
  // Wide character handed to WideToUtf8.
  wchar_t input;
  // Bytes expected at the start of the output buffer.
  std::string expected_utf8_str;
  // Expected return value of WideToUtf8 (static_cast<size_t>(-1) for errors).
  size_t expected_bytes_written;
  // Shift state passed in; defaults to "no pending high surrogate".
  ShiftState initial_state{false, 0};
  // Shift state expected after the call.
  ShiftState expected_state{false, 0};
};
// Builds the parameter list for WideToUtf8ParamTest. The fixed table covers
// 1-3 byte encodings and surrogate handling; further cases are appended
// depending on the width and range of wchar_t on the current platform.
std::vector<WideToUtf8TestCase> GetWideToUtf8TestCases() {
  // Return value expected from WideToUtf8 when the input is rejected.
  constexpr size_t kError = static_cast<size_t>(-1);

  std::vector<WideToUtf8TestCase> cases{
      // One-byte encodings.
      {"ASCII_A", L'A', "A", 1},
      {"NullChar", L'\0', std::string("\0", 1), 1},
      {"ASCII_Max_7F", L'\x7F', "\x7F", 1},
      // Two-byte encodings.
      {"TwoByte_Min_80", L'\u0080', "\xC2\x80", 2},
      {"PoundSign_A3", L'\u00A3', "\xC2\xA3", 2},
      {"TwoByte_Max_7FF", L'\u07FF', "\xDF\xBF", 2},
      // Three-byte encodings.
      {"ThreeByte_Min_800", L'\u0800', "\xE0\xA0\x80", 3},
      {"EuroSign_20AC", L'\u20AC', "\xE2\x82\xAC", 3},
      {"BMP_MaxBeforeSurrogates_D7FF", L'\uD7FF', "\xED\x9F\xBF", 3},
      {"BMP_FFFF", L'\uFFFF', "\xEF\xBF\xBF", 3},
      // High surrogates: two bytes are expected and the state must record
      // that a high surrogate was seen.
      {"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {}, {true, 0}},
      {"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {}, {true, 3}},
      {"HighSurr_D800_after_HighD800", L'\xD800', "\xF0\x90", 2,
       {true, 0}, {true, 0}},
      {"HighSurr_DBFF_after_HighDBFF", L'\xDBFF', "\xF4\x8F", 2,
       {true, 3}, {true, 3}},
      // Low surrogates following a high surrogate: two more bytes are
      // expected and the state must be cleared.
      {"LowSurr_DC00_after_HighD800", L'\xDC00', "\x80\x80", 2, {true, 0}, {}},
      {"LowSurr_DFFD_after_HighDBFF", L'\xDFFD', "\xBF\xBD", 2, {true, 3}, {}},
      {"LowSurr_DC00_with_InitialState_saw_high_bits_1", L'\xDC00', "\x90\x80",
       2, {true, 1}, {}},
      // Low surrogates with no pending high surrogate are errors.
      {"Error_IsolatedLowSurr_DC00_NoPriorHigh", L'\xDC00', "", kError, {}, {}},
      {"Error_IsolatedLowSurr_DFFF_NoPriorHigh", L'\xDFFF', "", kError, {}, {}},
#if (defined(WCHAR_MAX) && WCHAR_MAX > 0xFFFF)
      // Supplementary-plane code points held directly in a wide wchar_t.
      {"DirectSupplementaryChars_U10000", static_cast<wchar_t>(0x10000),
       "\xF0\x90\x80\x80", 4},
      {"DirectSupplementaryChars_U10FFFD", static_cast<wchar_t>(0x10FFFD),
       "\xF4\x8F\xBF\xBD", 4},
#endif
  };

  // wchar_t(-1): with a 2-byte wchar_t the case expects the encoding of
  // U+FFFF; with any other width it expects an error.
  const wchar_t all_bits_set = static_cast<wchar_t>(-1);
  if constexpr (sizeof(wchar_t) != 2) {
    cases.push_back({"Error_WChar_MinusOne_as_FFFFFFFF", all_bits_set, "",
                     kError, {}, {}});
  } else {
    cases.push_back(
        {"WChar_MinusOne_as_FFFF", all_bits_set, "\xEF\xBF\xBF", 3});
  }

  // First value past U+10FFFF; only added when wchar_t can hold it.
  if constexpr (sizeof(wchar_t) >= 4) {
    constexpr unsigned long kFirstOutOfRange = 0x110000UL;
#ifdef WCHAR_MAX
    const bool representable =
        static_cast<uintmax_t>(WCHAR_MAX) >= kFirstOutOfRange;
    if (representable) {
      cases.push_back({"Error_OutOfRange_110000",
                       static_cast<wchar_t>(kFirstOutOfRange), "", kError,
                       {}, {}});
    }
#else
    cases.push_back({"Error_OutOfRange_110000_fallback",
                     static_cast<wchar_t>(kFirstOutOfRange), "", kError,
                     {}, {}});
#endif
  }

  return cases;
}
// Value-parameterized fixture: each instantiated test receives one
// WideToUtf8TestCase through GetParam(). It adds no state of its own.
class WideToUtf8ParamTest : public TestWithParam<WideToUtf8TestCase> {};
// Converts the single wide character of the current test case and checks the
// return value, the bytes produced, that the byte right after the expected
// output is untouched, and the resulting shift state.
TEST_P(WideToUtf8ParamTest, SingleCharConversion) {
  const WideToUtf8TestCase& param = GetParam();
  const std::string& want_bytes = param.expected_utf8_str;
  const ShiftState& want_state = param.expected_state;

  // Pre-fill the output with a sentinel so unwritten bytes can be recognised.
  constexpr char kSentinel = '\xAB';
  std::string out(32, kSentinel);
  ShiftState shift = param.initial_state;

  const size_t written = WideToUtf8(param.input, out.data(), shift);

  EXPECT_EQ(written, param.expected_bytes_written);
  EXPECT_THAT(out, StartsWith(want_bytes));

  // Guard the index below, then confirm the next byte still holds the
  // sentinel.
  ASSERT_LT(want_bytes.length(), out.size());
  EXPECT_EQ(out[want_bytes.length()], kSentinel);

  EXPECT_EQ(shift.saw_high_surrogate, want_state.saw_high_surrogate);
  EXPECT_EQ(shift.bits, want_state.bits);
}
// Runs WideToUtf8ParamTest once per entry of GetWideToUtf8TestCases(), naming
// each generated test after the entry's `description`.
INSTANTIATE_TEST_SUITE_P(
    WideCharToUtf8Conversion, WideToUtf8ParamTest,
    ValuesIn(GetWideToUtf8TestCases()),
    [](const TestParamInfo<WideToUtf8TestCase>& info) -> std::string {
      return info.param.description;
    });
#define WIDE_STRING_LITERAL L"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩❤️💋👨 中"
#define UTF8_STRING_LITERAL u8"Holá €1 你好 שָׁלוֹם 👍🏻🇺🇸👩❤️💋👨 中"
// Returns UTF8_STRING_LITERAL as a view over plain `char` storage. The bytes
// are copied once into a static buffer, so the view stays valid for the rest
// of the program.
absl::string_view GetUtf8TestString() {
  const auto copy_literal_to_chars = []() -> char* {
    // Deduce the element type of a u8 literal rather than naming it, since it
    // may be `const char` or `const char8_t` depending on the language mode.
    using Utf8Unit = std::remove_reference_t<decltype(*u8"a")>;
    constexpr Utf8Unit kLiteral[] = UTF8_STRING_LITERAL;
    static char bytes[sizeof kLiteral];
    std::memcpy(bytes, kLiteral, sizeof kLiteral);
    return bytes;
  };
  // Built from a char pointer, so the view ends at the first NUL byte.
  static absl::string_view kUtf8TestString = copy_literal_to_chars();
  return kUtf8TestString;
}
// Feeds WIDE_STRING_LITERAL through WideToUtf8 one wchar_t at a time
// (including the terminating L'\0') and checks that the concatenated output
// equals the UTF-8 encoding of the same text.
TEST(WideToUtf8, FullString) {
  // Value WideToUtf8 is expected to return when it rejects its input.
  constexpr size_t kError = static_cast<size_t>(-1);

  // Sized in bytes of the wide literal times the maximum encoded length, which
  // is at least as large as anything the loop can write.
  std::string buffer(kMaxEncodedUTF8Size * sizeof(WIDE_STRING_LITERAL), '\0');
  char* buffer_ptr = buffer.data();
  ShiftState state;
  for (const wchar_t wc : WIDE_STRING_LITERAL) {
    const size_t written = WideToUtf8(wc, buffer_ptr, state);
    // Never add the error sentinel to the cursor: that would move the pointer
    // outside `buffer` and make every later write undefined behaviour. Fail
    // the test cleanly instead.
    ASSERT_NE(written, kError)
        << "WideToUtf8 rejected wide char value "
        << static_cast<unsigned long>(wc);
    buffer_ptr += written;
  }
  EXPECT_THAT(buffer, StartsWith(GetUtf8TestString()));
  // `buffer` is zero-filled, so the prefix check alone cannot detect extra or
  // missing trailing bytes. The loop also converts the terminating L'\0',
  // which accounts for the one byte beyond the UTF-8 text.
  EXPECT_EQ(static_cast<size_t>(buffer_ptr - buffer.data()),
            GetUtf8TestString().size() + 1);
}
#undef WIDE_STRING_LITERAL
#undef UTF8_STRING_LITERAL
}