#pragma once
#ifdef _MSC_VER
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <Windows.h>
#endif
#include <cstring>
#include "Common.hpp"
#include "Exception.hpp"
namespace opencc {
class OPENCC_EXPORT UTF8Util {
public:
/**
 * Declaration only; the definition is not part of this header.
 * NOTE(review): judging by the name, this presumably advances fp past a
 * leading UTF-8 byte-order mark -- confirm against the implementation.
 */
static void SkipUtf8Bom(FILE* fp);
/**
 * Classifies the byte at str as a UTF-8 lead byte and reports how many bytes
 * the sequence it introduces occupies (1 to 6, including the legacy 5- and
 * 6-byte forms). Only this first byte is inspected; trailing bytes are not
 * validated.
 *
 * @param str Pointer to the byte to classify; it is dereferenced once.
 * @return The sequence length, or 0 when the byte matches no lead-byte
 *         pattern (for example a 10xxxxxx continuation byte).
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  // The three-byte form is tested first. The bit patterns are mutually
  // exclusive, so the order of the tests does not affect the result.
  if ((lead >> 4) == 0x0E) {  // 1110xxxx
    return 3;
  }
  if ((lead >> 7) == 0x00) {  // 0xxxxxxx
    return 1;
  }
  if ((lead >> 5) == 0x06) {  // 110xxxxx
    return 2;
  }
  if ((lead >> 3) == 0x1E) {  // 11110xxx
    return 4;
  }
  if ((lead >> 2) == 0x3E) {  // 111110xx
    return 5;
  }
  if ((lead >> 1) == 0x7E) {  // 1111110x
    return 6;
  }
  return 0;
}
/**
 * Throwing variant of NextCharLengthNoException.
 *
 * @param str Pointer to the first byte of the character to measure.
 * @return The byte length reported by NextCharLengthNoException.
 * @throws InvalidUTF8 (constructed from str) when that length is 0.
 */
static size_t NextCharLength(const char* str) {
  const size_t charLength = NextCharLengthNoException(str);
  if (charLength != 0) {
    return charLength;
  }
  throw InvalidUTF8(str);
}
/**
 * Returns the byte length of the UTF-8 character that ends immediately
 * before str.
 *
 * The bytes are examined from the nearest (str - 1) outwards, and the scan
 * stops at the first byte that is not a 10xxxxxx continuation byte. For
 * well-formed text this never reads further back than the lead byte of the
 * preceding character, so it does not touch memory in front of the buffer
 * as long as str is not the buffer start.
 *
 * @param str Pointer just past the character to measure. The caller must
 *            guarantee that at least one character precedes it in the same
 *            buffer.
 * @return The length in bytes (1 to 6) of the preceding character.
 * @throws InvalidUTF8 (constructed from str) when no lead byte whose
 *         announced length equals its distance from str is found.
 */
static size_t PrevCharLength(const char* str) {
  for (size_t distance = 1; distance <= 6; distance++) {
    const char* candidate = str - distance;
    if (NextCharLengthNoException(candidate) == distance) {
      return distance;
    }
    // Only continuation bytes may sit between a lead byte and str. Anything
    // else means the preceding sequence is malformed, so stop instead of
    // reading even further back.
    const unsigned char byte = static_cast<unsigned char>(*candidate);
    if ((byte & 0xC0) != 0x80) {
      break;
    }
  }
  throw InvalidUTF8(str);
}
/**
 * Advances past the character starting at str.
 *
 * @param str Pointer to the first byte of a character.
 * @return str moved forward by NextCharLength(str) bytes; any exception
 *         raised by NextCharLength propagates.
 */
static const char* NextChar(const char* str) {
  const size_t step = NextCharLength(str);
  return str + step;
}
/**
 * Steps back over the character that ends just before str.
 *
 * @param str Pointer just past a character.
 * @return str moved backward by PrevCharLength(str) bytes; any exception
 *         raised by PrevCharLength propagates.
 */
static const char* PrevChar(const char* str) {
  const size_t step = PrevCharLength(str);
  return str - step;
}
/**
 * Counts the characters (not bytes) in a NUL-terminated string by hopping
 * from one lead byte to the next with NextChar.
 *
 * @param str NUL-terminated string to measure.
 * @return Number of NextChar steps taken before the terminating NUL is
 *         reached; any exception raised by NextChar propagates.
 */
static size_t Length(const char* str) {
  size_t count = 0;
  for (const char* cursor = str; *cursor != '\0'; cursor = NextChar(cursor)) {
    ++count;
  }
  return count;
}
/**
 * Searches forward, one character at a time, for the byte ch without leaving
 * the current line.
 *
 * @param str Position at which the search starts.
 * @param ch  Byte to look for; it is compared against the first byte of each
 *            character.
 * @return Pointer to the first position whose byte equals ch, or to the
 *         '\0', '\n' or '\r' that stopped the search, whichever comes first.
 *         Any exception raised by NextChar propagates.
 */
static const char* FindNextInline(const char* str, const char ch) {
  const char* cursor = str;
  for (;;) {
    const char current = *cursor;
    if (current == ch || IsLineEndingOrFileEnding(current)) {
      return cursor;
    }
    cursor = NextChar(cursor);
  }
}
/**
 * Tells whether ch terminates a line or the whole text.
 *
 * @param ch Byte to test.
 * @return true for '\0', '\n' and '\r'; false for every other value.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
/**
 * Builds a std::string of exactly length bytes from the start of str.
 *
 * Bytes are copied with strncpy, so copying stops at the first NUL in str;
 * any remaining positions of the result keep their '\0' fill.
 *
 * @param str    Source bytes; not read at all when length is 0.
 * @param length Size in bytes of the returned string.
 * @return A string of size length holding the copied prefix.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string newStr(length, '\0');
  if (length > 0) {
    // Write through the mutable operator[] reference: modifying the array
    // returned by c_str() is undefined behaviour.
    strncpy(&newStr[0], str, length);
  }
  return newStr;
}
/**
 * Checks that a NUL-terminated string holds at least byteLength bytes before
 * its terminator, reading no more than byteLength bytes to find out.
 *
 * @param str        NUL-terminated string to probe.
 * @param byteLength Minimum number of non-NUL bytes required.
 * @return false if a '\0' occurs within the first byteLength bytes,
 *         true otherwise (always true when byteLength is 0).
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
std::string wordTrunc;
if (NotShorterThan(str, maxByteLength)) {
size_t len = 0;
const char* pStr = str;
for (;;) {
const size_t charLength = NextCharLength(pStr);
if (len + charLength > maxByteLength) {
break;
}
pStr += charLength;
len += charLength;
}
wordTrunc = FromSubstr(str, len);
} else {
wordTrunc = str;
}
return wordTrunc;
}
/**
 * Replaces, in place, every non-overlapping occurrence of from in str with
 * to, scanning left to right. Text inserted by a replacement is not
 * searched again.
 *
 * @param str  String to modify.
 * @param from NUL-terminated pattern to search for. An empty pattern leaves
 *             str untouched.
 * @param to   NUL-terminated replacement text; may be empty.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    // An empty pattern matches at every position, so the loop below would
    // never terminate.
    return;
  }
  const std::string::size_type toLen = strlen(to);
  std::string::size_type pos = 0;
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    // Continue after the inserted text so it is never matched again.
    pos += toLen;
  }
}
/**
 * Concatenates strings, writing separator between each pair of neighbouring
 * elements.
 *
 * @param strings   Pieces to join, in order.
 * @param separator Text placed between consecutive pieces (never before the
 *                  first or after the last).
 * @return The joined text; empty when strings is empty.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::ostringstream out;
  auto it = strings.begin();
  const auto end = strings.end();
  if (it != end) {
    // The first piece goes out bare; every later one is preceded by the
    // separator.
    out << *it;
    for (++it; it != end; ++it) {
      out << separator << *it;
    }
  }
  return out.str();
}
/**
 * Concatenates strings back to back, with nothing in between.
 *
 * @param strings Pieces to join, in order.
 * @return The concatenation; empty when strings is empty.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::ostringstream out;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    out << *it;
  }
  return out.str();
}
/**
 * Records the byte offset, relative to str, at which each of the first
 * utf8Length characters starts.
 *
 * @param str        Start of the text.
 * @param utf8Length Number of characters to index.
 * @param byteMap    Output vector. It is enlarged to utf8Length entries when
 *                   smaller and never shrunk; entry i receives the offset of
 *                   character i, and entries beyond utf8Length are left
 *                   untouched.
 * Any exception raised by NextChar propagates.
 */
static void GetByteMap(const char* str, const size_t utf8Length,
                       std::vector<size_t>* byteMap) {
  std::vector<size_t>& offsets = *byteMap;
  if (offsets.size() < utf8Length) {
    offsets.resize(utf8Length);
  }
  const char* cursor = str;
  size_t index = 0;
  while (index < utf8Length) {
    offsets[index] = cursor - str;
    cursor = NextChar(cursor);
    ++index;
  }
}
#ifdef _MSC_VER
/**
 * MSVC build: returns str converted by U8ToU16 as a std::wstring.
 *
 * @param str Text to convert.
 * @return The result of U8ToU16(str).
 */
static std::wstring GetPlatformString(const std::string& str) {
  std::wstring wide = U8ToU16(str);
  return wide;
}
#else
/**
 * Non-MSVC build: the text is handed back unchanged.
 *
 * @param str Text to return.
 * @return A copy of str.
 */
static std::string GetPlatformString(const std::string& str) {
  return std::string(str);
}
#endif
#ifdef _MSC_VER
/**
 * Converts a wide string to a UTF-8 encoded std::string using
 * WideCharToMultiByte with CP_UTF8.
 *
 * The API is called twice: first with no output buffer to obtain the
 * required size, then again to fill a buffer of that size.
 *
 * @param wstr Wide string to convert; its length is narrowed to int.
 * @return The converted bytes, or an empty string when the sizing call
 *         reports a count that is not positive.
 */
static std::string U16ToU8(const std::wstring& wstr) {
  const int wideLength = static_cast<int>(wstr.length());
  const int byteCount = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(),
                                            wideLength, NULL, 0, NULL, NULL);
  if (byteCount <= 0) {
    return std::string();
  }
  std::string utf8;
  utf8.resize(byteCount);
  WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideLength, &utf8[0],
                      byteCount, NULL, NULL);
  return utf8;
}
/**
 * Converts a UTF-8 encoded std::string to a wide string using
 * MultiByteToWideChar with CP_UTF8.
 *
 * The API is called twice: first with no output buffer to obtain the
 * required size, then again to fill a buffer of that size.
 *
 * @param str UTF-8 bytes to convert; the length is narrowed to int.
 * @return The converted wide string, or an empty one when the sizing call
 *         reports a count that is not positive.
 */
static std::wstring U8ToU16(const std::string& str) {
  const int byteLength = static_cast<int>(str.length());
  const int wideCount =
      MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteLength, NULL, 0);
  if (wideCount <= 0) {
    return std::wstring();
  }
  std::wstring wide;
  wide.resize(wideCount);
  MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteLength, &wide[0],
                      wideCount);
  return wide;
}
#endif };
}