#ifndef MECAB_UCS_H
#define MECAB_UCS_H
#ifndef MECAB_USE_UTF8_ONLY
#include "ucstable.h"
#endif
namespace MeCab {
// Decodes the UTF-8 sequence that starts at `begin` into a 16-bit value.
//
//   begin  first byte of the sequence; begin[0] is always read.
//   end    one past the last readable byte; used only to bound the length.
//   mblen  receives the number of bytes the caller should consume (1..6).
//
// One-, two- and three-byte sequences return their decoded value. Lead bytes
// announcing four to six bytes report that length through `mblen` but return
// 0. Any other lead byte, or a multi-byte lead with too few bytes remaining,
// yields *mblen == 1 and a return value of 0.
//
// Trailing bytes are not validated: only their low six bits are used.
inline unsigned short utf8_to_ucs2(const char *begin, const char *end,
                                   size_t* mblen) {
  const size_t avail = end - begin;
  const unsigned char lead = static_cast<unsigned char>(begin[0]);

  // Single byte: 0xxxxxxx.
  if (lead < 0x80) {
    *mblen = 1;
    return lead;
  }

  // Two bytes: 110xxxxx 10xxxxxx.
  if (avail >= 2 && (lead & 0xe0) == 0xc0) {
    *mblen = 2;
    const int high = (lead & 0x1f) << 6;
    const int low = begin[1] & 0x3f;
    return static_cast<unsigned short>(high | low);
  }

  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
  if (avail >= 3 && (lead & 0xf0) == 0xe0) {
    *mblen = 3;
    const int high = (lead & 0x0f) << 12;
    const int mid = (begin[1] & 0x3f) << 6;
    const int low = begin[2] & 0x3f;
    return static_cast<unsigned short>(high | mid | low);
  }

  // Longer forms: only the length is reported, the value is always 0.
  if (avail >= 4 && (lead & 0xf8) == 0xf0) {
    *mblen = 4;
    return 0;
  }
  if (avail >= 5 && (lead & 0xfc) == 0xf8) {
    *mblen = 5;
    return 0;
  }
  if (avail >= 6 && (lead & 0xfe) == 0xfc) {
    *mblen = 6;
    return 0;
  }

  // Unrecognised lead byte or truncated sequence: skip one byte.
  *mblen = 1;
  return 0;
}
// Treats the byte at `begin` as a complete character.
//
//   begin  byte to convert; begin[0] is read.
//   end    not used; kept so the signature matches the other converters.
//   mblen  always set to 1.
//
// Returns the byte value zero-extended to 16 bits (0..255).
inline unsigned short ascii_to_ucs2(const char *begin, const char *end,
                                    size_t *mblen) {
  static_cast<void>(end);
  *mblen = 1;
  const unsigned char byte = static_cast<unsigned char>(*begin);
  return byte;
}
// Combines the two bytes at `begin` into one 16-bit code unit.
//
//   begin  first byte of the pair.
//   end    one past the last readable byte.
//   mblen  receives the number of bytes consumed: 2 normally, 1 when fewer
//          than two bytes are available.
//
// Returns 0 when fewer than two bytes are available. Otherwise, when
// WORDS_BIGENDIAN is defined the first byte is the high-order byte; when it
// is not defined the second byte is the high-order byte.
//
// NOTE(review): the byte selection depends on the host-endianness macro and
// not only on the "be" in the name -- confirm against callers before changing.
inline unsigned short utf16be_to_ucs2(const char *begin, const char *end,
                                      size_t *mblen) {
  const size_t len = end - begin;
  if (len <= 1) {
    *mblen = 1;
    return 0;
  }
  *mblen = 2;
  // Go through unsigned char first: where plain char is signed, a byte
  // >= 0x80 would otherwise sign-extend and overwrite the high-order byte
  // with 0xFF (and shifting a negative int is undefined before C++20).
  const unsigned int first = static_cast<unsigned char>(begin[0]);
  const unsigned int second = static_cast<unsigned char>(begin[1]);
#if defined WORDS_BIGENDIAN
  return static_cast<unsigned short>((first << 8) | second);
#else
  return static_cast<unsigned short>((second << 8) | first);
#endif
}
// Combines the two bytes at `begin` into one 16-bit code unit.
//
//   begin  first byte of the pair.
//   end    one past the last readable byte.
//   mblen  receives the number of bytes consumed: 2 normally, 1 when fewer
//          than two bytes are available.
//
// Returns 0 when fewer than two bytes are available. Otherwise, when
// WORDS_BIGENDIAN is defined the second byte is the high-order byte; when it
// is not defined the first byte is the high-order byte.
//
// NOTE(review): the byte selection depends on the host-endianness macro and
// not only on the "le" in the name -- confirm against callers before changing.
inline unsigned short utf16le_to_ucs2(const char *begin, const char *end,
                                      size_t *mblen) {
  const size_t len = end - begin;
  if (len <= 1) {
    *mblen = 1;
    return 0;
  }
  *mblen = 2;
  // Go through unsigned char first: where plain char is signed, a byte
  // >= 0x80 would otherwise sign-extend and overwrite the high-order byte
  // with 0xFF (and shifting a negative int is undefined before C++20).
  const unsigned int first = static_cast<unsigned char>(begin[0]);
  const unsigned int second = static_cast<unsigned char>(begin[1]);
#if defined WORDS_BIGENDIAN
  return static_cast<unsigned short>((second << 8) | first);
#else
  return static_cast<unsigned short>((first << 8) | second);
#endif
}
// Forwards to utf16be_to_ucs2 when WORDS_BIGENDIAN is defined and to
// utf16le_to_ucs2 otherwise; all three arguments are passed through unchanged
// and the callee's result is returned as is.
inline unsigned short utf16_to_ucs2(const char *begin, const char *end,
                                    size_t *mblen) {
#if defined WORDS_BIGENDIAN
  const unsigned short code = utf16be_to_ucs2(begin, end, mblen);
#else
  const unsigned short code = utf16le_to_ucs2(begin, end, mblen);
#endif
  return code;
}
#ifndef MECAB_USE_UTF8_ONLY
// Converts the character starting at `begin` using the euc_tbl and
// euc_hojo_tbl lookup tables.
//
//   begin  first byte of the character; begin[0] is always read.
//   end    one past the last readable byte.
//   mblen  receives the number of bytes consumed (1, 2 or 3).
//
// Three cases, tried in order:
//   * Lead byte 0x8f with at least three bytes available: the next two bytes
//     form a 16-bit key. Keys below 0xA0A0 are rejected (one byte consumed,
//     the lead byte is returned); otherwise three bytes are consumed and the
//     result is euc_hojo_tbl[key - 0xA0A0].
//   * Lead byte with the high bit set and at least two bytes available: two
//     bytes are consumed and the result is euc_tbl[(lead << 8) + second].
//   * Anything else: one byte is consumed and returned as is.
//
// NOTE(review): no bounds check is made against either table here; the table
// sizes are defined outside this function -- confirm they cover every index.
inline unsigned short euc_to_ucs2(const char *begin, const char *end,
                                  size_t *mblen) {
  const size_t avail = end - begin;
  const unsigned char lead = static_cast<unsigned char>(begin[0]);

  if (lead == 0x8f && avail >= 3) {
    const unsigned short key = (static_cast<unsigned char>(begin[1]) << 8) +
                               static_cast<unsigned char>(begin[2]);
    if (key >= 0xA0A0) {
      *mblen = 3;
      return euc_hojo_tbl[key - 0xA0A0];
    }
    // Key below the table's base offset: fall back to a single byte.
    *mblen = 1;
    return lead;
  }

  if ((lead & 0x80) && avail >= 2) {
    *mblen = 2;
    const int index = (lead << 8) + static_cast<unsigned char>(begin[1]);
    return euc_tbl[index];
  }

  *mblen = 1;
  return lead;
}
// Converts the character starting at `begin` using the cp932_tbl lookup table.
//
//   begin  first byte of the character; begin[0] is always read.
//   end    one past the last readable byte.
//   mblen  receives the number of bytes consumed (1 or 2).
//
// Three cases, tried in order:
//   * Lead byte in 0xA1..0xDF: one byte is consumed and the result is
//     cp932_tbl[lead].
//   * Any other lead byte with the high bit set, when at least two bytes are
//     available: two bytes are consumed and the result is
//     cp932_tbl[(lead << 8) + second].
//   * Anything else: one byte is consumed and returned as is.
inline unsigned short cp932_to_ucs2(const char *begin, const char *end,
                                    size_t *mblen) {
  const size_t avail = end - begin;
  const unsigned char lead = static_cast<unsigned char>(begin[0]);

  // Single-byte range that is still mapped through the table.
  if (lead >= 0xA1 && lead <= 0xDF) {
    *mblen = 1;
    return cp932_tbl[lead];
  }

  if ((lead & 0x80) && avail >= 2) {
    *mblen = 2;
    const int index = (lead << 8) + static_cast<unsigned char>(begin[1]);
    return cp932_tbl[index];
  }

  *mblen = 1;
  return lead;
}
#endif
}
#endif