#include "regint.h"
/*
 * One entry of the property-name lookup table: the value type returned
 * (by pointer) from unicode_lookup_property_name().
 */
struct PoolPropertyNameCtype {
  /* NOTE(review): presumably an offset into a pooled name string, as the
     struct name suggests -- confirm against the generated lookup data. */
  short name;
  /* ctype value that onigenc_unicode_property_name_to_ctype() hands back. */
  short ctype;
};
/*
 * True (1) when the EncUNICODE_ISO_8859_1_CtypeTable entry for `code` has the
 * bit selected by CTYPE_TO_BIT(ctype) set, otherwise 0.
 * `code` is used directly as the table index, so it must be below 256.
 */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  (0 != (EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)))
/*
 * Per-code-point ctype bit masks for code points 0..255.
 * Entry [code] is tested against CTYPE_TO_BIT(ctype) by
 * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(); onigenc_unicode_is_code_ctype()
 * uses it as the fast path for code < 256.
 * Eight entries per row: row k covers code points 8*k .. 8*k+7.
 */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0284, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x01a0,
0x01a0, 0x01a0, 0x30e2, 0x01a0, 0x01a0, 0x00a8, 0x01a0, 0x01a0,
0x01a0, 0x01a0, 0x10a0, 0x10a0, 0x01a0, 0x30e2, 0x01a0, 0x01a0,
0x01a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x01a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x01a0,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
#include "st.h"
#include "unicode_fold_data.c"
/*
 * Case-fold the single character at *pp and write its folded encoding
 * to `fold`.
 *
 *   enc   encoding used to decode the input and encode the output
 *   flag  case-fold options (ASCII-only restriction, Turkish/Azeri mode)
 *   pp    in/out: points at the character to fold; advanced past it on return
 *   end   end of the input buffer
 *   fold  output buffer for the folded bytes
 *
 * Returns the number of bytes written to `fold`.  When no fold applies,
 * the original bytes of the character are copied unchanged.
 */
extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag,
                              const UChar** pp, const UChar* end, UChar* fold)
{
  const struct ByUnfoldKey* buk;
  OnigCodePoint code;
  int i, len, rlen;
  const UChar *p = *pp;

  code = ONIGENC_MBC_TO_CODE(enc, p, end);
  len = enclen(enc, p);
  *pp += len;

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    /* U+0130 folds to U+0069 in this mode. */
    if (code == 0x0130) {
      return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
    }
    /* A malformed "#if 0#endif" directive used to sit here; it opened a
       conditional that was never closed and has been removed. */
  }
#endif

  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) {
    buk = onigenc_unicode_unfold_key(code);
    if (buk != 0) {
      if (buk->fold_len == 1) {
        /* In ASCII-only mode a fold is used only if the result is ASCII;
           otherwise fall through and copy the input unchanged. */
        if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
            ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index)))
          return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
      }
      else {
        /* Multi-code-point fold: encode each code point in sequence. */
        OnigCodePoint* addr;

        FOLDS_FOLD_ADDR_BUK(buk, addr);
        rlen = 0;
        for (i = 0; i < buk->fold_len; i++) {
          OnigCodePoint c = addr[i];
          len = ONIGENC_CODE_TO_MBC(enc, c, fold);
          fold += len;
          rlen += len;
        }
        return rlen;
      }
    }
  }

  /* No applicable fold: copy the character's bytes as-is. */
  for (i = 0; i < len; i++) {
    *fold++ = *p++;
  }
  return len;
}
/*
 * Walk the single-code-point fold entries with index in [from, to) and
 * report every case-equivalent pair to `f`, in both directions:
 * fold <-> each unfold, and each unfold <-> every earlier unfold.
 *
 * In ASCII-only mode the walk stops at the first entry whose fold is not
 * ASCII, and non-ASCII unfolds are skipped.
 *
 * Returns 0 on completion, or the first non-zero value returned by `f`.
 */
static int
apply_case_fold1(OnigCaseFoldType flag, int from, int to,
                 OnigApplyAllCaseFoldFunc f, void* arg)
{
  int idx;

  for (idx = from; idx < to; idx = FOLDS1_NEXT_INDEX(idx)) {
    OnigCodePoint fold = *FOLDS1_FOLD(idx);
    int count, cur, prev, rc;

    if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold))
      break;

    count = FOLDS1_UNFOLDS_NUM(idx);
    for (cur = 0; cur < count; cur++) {
      OnigCodePoint unfold = FOLDS1_UNFOLDS(idx)[cur];

      if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold))
        continue;

      /* fold <-> unfold */
      rc = (*f)(fold, &unfold, 1, arg);
      if (rc != 0) return rc;
      rc = (*f)(unfold, &fold, 1, arg);
      if (rc != 0) return rc;

      /* unfold <-> each unfold listed before it */
      for (prev = 0; prev < cur; prev++) {
        OnigCodePoint other = FOLDS1_UNFOLDS(idx)[prev];

        if (! CASE_FOLD_IS_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(other)) {
          rc = (*f)(unfold, &other, 1, arg);
          if (rc != 0) return rc;
          rc = (*f)(other, &unfold, 1, arg);
          if (rc != 0) return rc;
        }
      }
    }
  }

  return 0;
}
/*
 * Walk the two-code-point fold entries with index in [from, to).
 * For every unfold of an entry, report unfold -> (2-code fold) to `f`,
 * then report the unfold against every earlier unfold in both directions.
 *
 * Returns 0 on completion, or the first non-zero value returned by `f`.
 */
static int
apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
{
  int idx;

  for (idx = from; idx < to; idx = FOLDS2_NEXT_INDEX(idx)) {
    OnigCodePoint* fold = FOLDS2_FOLD(idx);
    int count = FOLDS2_UNFOLDS_NUM(idx);
    int cur, prev, rc;

    for (cur = 0; cur < count; cur++) {
      OnigCodePoint unfold = FOLDS2_UNFOLDS(idx)[cur];

      rc = (*f)(unfold, fold, 2, arg);
      if (rc != 0) return rc;

      for (prev = 0; prev < cur; prev++) {
        OnigCodePoint other = FOLDS2_UNFOLDS(idx)[prev];

        rc = (*f)(unfold, &other, 1, arg);
        if (rc != 0) return rc;
        rc = (*f)(other, &unfold, 1, arg);
        if (rc != 0) return rc;
      }
    }
  }

  return 0;
}
/*
 * Walk the three-code-point fold entries with index in [from, to).
 * For every unfold of an entry, report unfold -> (3-code fold) to `f`,
 * then report the unfold against every earlier unfold in both directions.
 *
 * Returns 0 on completion, or the first non-zero value returned by `f`.
 */
static int
apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
{
  int idx;

  for (idx = from; idx < to; idx = FOLDS3_NEXT_INDEX(idx)) {
    OnigCodePoint* fold = FOLDS3_FOLD(idx);
    int count = FOLDS3_UNFOLDS_NUM(idx);
    int cur, prev, rc;

    for (cur = 0; cur < count; cur++) {
      OnigCodePoint unfold = FOLDS3_UNFOLDS(idx)[cur];

      rc = (*f)(unfold, fold, 3, arg);
      if (rc != 0) return rc;

      for (prev = 0; prev < cur; prev++) {
        OnigCodePoint other = FOLDS3_UNFOLDS(idx)[prev];

        rc = (*f)(unfold, &other, 1, arg);
        if (rc != 0) return rc;
        rc = (*f)(other, &unfold, 1, arg);
        if (rc != 0) return rc;
      }
    }
  }

  return 0;
}
/*
 * Report every case-fold pair known to the fold tables to the callback `f`.
 *
 *   flag  case-fold options; multi-character folds are reported only when
 *         INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is set
 *   f     callback invoked as f(from, to_codes, to_len, arg)
 *   arg   opaque pointer passed through to `f`
 *
 * Returns 0 on completion, or the first non-zero value returned by `f`.
 */
extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
                                    OnigApplyAllCaseFoldFunc f, void* arg)
{
  int r;
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Scratch target for the hard-coded Turkish/Azeri pairs below.  It was
     used without a declaration, so this configuration could not compile. */
  OnigCodePoint code;
#endif

  r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg);
  if (r != 0) return r;

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    /* Turkish/Azeri mode: I <-> dotless i, dotted I <-> i. */
    code = 0x0131;
    r = (*f)(0x0049, &code, 1, arg);
    if (r != 0) return r;
    code = 0x0049;
    r = (*f)(0x0131, &code, 1, arg);
    if (r != 0) return r;

    code = 0x0130;
    r = (*f)(0x0069, &code, 1, arg);
    if (r != 0) return r;
    code = 0x0069;
    r = (*f)(0x0130, &code, 1, arg);
    if (r != 0) return r;
  }
  else {
#endif
    /* Entries past the "normal" end are applied only outside Turkish mode. */
    r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
    if (r != 0) return r;
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  }
#endif

  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
    return 0;

  r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
  if (r != 0) return r;

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
#endif
    r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
    if (r != 0) return r;
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  }
#endif

  r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
  if (r != 0) return r;

  return 0;
}
/*
 * Enumerate the case-fold alternatives for the text starting at `p`.
 *
 * Each alternative is written to items[k] as: byte_len (number of input
 * bytes it stands for), code_len (number of code points in the
 * alternative) and code[] (the code points themselves).
 *
 * With INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR set, up to three leading
 * characters are folded and looked up as a 3-code and then a 2-code fold
 * key; the first lookup that hits decides the result.  Otherwise, or if
 * neither hits, only the first character is considered.
 *
 * Returns the number of items written (0 when there are none).
 * NOTE(review): no bound on items[] is checked here; the caller presumably
 * sizes it for the worst case -- confirm.
 */
extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
OnigCaseFoldCodeItem items[])
{
/* lens[k]: cumulative byte length of the first k+1 input characters. */
int n, m, i, j, k, len, lens[3];
int index;
/* cs[fn][0] is the fn-th code of a multi-code fold, cs[fn][1..] its
   single-code unfolds; ncs[fn] is how many entries of cs[fn] are used. */
int fn, ncs[3];
OnigCodePoint cs[3][4];
/* orig_codes[]: code points as read; codes[]: their single-code folds. */
OnigCodePoint code, codes[3], orig_codes[3];
const struct ByUnfoldKey* buk1;
n = 0;
code = ONIGENC_MBC_TO_CODE(enc, p, end);
/* ASCII-only mode: a non-ASCII first character has no alternatives. */
if (CASE_FOLD_IS_ASCII_ONLY(flag)) {
if (! ONIGENC_IS_ASCII_CODE(code)) return n;
}
len = enclen(enc, p);
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
/* Turkish/Azeri mode: the four I/i variants map pairwise and bypass the
   table lookups entirely. */
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
if (code == 0x0049) {
items[0].byte_len = len;
items[0].code_len = 1;
items[0].code[0] = 0x0131;
return 1;
}
else if (code == 0x0130) {
items[0].byte_len = len;
items[0].code_len = 1;
items[0].code[0] = 0x0069;
return 1;
}
else if (code == 0x0131) {
items[0].byte_len = len;
items[0].code_len = 1;
items[0].code[0] = 0x0049;
return 1;
}
else if (code == 0x0069) {
items[0].byte_len = len;
items[0].code_len = 1;
items[0].code[0] = 0x0130;
return 1;
}
}
#endif
orig_codes[0] = code;
lens[0] = len;
p += len;
/* Fold the first character if it has a single-code fold; else keep it. */
buk1 = onigenc_unicode_unfold_key(orig_codes[0]);
if (buk1 != 0 && buk1->fold_len == 1) {
codes[0] = *FOLDS1_FOLD(buk1->index);
}
else
codes[0] = orig_codes[0];
if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
goto fold1;
/* Multi-char mode: try to read and fold a second character. */
if (p < end) {
const struct ByUnfoldKey* buk;
code = ONIGENC_MBC_TO_CODE(enc, p, end);
orig_codes[1] = code;
len = enclen(enc, p);
lens[1] = lens[0] + len;
buk = onigenc_unicode_unfold_key(orig_codes[1]);
if (buk != 0 && buk->fold_len == 1) {
codes[1] = *FOLDS1_FOLD(buk->index);
}
else
codes[1] = orig_codes[1];
p += len;
/* A third character is available: try the 3-code fold key first. */
if (p < end) {
code = ONIGENC_MBC_TO_CODE(enc, p, end);
orig_codes[2] = code;
len = enclen(enc, p);
lens[2] = lens[1] + len;
buk = onigenc_unicode_unfold_key(orig_codes[2]);
if (buk != 0 && buk->fold_len == 1) {
codes[2] = *FOLDS1_FOLD(buk->index);
}
else
codes[2] = orig_codes[2];
index = onigenc_unicode_fold3_key(codes);
if (index >= 0) {
/* Single code points that fold to this 3-code sequence. */
m = FOLDS3_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
items[n].byte_len = lens[2];
items[n].code_len = 1;
items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
n++;
}
/* Collect, per position, the fold code plus its single-code unfolds.
   NOTE(review): cs[fn] holds 4 entries, so this assumes at most 3
   unfolds per code -- confirm against the fold data. */
for (fn = 0; fn < 3; fn++) {
int sindex;
cs[fn][0] = FOLDS3_FOLD(index)[fn];
ncs[fn] = 1;
sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
if (sindex >= 0) {
int m = FOLDS1_UNFOLDS_NUM(sindex);
for (i = 0; i < m; i++) {
cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
}
ncs[fn] += m;
}
}
/* Emit every combination except the one identical to the input. */
for (i = 0; i < ncs[0]; i++) {
for (j = 0; j < ncs[1]; j++) {
for (k = 0; k < ncs[2]; k++) {
if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1] &&
cs[2][k] == orig_codes[2])
continue;
items[n].byte_len = lens[2];
items[n].code_len = 3;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
items[n].code[2] = cs[2][k];
n++;
}
}
}
return n;
}
}
/* No 3-code hit (or no third character): try the 2-code fold key. */
index = onigenc_unicode_fold2_key(codes);
if (index >= 0) {
m = FOLDS2_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
items[n].byte_len = lens[1];
items[n].code_len = 1;
items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
n++;
}
for (fn = 0; fn < 2; fn++) {
int sindex;
cs[fn][0] = FOLDS2_FOLD(index)[fn];
ncs[fn] = 1;
sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
if (sindex >= 0) {
int m = FOLDS1_UNFOLDS_NUM(sindex);
for (i = 0; i < m; i++) {
cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
}
ncs[fn] += m;
}
}
/* Emit every pair except the one identical to the input. */
for (i = 0; i < ncs[0]; i++) {
for (j = 0; j < ncs[1]; j++) {
if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1])
continue;
items[n].byte_len = lens[1];
items[n].code_len = 2;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
n++;
}
}
return n;
}
}
/* Single-character handling: only the first input character is used. */
fold1:
if (buk1 != 0) {
if (buk1->fold_len == 1) {
/* The character has a single-code fold: emit the fold itself, then the
   fold's other unfolds (ASCII-only mode filters non-ASCII results). */
int un;
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) {
items[0].byte_len = lens[0];
items[0].code_len = 1;
items[0].code[0] = *FOLDS1_FOLD(buk1->index);
n++;
}
un = FOLDS1_UNFOLDS_NUM(buk1->index);
for (i = 0; i < un; i++) {
OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];
if (unfold != orig_codes[0]) {
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
ONIGENC_IS_ASCII_CODE(unfold)) {
items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = unfold;
n++;
}
}
}
}
else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
/* The character folds to several code points; expand only in
   multi-char mode. */
if (buk1->fold_len == 2) {
/* Sibling single code points sharing the same 2-code fold. */
m = FOLDS2_UNFOLDS_NUM(buk1->index);
for (i = 0; i < m; i++) {
OnigCodePoint unfold = FOLDS2_UNFOLDS(buk1->index)[i];
if (unfold == orig_codes[0]) continue;
items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = unfold;
n++;
}
for (fn = 0; fn < 2; fn++) {
int index;
cs[fn][0] = FOLDS2_FOLD(buk1->index)[fn];
ncs[fn] = 1;
index = onigenc_unicode_fold1_key(&cs[fn][0]);
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
}
ncs[fn] += m;
}
}
/* All 2-code spellings of the fold (nothing is skipped here). */
for (i = 0; i < ncs[0]; i++) {
for (j = 0; j < ncs[1]; j++) {
items[n].byte_len = lens[0];
items[n].code_len = 2;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
n++;
}
}
}
else {
/* fold_len is neither 1 nor 2 here; handled as a 3-code fold. */
m = FOLDS3_UNFOLDS_NUM(buk1->index);
for (i = 0; i < m; i++) {
OnigCodePoint unfold = FOLDS3_UNFOLDS(buk1->index)[i];
if (unfold == orig_codes[0]) continue;
items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = unfold;
n++;
}
for (fn = 0; fn < 3; fn++) {
int index;
cs[fn][0] = FOLDS3_FOLD(buk1->index)[fn];
ncs[fn] = 1;
index = onigenc_unicode_fold1_key(&cs[fn][0]);
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
}
ncs[fn] += m;
}
}
/* All 3-code spellings of the fold (nothing is skipped here). */
for (i = 0; i < ncs[0]; i++) {
for (j = 0; j < ncs[1]; j++) {
for (k = 0; k < ncs[2]; k++) {
items[n].byte_len = lens[0];
items[n].code_len = 3;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
items[n].code[2] = cs[2][k];
n++;
}
}
}
}
}
}
else {
/* The character has no unfold-key entry: look it up as a fold key and
   list its unfolds. */
int index = onigenc_unicode_fold1_key(orig_codes);
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
code = FOLDS1_UNFOLDS(index)[i];
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) {
items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = code;
n++;
}
}
}
}
return n;
}
#ifdef USE_UNICODE_PROPERTIES
#include "unicode_property_data.c"
#else
#include "unicode_property_data_posix.c"
#endif
#ifdef USE_UNICODE_WORD_BREAK
/*
 * Word-break classes returned by wb_get_type().
 * WB_Any (0) is the fallback for code points not covered by WB_RANGES.
 * The numeric values are consecutive from 0.
 */
enum WB_TYPE {
  WB_Any                = 0,
  WB_ALetter            = 1,
  WB_CR                 = 2,
  WB_Double_Quote       = 3,
  WB_Extend             = 4,
  WB_ExtendNumLet       = 5,
  WB_Format             = 6,
  WB_Hebrew_Letter      = 7,
  WB_Katakana           = 8,
  WB_LF                 = 9,
  WB_MidLetter          = 10,
  WB_MidNum             = 11,
  WB_MidNumLet          = 12,
  WB_Newline            = 13,
  WB_Numeric            = 14,
  WB_Regional_Indicator = 15,
  WB_Single_Quote       = 16,
  WB_WSegSpace          = 17,
  WB_ZWJ                = 18,
};
/*
 * One row of the WB_RANGES table: the code points start..end (both
 * inclusive, as tested by wb_get_type()) share word-break class `type`.
 */
typedef struct {
  OnigCodePoint start, end;
  enum WB_TYPE  type;
} WB_RANGE_TYPE;
#include "unicode_wb_data.c"
/*
 * Return the word-break class of `code`.
 * Binary-searches WB_RANGES for the first range whose end is >= code; the
 * class applies only if that range also starts at or before code.
 * Code points outside every range are WB_Any.
 */
static enum WB_TYPE
wb_get_type(OnigCodePoint code)
{
  OnigCodePoint lo = 0;
  OnigCodePoint hi = (OnigCodePoint )WB_RANGE_NUM;

  while (lo < hi) {
    OnigCodePoint mid = (lo + hi) >> 1;

    if (code > WB_RANGES[mid].end)
      lo = mid + 1;
    else
      hi = mid;
  }

  if (lo >= (OnigCodePoint )WB_RANGE_NUM)
    return WB_Any;
  if (code < WB_RANGES[lo].start)
    return WB_Any;

  return WB_RANGES[lo].type;
}
/* Classes skipped when looking for the neighbouring "main" character:
   Extend, Format and ZWJ.  Note: `t` is evaluated more than once. */
#define IS_WB_IGNORE_TAIL(t) \
  ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
/* ALetter or Hebrew_Letter. */
#define IS_WB_AHLetter(t) \
  ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
/* MidNumLet or Single_Quote. */
#define IS_WB_MidNumLetQ(t) \
  ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
/*
 * Starting from the character AFTER `p`, find the first character whose
 * word-break class is not Extend/Format/ZWJ.
 *
 * On success stores its code point in *rcode and its class in *rtype and
 * returns 1.  Returns 0 (outputs untouched) if `end` is reached first.
 */
static int
wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
                      OnigCodePoint* rcode, enum WB_TYPE* rtype)
{
  for (p += enclen(enc, p); p < end; p += enclen(enc, p)) {
    OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, p, end);
    enum WB_TYPE  t = wb_get_type(c);

    if (IS_WB_IGNORE_TAIL(t))
      continue;

    *rcode = c;
    *rtype = t;
    return 1;
  }

  return 0;
}
/*
 * Decide whether there is a word boundary between the character at `prev`
 * and the character at `p`.
 *
 *   p      position being tested
 *   prev   head of the character before `p`, or NULL to have it computed
 *   start  start of the text (limit for backward scans)
 *   end    end of the text (limit for forward scans / decoding)
 *
 * Returns TRUE for a boundary, FALSE otherwise.  The checks are applied in
 * order and the first one that matches decides.  The rule tags in the
 * comments below (WB3, WB4, ...) refer to the word-boundary rules of
 * Unicode UAX #29, which the WB999 label suggests this function follows.
 *
 * Note: several look-behind loops move `prev` backwards, and later checks
 * continue from wherever an earlier loop left it.
 */
extern int
onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
const UChar* start, const UChar* end)
{
int r;
UChar* pp;
OnigCodePoint cfrom;
OnigCodePoint cfrom2;
OnigCodePoint cto;
OnigCodePoint cto2;
enum WB_TYPE from;
enum WB_TYPE from2;
enum WB_TYPE to;
enum WB_TYPE to2;
/* Start and end of text are always boundaries (cf. WB1, WB2). */
if (p == start) return TRUE;
if (p == end) return TRUE;
if (IS_NULL(prev)) {
prev = onigenc_get_prev_char_head(enc, start, p);
if (IS_NULL(prev)) return TRUE;
}
cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
cto = ONIGENC_MBC_TO_CODE(enc, p, end);
from = wb_get_type(cfrom);
to = wb_get_type(cto);
/* Both sides are WB_Any (0): no rule can apply, so break. */
if (from == 0 && to == 0) goto WB999;
/* CR x LF (cf. WB3). */
if (from == WB_CR && to == WB_LF) return FALSE;
/* Otherwise break before and after newlines (cf. WB3a, WB3b). */
if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;
/* ZWJ x Extended_Pictographic (cf. WB3c). */
if (from == WB_ZWJ) {
if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
return FALSE;
}
/* WSegSpace x WSegSpace (cf. WB3d). */
if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;
/* Never break before Extend/Format/ZWJ (cf. WB4). */
if (IS_WB_IGNORE_TAIL(to)) return FALSE;
/* If the left side is Extend/Format/ZWJ, walk back to the character it
   attaches to and use that as `from`.  If the scan reaches `start`
   first, `from` keeps the last ignorable class seen. */
if (IS_WB_IGNORE_TAIL(from)) {
while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
prev = pp;
cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
from = wb_get_type(cfrom);
if (! IS_WB_IGNORE_TAIL(from))
break;
}
}
if (IS_WB_AHLetter(from)) {
/* Letter x Letter (cf. WB5). */
if (IS_WB_AHLetter(to)) return FALSE;
/* Letter x (MidLetter|MidNumLetQ) Letter -- look ahead (cf. WB6). */
if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
if (r == 1) {
if (IS_WB_AHLetter(to2)) return FALSE;
}
}
}
/* Letter (MidLetter|MidNumLetQ) x Letter -- look behind (cf. WB7). */
if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
if (IS_WB_AHLetter(to)) {
from2 = WB_Any;
while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
prev = pp;
cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
from2 = wb_get_type(cfrom2);
if (! IS_WB_IGNORE_TAIL(from2))
break;
}
if (IS_WB_AHLetter(from2)) return FALSE;
}
}
if (from == WB_Hebrew_Letter) {
/* Hebrew_Letter x Single_Quote (cf. WB7a). */
if (to == WB_Single_Quote) return FALSE;
/* Hebrew_Letter x Double_Quote Hebrew_Letter -- look ahead (cf. WB7b). */
if (to == WB_Double_Quote) {
r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
if (r == 1) {
if (to2 == WB_Hebrew_Letter) return FALSE;
}
}
}
/* Hebrew_Letter Double_Quote x Hebrew_Letter -- look behind (cf. WB7c). */
if (from == WB_Double_Quote) {
if (to == WB_Hebrew_Letter) {
from2 = WB_Any;
while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
prev = pp;
cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
from2 = wb_get_type(cfrom2);
if (! IS_WB_IGNORE_TAIL(from2))
break;
}
if (from2 == WB_Hebrew_Letter) return FALSE;
}
}
if (to == WB_Numeric) {
/* Numeric x Numeric (cf. WB8). */
if (from == WB_Numeric) return FALSE;
/* Letter x Numeric (cf. WB9). */
if (IS_WB_AHLetter(from)) return FALSE;
/* Numeric (MidNum|MidNumLetQ) x Numeric -- look behind (cf. WB11). */
if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
from2 = WB_Any;
while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
prev = pp;
cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
from2 = wb_get_type(cfrom2);
if (! IS_WB_IGNORE_TAIL(from2))
break;
}
if (from2 == WB_Numeric) return FALSE;
}
}
if (from == WB_Numeric) {
/* Numeric x Letter (cf. WB10). */
if (IS_WB_AHLetter(to)) return FALSE;
/* Numeric x (MidNum|MidNumLetQ) Numeric -- look ahead (cf. WB12). */
if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
if (r == 1) {
if (to2 == WB_Numeric) return FALSE;
}
}
}
/* Katakana x Katakana (cf. WB13). */
if (from == WB_Katakana && to == WB_Katakana) return FALSE;
/* (Letter|Numeric|Katakana|ExtendNumLet) x ExtendNumLet (cf. WB13a). */
if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
|| from == WB_ExtendNumLet) {
if (to == WB_ExtendNumLet) return FALSE;
}
/* ExtendNumLet x (Letter|Numeric|Katakana) (cf. WB13b). */
if (from == WB_ExtendNumLet) {
if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
return FALSE;
}
/* Regional indicators pair up: count the RI characters before `prev`;
   an even count means `prev` and `p` form a pair (cf. WB15, WB16). */
if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
int n = 0;
while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
from2 = wb_get_type(cfrom2);
if (from2 != WB_Regional_Indicator)
break;
n++;
}
if ((n % 2) == 0) return FALSE;
}
/* Default: break everywhere else. */
WB999:
return TRUE;
}
#endif
#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
/*
 * Verdict of unicode_egcb_is_break_2code() for a pair of adjacent code
 * points.  The two UNDEF values mean the pair alone is not enough and the
 * caller must inspect preceding text.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK = 0,      /* never a boundary */
  EGCB_BREAK,              /* 1: always a boundary */
  EGCB_BREAK_UNDEF_GB11,   /* 2: ZWJ followed by Extended_Pictographic */
  EGCB_BREAK_UNDEF_RI_RI   /* 3: two regional indicators */
};
/*
 * Grapheme-cluster-break classes returned by egcb_get_type().
 * EGCB_Other (0) is the fallback for code points not covered by EGCB_RANGES.
 *
 * IS_CONTROL_CR_LF() relies on CR..Control being the contiguous range 1..3,
 * and IS_HANGUL() on every Hangul class being >= EGCB_L, so the numeric
 * values must not be reordered.
 */
enum EGCB_TYPE {
  EGCB_Other              = 0,
  EGCB_CR                 = 1,
  EGCB_LF                 = 2,
  EGCB_Control            = 3,
  EGCB_Extend             = 4,
  EGCB_Prepend            = 5,
  EGCB_Regional_Indicator = 6,
  EGCB_SpacingMark        = 7,
  EGCB_ZWJ                = 8,
  /* Values 9..12 are not assigned.  A malformed "#if 0#endif" directive
     used to sit here (NOTE(review): presumably the remnant of a disabled
     group of entries); it opened a conditional that was never closed and
     has been removed.  Keep the numbering below unchanged. */
  EGCB_L                  = 13,
  EGCB_LV                 = 14,
  EGCB_LVT                = 15,
  EGCB_T                  = 16,
  EGCB_V                  = 17
};
/*
 * One row of the EGCB_RANGES table: the code points start..end (both
 * inclusive, as tested by egcb_get_type()) share break class `type`.
 */
typedef struct {
  OnigCodePoint  start, end;
  enum EGCB_TYPE type;
} EGCB_RANGE_TYPE;
#include "unicode_egcb_data.c"
/*
 * Return the grapheme-cluster-break class of `code`.
 * Binary-searches EGCB_RANGES for the first range whose end is >= code; the
 * class applies only if that range also starts at or before code.
 * Code points outside every range are EGCB_Other.
 */
static enum EGCB_TYPE
egcb_get_type(OnigCodePoint code)
{
  OnigCodePoint lo = 0;
  OnigCodePoint hi = (OnigCodePoint )EGCB_RANGE_NUM;

  while (lo < hi) {
    OnigCodePoint mid = (lo + hi) >> 1;

    if (code > EGCB_RANGES[mid].end)
      lo = mid + 1;
    else
      hi = mid;
  }

  if (lo >= (OnigCodePoint )EGCB_RANGE_NUM)
    return EGCB_Other;
  if (code < EGCB_RANGES[lo].start)
    return EGCB_Other;

  return EGCB_RANGES[lo].type;
}
/* True for the classes CR, LF and Control (enum values EGCB_CR..EGCB_Control).
   Despite the parameter name, the argument is an EGCB_TYPE class value. */
#define IS_CONTROL_CR_LF(code) \
  ((code) >= EGCB_CR && (code) <= EGCB_Control)
/* True for the Hangul classes, which are all the values from EGCB_L upward. */
#define IS_HANGUL(code) \
  ((code) >= EGCB_L)
/*
 * Classify the position between two adjacent code points for extended
 * grapheme cluster segmentation, looking only at the pair itself.
 *
 * Returns EGCB_NOT_BREAK / EGCB_BREAK when the pair decides the matter, or
 * one of the EGCB_BREAK_UNDEF_* values when the caller has to examine the
 * text before `from_code`.  Rule tags (GB3, ...) refer to Unicode UAX #29.
 */
static enum EGCB_BREAK_TYPE
unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
{
  enum EGCB_TYPE lhs = egcb_get_type(from_code);
  enum EGCB_TYPE rhs = egcb_get_type(to_code);

  /* Fast path: neither side has a special class. */
  if (lhs == EGCB_Other && rhs == EGCB_Other)
    return EGCB_BREAK;

  /* CR x LF (cf. GB3); otherwise break around controls (cf. GB4, GB5). */
  if (lhs == EGCB_CR && rhs == EGCB_LF)
    return EGCB_NOT_BREAK;
  if (IS_CONTROL_CR_LF(lhs) || IS_CONTROL_CR_LF(rhs))
    return EGCB_BREAK;

  /* Hangul syllable sequences (cf. GB6, GB7, GB8). */
  if (IS_HANGUL(lhs) && IS_HANGUL(rhs)) {
    switch (lhs) {
    case EGCB_L:
      if (rhs != EGCB_T) return EGCB_NOT_BREAK;
      break;
    case EGCB_LV:
    case EGCB_V:
      if (rhs == EGCB_V || rhs == EGCB_T) return EGCB_NOT_BREAK;
      break;
    case EGCB_LVT:
    case EGCB_T:
      if (rhs == EGCB_T) return EGCB_NOT_BREAK;
      break;
    default:
      break;
    }
    return EGCB_BREAK;
  }

  /* No break before Extend, ZWJ or SpacingMark (cf. GB9, GB9a). */
  if (rhs == EGCB_Extend || rhs == EGCB_ZWJ || rhs == EGCB_SpacingMark)
    return EGCB_NOT_BREAK;

  /* No break after Prepend (cf. GB9b). */
  if (lhs == EGCB_Prepend)
    return EGCB_NOT_BREAK;

  /* ZWJ x Extended_Pictographic depends on what precedes the ZWJ (cf. GB11). */
  if (lhs == EGCB_ZWJ) {
    return onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC)
             ? EGCB_BREAK_UNDEF_GB11 : EGCB_BREAK;
  }

  /* RI x RI depends on how many RIs precede the pair (cf. GB12, GB13). */
  if (lhs == EGCB_Regional_Indicator && rhs == EGCB_Regional_Indicator)
    return EGCB_BREAK_UNDEF_RI_RI;

  return EGCB_BREAK;
}
#endif
/*
 * Decide whether there is an extended grapheme cluster boundary between
 * the character at `prev` and the character at `p`.
 *
 *   p      position being tested
 *   prev   head of the character before `p`, or NULL to have it computed
 *   start  start of the text (limit for backward scans)
 *   end    end of the text
 *
 * Returns 1 for a boundary, 0 otherwise.  For non-Unicode encodings, and
 * when USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER is not defined, the only
 * non-boundary is between 0x000d and NEWLINE_CODE.
 */
extern int
onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
                               const UChar* start, const UChar* end)
{
  OnigCodePoint from;
  OnigCodePoint to;
#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
  enum EGCB_BREAK_TYPE btype;
  enum EGCB_TYPE type;
#endif

  /* Text start and text end are always boundaries. */
  if (p == start || p == end) return 1;

  if (IS_NULL(prev)) {
    prev = onigenc_get_prev_char_head(enc, start, p);
    if (IS_NULL(prev)) return 1;
  }

  from = ONIGENC_MBC_TO_CODE(enc, prev, end);
  to   = ONIGENC_MBC_TO_CODE(enc, p, end);

#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
  if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
    return (from == 0x000d && to == NEWLINE_CODE) ? 0 : 1;
  }

  btype = unicode_egcb_is_break_2code(from, to);
  if (btype == EGCB_NOT_BREAK) return 0;
  if (btype == EGCB_BREAK)     return 1;

  if (btype == EGCB_BREAK_UNDEF_GB11) {
    /* ZWJ x ExtPict: no break if, scanning back over Extend characters,
       an Extended_Pictographic character is found. */
    for (;;) {
      prev = onigenc_get_prev_char_head(enc, start, prev);
      if (prev == NULL) break;

      from = ONIGENC_MBC_TO_CODE(enc, prev, end);
      if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
        return 0;

      type = egcb_get_type(from);
      if (type != EGCB_Extend) break;
    }
  }
  else if (btype == EGCB_BREAK_UNDEF_RI_RI) {
    /* RI x RI: no break when an even number of RIs precede the pair. */
    int ri_count = 0;

    for (;;) {
      prev = onigenc_get_prev_char_head(enc, start, prev);
      if (prev == NULL) break;

      from = ONIGENC_MBC_TO_CODE(enc, prev, end);
      type = egcb_get_type(from);
      if (type != EGCB_Regional_Indicator) break;
      ri_count++;
    }
    if ((ri_count % 2) == 0) return 0;
  }

  return 1;
#else
  return (from == 0x000d && to == NEWLINE_CODE) ? 0 : 1;
#endif
}
/* Maximum number of properties onig_unicode_define_user_property() accepts. */
#define USER_DEFINED_PROPERTY_MAX_NUM  20

/* One registered user property: its assigned ctype and the range data
   pointer supplied by the caller (stored as given, not copied). */
typedef struct {
  int            ctype;
  OnigCodePoint* ranges;
} UserDefinedPropertyValue;

/* Number of entries of UserDefinedPropertyRanges currently in use. */
static int UserDefinedPropertyNum = 0;

/* Storage for registered properties, indexed by (ctype - CODE_RANGES_NUM). */
static UserDefinedPropertyValue
  UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];

/* Normalized name -> UserDefinedPropertyValue*; created on first use. */
static st_table* UserDefinedPropertyTable = 0;
/*
 * Register a user-defined character property.
 *
 *   name    NUL-terminated property name.  Every byte must be in
 *           0x20..0x7f; ' ', '-' and '_' are dropped before the name is
 *           stored (letter case is kept as given).
 *   ranges  code range data for the property.  The pointer is stored as
 *           is, not copied, so it must remain valid afterwards.
 *
 * Returns 0 on success, otherwise:
 *   ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS  all slots are in use
 *   ONIGERR_TOO_LONG_PROPERTY_NAME         strlen(name) >= PROPERTY_NAME_MAX_SIZE
 *   ONIGERR_INVALID_CHAR_PROPERTY_NAME     name contains a byte outside 0x20..0x7f
 *   ONIGERR_MEMORY                         allocation failed
 *   or the negative value returned by onig_st_insert_strend().
 */
extern int
onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
{
  UserDefinedPropertyValue* e;
  int r;
  int i;
  int n;
  int len;
  int c;
  char* s;
  UChar* uname;

  if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
    return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;

  len = (int )strlen(name);
  if (len >= PROPERTY_NAME_MAX_SIZE)
    return ONIGERR_TOO_LONG_PROPERTY_NAME;

  /* Heap copy of the normalized name; the table keeps referring to it. */
  s = (char* )xmalloc(len + 1);
  if (s == 0)
    return ONIGERR_MEMORY;

  uname = (UChar* )name;
  n = 0;
  for (i = 0; i < len; i++) {
    c = uname[i];
    if (c < 0x20 || c >= 0x80) {
      xfree(s);
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
    }

    if (c != ' ' && c != '-' && c != '_') {
      s[n] = c;
      n++;
    }
  }
  s[n] = '\0';

  if (UserDefinedPropertyTable == 0) {
    UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
    if (IS_NULL(UserDefinedPropertyTable)) {
      xfree(s);
      return ONIGERR_MEMORY;
    }
  }

  /* The slot is filled in before insertion but only becomes live once
     UserDefinedPropertyNum is incremented below. */
  e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
  e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
  e->ranges = ranges;
  r = onig_st_insert_strend(UserDefinedPropertyTable,
                            (const UChar* )s, (const UChar* )s + n,
                            (hash_data_type )((void* )e));
  if (r < 0) {
    /* Insertion failed, so nothing refers to the name copy: release it
       instead of leaking it. */
    xfree(s);
    return r;
  }

  UserDefinedPropertyNum++;
  return 0;
}
/*
 * Test whether `code` belongs to character type `ctype`.
 *
 * Code points below 256 are answered from the ISO-8859-1 bit table (with
 * USE_UNICODE_PROPERTIES, only for ctype <= ONIGENC_MAX_STD_CTYPE).
 * Otherwise the built-in CodeRanges entry is searched, or, for
 * ctype >= CODE_RANGES_NUM, the matching user-defined property.
 *
 * Returns the membership result, or ONIGERR_TYPE_BUG when `ctype` names a
 * user-defined slot that has not been registered.
 *
 * NOTE(review): without USE_UNICODE_PROPERTIES the table path is taken for
 * any ctype when code < 256, including user-defined ones -- confirm that
 * CTYPE_TO_BIT() is safe for such values.
 */
extern int
onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
{
  int user_index;

#ifdef USE_UNICODE_PROPERTIES
  if (code < 256 && ctype <= ONIGENC_MAX_STD_CTYPE)
#else
  if (code < 256)
#endif
  {
    return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
  }

  if (ctype < CODE_RANGES_NUM)
    return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);

  user_index = ctype - CODE_RANGES_NUM;
  if (user_index >= UserDefinedPropertyNum)
    return ONIGERR_TYPE_BUG;

  return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[user_index].ranges, code);
}
/*
 * Fetch the code range data for `ctype` into *ranges.
 * Built-in types come from CodeRanges; ctype >= CODE_RANGES_NUM selects a
 * user-defined property.
 *
 * Returns 0 on success, or ONIGERR_TYPE_BUG (leaving *ranges untouched)
 * when `ctype` names a user-defined slot that has not been registered.
 */
extern int
onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
{
  int user_index;

  if (ctype < CODE_RANGES_NUM) {
    *ranges = CodeRanges[ctype];
    return 0;
  }

  user_index = ctype - CODE_RANGES_NUM;
  if (user_index >= UserDefinedPropertyNum)
    return ONIGERR_TYPE_BUG;

  *ranges = UserDefinedPropertyRanges[user_index].ranges;
  return 0;
}
/*
 * Code-range lookup used by the UTF-16/32 encodings: always sets *sb_out
 * to 0x00, then delegates to onigenc_unicode_ctype_code_range() and
 * returns its result.
 */
extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
                                      const OnigCodePoint* ranges[])
{
  int r;

  *sb_out = 0x00;
  r = onigenc_unicode_ctype_code_range(ctype, ranges);
  return r;
}
/*
 * Resolve a property name in [name, end) to its ctype.
 *
 * The name is decoded with `enc`; ' ', '-' and '_' are ignored and any
 * code point >= 0x80 is rejected.  User-defined properties are consulted
 * first, then the built-in name table.
 *
 * Returns the ctype, or ONIGERR_INVALID_CHAR_PROPERTY_NAME when the name
 * is invalid, too long for the internal buffer, or unknown (or, without
 * USE_UNICODE_PROPERTIES, maps to a ctype above ONIGENC_MAX_STD_CTYPE).
 */
extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
{
  char buf[PROPERTY_NAME_MAX_SIZE];
  const struct PoolPropertyNameCtype* entry;
  UChar* q;
  int n = 0;

  /* Build the normalized ASCII name in buf. */
  for (q = name; q < end; q += enclen(enc, q)) {
    OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, q, end);

    if (c >= 0x80)
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;

    if (c == ' ' || c == '-' || c == '_')
      continue;

    buf[n++] = (char )c;
    /* Keep one byte free for the terminator. */
    if (n >= PROPERTY_NAME_MAX_SIZE)
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
  }
  buf[n] = 0;

  /* User-defined properties take precedence over built-in ones. */
  if (UserDefinedPropertyTable != 0) {
    UserDefinedPropertyValue* user = (UserDefinedPropertyValue* )NULL;

    onig_st_lookup_strend(UserDefinedPropertyTable,
                          (const UChar* )buf, (const UChar* )buf + n,
                          (hash_data_type* )((void* )(&user)));
    if (user != 0)
      return user->ctype;
  }

  entry = unicode_lookup_property_name(buf, n);
  if (entry == 0)
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;

#ifndef USE_UNICODE_PROPERTIES
  if (entry->ctype > ONIGENC_MAX_STD_CTYPE)
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
#endif

  return (int )entry->ctype;
}