pub mod egcb_data;
mod fold_data;
mod property_data;
pub mod wb_data;
use crate::oniguruma::*;
use crate::regenc::*;
use egcb_data::{EgcbType, EGCB_RANGES};
use fold_data::*;
use property_data::{CODE_RANGES, CODE_RANGES_NUM, PROPERTY_NAMES};
use wb_data::{WbType, WB_RANGES};
/// Character-type bitmask table for code points `0..=255`.
///
/// Entry `n` is a bit set describing code point `n`. `onigenc_unicode_is_code_ctype`
/// tests an entry against `ctype_to_bit(ctype)` for standard ctypes whenever
/// `code < 256`, so lookups in that range never touch the range tables.
// NOTE(review): the name indicates the values describe the ISO-8859-1 (Latin-1)
// block of Unicode; the meaning of each individual bit is defined by `ctype_to_bit`,
// which lives outside this file.
pub static ENC_UNICODE_ISO_8859_1_CTYPE_TABLE: [u16; 256] = [
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x428c, 0x4289, 0x4288,
0x4288, 0x4288, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4284, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x41a0, 0x41a0,
0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0288, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
0x0008, 0x0008, 0x0008, 0x0008, 0x0284, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x01a0,
0x01a0, 0x01a0, 0x30e2, 0x01a0, 0x01a0, 0x00a8, 0x01a0, 0x01a0, 0x01a0, 0x01a0, 0x10a0, 0x10a0,
0x01a0, 0x30e2, 0x01a0, 0x01a0, 0x01a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x01a0,
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x01a0, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
0x30e2, 0x30e2, 0x30e2, 0x30e2,
];
/// Looks `code` up in the sorted `UNFOLD_KEY` table.
///
/// Returns `(index, fold_len)` taken from the matching entry, or `None` when
/// `code` has no entry. Callers use `fold_len` to pick the `UNICODE_FOLDS{1,2,3}`
/// table that `index` refers to.
fn unfold_key(code: OnigCodePoint) -> Option<(usize, usize)> {
    let pos = UNFOLD_KEY
        .binary_search_by(|entry| entry.0.cmp(&code))
        .ok()?;
    let (_, index, fold_len) = UNFOLD_KEY[pos];
    Some((index as usize, fold_len as usize))
}
/// Looks `code` up in the sorted `FOLD1_KEY` table and returns the index stored
/// with it (used by callers as a record index into `UNICODE_FOLDS1`), or `None`
/// when `code` is not a key.
fn fold1_key(code: OnigCodePoint) -> Option<usize> {
    match FOLD1_KEY.binary_search_by(|entry| entry.0.cmp(&code)) {
        Ok(pos) => Some(FOLD1_KEY[pos].1 as usize),
        Err(_) => None,
    }
}
/// Looks the first two code points of `codes` up in the sorted `FOLD2_KEY` table
/// and returns the index stored with them (used by callers as a record index
/// into `UNICODE_FOLDS2`), or `None` when the pair is not a key.
///
/// # Panics
/// Panics if `codes` has fewer than two elements.
fn fold2_key(codes: &[OnigCodePoint]) -> Option<usize> {
    let wanted = [codes[0], codes[1]];
    let pos = FOLD2_KEY
        .binary_search_by(|entry| entry.0.cmp(&wanted))
        .ok()?;
    Some(FOLD2_KEY[pos].1 as usize)
}
/// Looks the first three code points of `codes` up in the sorted `FOLD3_KEY`
/// table and returns the index stored with them (used by callers as a record
/// index into `UNICODE_FOLDS3`), or `None` when the triple is not a key.
///
/// # Panics
/// Panics if `codes` has fewer than three elements.
fn fold3_key(codes: &[OnigCodePoint]) -> Option<usize> {
    let wanted = [codes[0], codes[1], codes[2]];
    let pos = FOLD3_KEY
        .binary_search_by(|entry| entry.0.cmp(&wanted))
        .ok()?;
    Some(FOLD3_KEY[pos].1 as usize)
}
/// Returns the fold code point of the `UNICODE_FOLDS1` record starting at `i`
/// (the first slot of the record).
#[inline]
fn folds1_fold(i: usize) -> OnigCodePoint {
    let fold_slot = i;
    UNICODE_FOLDS1[fold_slot]
}
/// Returns how many unfold code points the `UNICODE_FOLDS1` record starting at
/// `i` carries (stored in the slot right after the fold code point).
#[inline]
fn folds1_unfolds_num(i: usize) -> usize {
    let count_slot = i + 1;
    UNICODE_FOLDS1[count_slot] as usize
}
/// Returns the unfold code points of the `UNICODE_FOLDS1` record starting at
/// `i`: the slots that follow the fold code point and the count.
#[inline]
fn folds1_unfolds(i: usize) -> &'static [u32] {
    let first = i + 2;
    let past_last = first + folds1_unfolds_num(i);
    &UNICODE_FOLDS1[first..past_last]
}
/// Returns the index of the `UNICODE_FOLDS1` record that follows the one
/// starting at `i` (skips the fold, the count and all unfolds).
#[inline]
fn folds1_next(i: usize) -> usize {
    let header_len = 2;
    i + header_len + folds1_unfolds_num(i)
}
/// Returns the two-code-point fold sequence of the `UNICODE_FOLDS2` record
/// starting at `i`.
#[inline]
fn folds2_fold(i: usize) -> &'static [u32] {
    &UNICODE_FOLDS2[i..][..2]
}
/// Returns how many unfold code points the `UNICODE_FOLDS2` record starting at
/// `i` carries (stored right after the two fold code points).
#[inline]
fn folds2_unfolds_num(i: usize) -> usize {
    let count_slot = i + 2;
    UNICODE_FOLDS2[count_slot] as usize
}
/// Returns the unfold code points of the `UNICODE_FOLDS2` record starting at
/// `i`: the slots that follow the two fold code points and the count.
#[inline]
fn folds2_unfolds(i: usize) -> &'static [u32] {
    let first = i + 3;
    let past_last = first + folds2_unfolds_num(i);
    &UNICODE_FOLDS2[first..past_last]
}
/// Returns the index of the `UNICODE_FOLDS2` record that follows the one
/// starting at `i` (skips the two folds, the count and all unfolds).
#[inline]
fn folds2_next(i: usize) -> usize {
    let header_len = 3;
    i + header_len + folds2_unfolds_num(i)
}
/// Returns the three-code-point fold sequence of the `UNICODE_FOLDS3` record
/// starting at `i`.
#[inline]
fn folds3_fold(i: usize) -> &'static [u32] {
    &UNICODE_FOLDS3[i..][..3]
}
/// Returns how many unfold code points the `UNICODE_FOLDS3` record starting at
/// `i` carries (stored right after the three fold code points).
#[inline]
fn folds3_unfolds_num(i: usize) -> usize {
    let count_slot = i + 3;
    UNICODE_FOLDS3[count_slot] as usize
}
/// Returns the unfold code points of the `UNICODE_FOLDS3` record starting at
/// `i`: the slots that follow the three fold code points and the count.
#[inline]
fn folds3_unfolds(i: usize) -> &'static [u32] {
    let first = i + 4;
    let past_last = first + folds3_unfolds_num(i);
    &UNICODE_FOLDS3[first..past_last]
}
/// Returns the index of the `UNICODE_FOLDS3` record that follows the one
/// starting at `i` (skips the three folds, the count and all unfolds).
#[inline]
fn folds3_next(i: usize) -> usize {
    let header_len = 4;
    i + header_len + folds3_unfolds_num(i)
}
/// Returns the fold sequence (`fold_len` code points) of the record at `index`
/// in the fold table selected by `fold_len` (1, 2 or 3).
///
/// Any other `fold_len` yields an empty slice.
#[cfg_attr(coverage_nightly, coverage(off))]
fn folds_fold_addr(index: usize, fold_len: usize) -> &'static [u32] {
    let table: &'static [u32] = match fold_len {
        1 => &UNICODE_FOLDS1[..],
        2 => &UNICODE_FOLDS2[..],
        3 => &UNICODE_FOLDS3[..],
        _ => return &[],
    };
    &table[index..index + fold_len]
}
/// Case-folds the character at `data[*pp..]` and writes the encoded result to `fold`.
///
/// * `enc` - encoding used to decode the input character and encode the fold.
/// * `flag` - case-fold options; in ASCII-only mode only ASCII characters whose
///   fold is also ASCII are folded.
/// * `pp` - in: byte offset of the character in `data`; out: advanced past it.
/// * `end` - forwarded unchanged to `enc.mbc_to_code`.
/// * `fold` - output buffer for the folded (or copied) bytes.
///
/// Returns the number of bytes written to `fold`. When no fold applies, the
/// character's original bytes are copied verbatim.
#[cfg_attr(coverage_nightly, coverage(off))]
pub fn onigenc_unicode_mbc_case_fold(
    enc: &dyn Encoding,
    flag: OnigCaseFoldType,
    pp: &mut usize,
    end: usize,
    data: &[u8],
    fold: &mut [u8],
) -> i32 {
    let origin = *pp;
    let code = enc.mbc_to_code(&data[origin..], end);
    let len = enc.mbc_enc_len(&data[origin..]);
    *pp = origin + len;

    if case_fold_is_not_ascii_only(flag) || code < 128 {
        match unfold_key(code) {
            Some((index, 1)) => {
                let folded = folds1_fold(index);
                if case_fold_is_not_ascii_only(flag) || folded < 128 {
                    return enc.code_to_mbc(folded, fold);
                }
                // ASCII-only mode and a non-ASCII fold: fall through to the verbatim copy.
            }
            Some((index, fold_len)) => {
                // Multi-code-point fold: encode each code point back to back.
                let sequence = folds_fold_addr(index, fold_len);
                let mut written = 0i32;
                for &folded in &sequence[..fold_len] {
                    written += enc.code_to_mbc(folded, &mut fold[written as usize..]);
                }
                return written;
            }
            None => {}
        }
    }

    fold[..len].copy_from_slice(&data[origin..origin + len]);
    len as i32
}
/// Reports every single-code-point case-fold pair stored in `UNICODE_FOLDS1`
/// records in the index range `from..to`.
///
/// For each record, `f(a, &[b])` is called in both directions for the fold and
/// each of its unfolds, and for every pair of unfolds of the same fold. In
/// ASCII-only mode the walk stops at the first record whose fold is not ASCII,
/// and non-ASCII unfolds are skipped.
///
/// Returns the first non-zero value returned by `f`, or 0 once all records
/// have been visited.
fn apply_case_fold1(
    flag: OnigCaseFoldType,
    from: usize,
    to: usize,
    f: &mut dyn FnMut(OnigCodePoint, &[OnigCodePoint]) -> i32,
) -> i32 {
    let excluded = |code: OnigCodePoint| case_fold_is_ascii_only(flag) && code >= 128;

    let mut record = from;
    while record < to {
        let fold = folds1_fold(record);
        if excluded(fold) {
            break;
        }
        let unfolds = folds1_unfolds(record);
        for (pos, &unfold) in unfolds.iter().enumerate() {
            if excluded(unfold) {
                continue;
            }
            for (src, dst) in [(fold, unfold), (unfold, fold)] {
                let status = f(src, &[dst]);
                if status != 0 {
                    return status;
                }
            }
            // Unfolds that share a fold are also equivalent to each other.
            for &earlier in &unfolds[..pos] {
                if excluded(earlier) {
                    continue;
                }
                for (src, dst) in [(unfold, earlier), (earlier, unfold)] {
                    let status = f(src, &[dst]);
                    if status != 0 {
                        return status;
                    }
                }
            }
        }
        record = folds1_next(record);
    }
    0
}
/// Reports every case-fold relation stored in `UNICODE_FOLDS2` records in the
/// index range `from..to`.
///
/// For each record, `f(unfold, fold_sequence)` is called for every unfold, and
/// `f(a, &[b])` is called in both directions for every pair of unfolds of the
/// same record.
///
/// Returns the first non-zero value returned by `f`, or 0 once all records
/// have been visited.
fn apply_case_fold2(
    from: usize,
    to: usize,
    f: &mut dyn FnMut(OnigCodePoint, &[OnigCodePoint]) -> i32,
) -> i32 {
    let mut record = from;
    while record < to {
        let fold = folds2_fold(record);
        let unfolds = folds2_unfolds(record);
        for (pos, &unfold) in unfolds.iter().enumerate() {
            let status = f(unfold, fold);
            if status != 0 {
                return status;
            }
            for &earlier in &unfolds[..pos] {
                for (src, dst) in [(unfold, earlier), (earlier, unfold)] {
                    let status = f(src, &[dst]);
                    if status != 0 {
                        return status;
                    }
                }
            }
        }
        record = folds2_next(record);
    }
    0
}
/// Reports every case-fold relation stored in `UNICODE_FOLDS3` records in the
/// index range `from..to`.
///
/// For each record, `f(unfold, fold_sequence)` is called for every unfold, and
/// `f(a, &[b])` is called in both directions for every pair of unfolds of the
/// same record.
///
/// Returns the first non-zero value returned by `f`, or 0 once all records
/// have been visited.
fn apply_case_fold3(
    from: usize,
    to: usize,
    f: &mut dyn FnMut(OnigCodePoint, &[OnigCodePoint]) -> i32,
) -> i32 {
    let mut record = from;
    while record < to {
        let fold = folds3_fold(record);
        let unfolds = folds3_unfolds(record);
        for (pos, &unfold) in unfolds.iter().enumerate() {
            let status = f(unfold, fold);
            if status != 0 {
                return status;
            }
            for &earlier in &unfolds[..pos] {
                for (src, dst) in [(unfold, earlier), (earlier, unfold)] {
                    let status = f(src, &[dst]);
                    if status != 0 {
                        return status;
                    }
                }
            }
        }
        record = folds3_next(record);
    }
    0
}
/// Invokes `f` for every case-fold relation in the fold tables.
///
/// Single-code-point folds are always reported (both the "normal" and the
/// trailing section of `UNICODE_FOLDS1`). Two- and three-code-point folds are
/// reported only when `flag` contains `INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR`.
///
/// Returns the first non-zero value returned by `f`, or 0 when every call
/// returned 0.
pub fn onigenc_unicode_apply_all_case_fold(
    flag: OnigCaseFoldType,
    f: &mut dyn FnMut(OnigCodePoint, &[OnigCodePoint]) -> i32,
) -> i32 {
    let single_sections = [
        (0, FOLDS1_NORMAL_END_INDEX),
        (FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX),
    ];
    for (lo, hi) in single_sections {
        let status = apply_case_fold1(flag, lo, hi, f);
        if status != 0 {
            return status;
        }
    }

    if (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0 {
        return 0;
    }

    let pair_sections = [
        (0, FOLDS2_NORMAL_END_INDEX),
        (FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX),
    ];
    for (lo, hi) in pair_sections {
        let status = apply_case_fold2(lo, hi, f);
        if status != 0 {
            return status;
        }
    }

    // Last stage: its status (zero or not) is the overall result.
    apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f)
}
pub fn onigenc_unicode_get_case_fold_codes_by_str(
enc: &dyn Encoding,
flag: OnigCaseFoldType,
p: &[u8],
_end: usize,
items: &mut [OnigCaseFoldCodeItem],
) -> i32 {
let remaining = p.len(); let mut n = 0usize;
let code = enc.mbc_to_code(p, remaining);
if case_fold_is_ascii_only(flag) && code >= 128 {
return 0;
}
let len0 = enc.mbc_enc_len(p);
let mut orig_codes = [0u32; 3];
let mut codes = [0u32; 3];
let mut lens = [0usize; 3];
orig_codes[0] = code;
lens[0] = len0;
let buk1 = unfold_key(orig_codes[0]);
if let Some((index, fold_len)) = buk1 {
if fold_len == 1 {
codes[0] = folds1_fold(index);
} else {
codes[0] = orig_codes[0];
}
} else {
codes[0] = orig_codes[0];
}
if (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0 {
} else if len0 < remaining {
let p1 = &p[len0..];
let code1 = enc.mbc_to_code(p1, p1.len());
orig_codes[1] = code1;
let len1 = enc.mbc_enc_len(p1);
lens[1] = lens[0] + len1;
if let Some((idx, fl)) = unfold_key(orig_codes[1]) {
if fl == 1 {
codes[1] = folds1_fold(idx);
} else {
codes[1] = orig_codes[1];
}
} else {
codes[1] = orig_codes[1];
}
if lens[1] < remaining {
let p2 = &p[lens[1]..];
let code2 = enc.mbc_to_code(p2, p2.len());
orig_codes[2] = code2;
let len2 = enc.mbc_enc_len(p2);
lens[2] = lens[1] + len2;
if let Some((idx, fl)) = unfold_key(orig_codes[2]) {
if fl == 1 {
codes[2] = folds1_fold(idx);
} else {
codes[2] = orig_codes[2];
}
} else {
codes[2] = orig_codes[2];
}
if let Some(index) = fold3_key(&codes) {
let unfolds = folds3_unfolds(index);
for uf in unfolds {
items[n].byte_len = lens[2] as i32;
items[n].code_len = 1;
items[n].code[0] = *uf;
n += 1;
}
let mut cs = [[0u32; 4]; 3];
let mut ncs = [0usize; 3];
let fold3 = folds3_fold(index);
for fn_idx in 0..3 {
cs[fn_idx][0] = fold3[fn_idx];
ncs[fn_idx] = 1;
if let Some(sidx) = fold1_key(cs[fn_idx][0]) {
let sunfolds = folds1_unfolds(sidx);
for (si, &su) in sunfolds.iter().enumerate() {
cs[fn_idx][si + 1] = su;
}
ncs[fn_idx] += sunfolds.len();
}
}
for i in 0..ncs[0] {
for j in 0..ncs[1] {
for k in 0..ncs[2] {
if cs[0][i] == orig_codes[0]
&& cs[1][j] == orig_codes[1]
&& cs[2][k] == orig_codes[2]
{
continue;
}
items[n].byte_len = lens[2] as i32;
items[n].code_len = 3;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
items[n].code[2] = cs[2][k];
n += 1;
}
}
}
return n as i32;
}
}
if let Some(index) = fold2_key(&codes) {
let unfolds = folds2_unfolds(index);
for uf in unfolds {
items[n].byte_len = lens[1] as i32;
items[n].code_len = 1;
items[n].code[0] = *uf;
n += 1;
}
let mut cs = [[0u32; 4]; 2];
let mut ncs = [0usize; 2];
let fold2 = folds2_fold(index);
for fn_idx in 0..2 {
cs[fn_idx][0] = fold2[fn_idx];
ncs[fn_idx] = 1;
if let Some(sidx) = fold1_key(cs[fn_idx][0]) {
let sunfolds = folds1_unfolds(sidx);
for (si, &su) in sunfolds.iter().enumerate() {
cs[fn_idx][si + 1] = su;
}
ncs[fn_idx] += sunfolds.len();
}
}
for i in 0..ncs[0] {
for j in 0..ncs[1] {
if cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1] {
continue;
}
items[n].byte_len = lens[1] as i32;
items[n].code_len = 2;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
n += 1;
}
}
return n as i32;
}
}
if let Some((buk_index, buk_fold_len)) = buk1 {
if buk_fold_len == 1 {
let fold_code = folds1_fold(buk_index);
if case_fold_is_not_ascii_only(flag) || fold_code < 128 {
items[n].byte_len = lens[0] as i32;
items[n].code_len = 1;
items[n].code[0] = fold_code;
n += 1;
}
let unfolds = folds1_unfolds(buk_index);
for &uf in unfolds {
if uf != orig_codes[0] {
if case_fold_is_not_ascii_only(flag) || uf < 128 {
items[n].byte_len = lens[0] as i32;
items[n].code_len = 1;
items[n].code[0] = uf;
n += 1;
}
}
}
} else if (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0 {
if buk_fold_len == 2 {
let unfolds = folds2_unfolds(buk_index);
for &uf in unfolds {
if uf == orig_codes[0] {
continue;
}
items[n].byte_len = lens[0] as i32;
items[n].code_len = 1;
items[n].code[0] = uf;
n += 1;
}
let mut cs = [[0u32; 4]; 2];
let mut ncs = [0usize; 2];
let fold2 = folds2_fold(buk_index);
for fn_idx in 0..2 {
cs[fn_idx][0] = fold2[fn_idx];
ncs[fn_idx] = 1;
if let Some(sidx) = fold1_key(cs[fn_idx][0]) {
let sunfolds = folds1_unfolds(sidx);
for (si, &su) in sunfolds.iter().enumerate() {
cs[fn_idx][si + 1] = su;
}
ncs[fn_idx] += sunfolds.len();
}
}
for i in 0..ncs[0] {
for j in 0..ncs[1] {
items[n].byte_len = lens[0] as i32;
items[n].code_len = 2;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
n += 1;
}
}
} else if buk_fold_len == 3 {
let unfolds = folds3_unfolds(buk_index);
for &uf in unfolds {
if uf == orig_codes[0] {
continue;
}
items[n].byte_len = lens[0] as i32;
items[n].code_len = 1;
items[n].code[0] = uf;
n += 1;
}
let mut cs = [[0u32; 4]; 3];
let mut ncs = [0usize; 3];
let fold3 = folds3_fold(buk_index);
for fn_idx in 0..3 {
cs[fn_idx][0] = fold3[fn_idx];
ncs[fn_idx] = 1;
if let Some(sidx) = fold1_key(cs[fn_idx][0]) {
let sunfolds = folds1_unfolds(sidx);
for (si, &su) in sunfolds.iter().enumerate() {
cs[fn_idx][si + 1] = su;
}
ncs[fn_idx] += sunfolds.len();
}
}
for i in 0..ncs[0] {
for j in 0..ncs[1] {
for k in 0..ncs[2] {
items[n].byte_len = lens[0] as i32;
items[n].code_len = 3;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
items[n].code[2] = cs[2][k];
n += 1;
}
}
}
}
}
} else {
if let Some(index) = fold1_key(orig_codes[0]) {
let unfolds = folds1_unfolds(index);
for &uf in unfolds {
if case_fold_is_not_ascii_only(flag) || uf < 128 {
items[n].byte_len = lens[0] as i32;
items[n].code_len = 1;
items[n].code[0] = uf;
n += 1;
}
}
}
}
n as i32
}
use std::sync::Mutex;
/// Upper bound on the number of user-defined properties; once reached,
/// `onig_unicode_define_user_property` fails with
/// `ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS`.
const USER_DEFINED_PROPERTY_MAX_NUM: usize = 32;
/// A property registered at runtime through `onig_unicode_define_user_property`.
struct UserProperty {
    /// Normalized property name, as produced by `normalize_property_name`.
    name: Vec<u8>,
    /// Flat list of code point bounds; `onigenc_unicode_is_code_ctype` reads it
    /// as consecutive inclusive `(low, high)` pairs and binary-searches it.
    // NOTE(review): the search assumes the pairs are sorted and non-overlapping;
    // nothing in this file validates that when a property is registered.
    ranges: Vec<OnigCodePoint>,
}
/// Process-wide registry of user-defined properties, in registration order.
/// The entry at position `i` is addressed by ctype `CODE_RANGES_NUM + i`.
static USER_DEFINED_PROPERTIES: Mutex<Vec<UserProperty>> = Mutex::new(Vec::new());
/// Normalizes a property name for registration: drops spaces, hyphens and
/// underscores and lowercases ASCII letters.
///
/// Returns `None` if the name contains a non-ASCII byte or if nothing is left
/// after the separators are removed.
#[cfg_attr(coverage_nightly, coverage(off))]
fn normalize_property_name(name: &[u8]) -> Option<Vec<u8>> {
    let mut normalized = Vec::with_capacity(name.len());
    for &byte in name {
        match byte {
            b' ' | b'-' | b'_' => {}
            0x80..=0xff => return None,
            _ => normalized.push(byte.to_ascii_lowercase()),
        }
    }
    if normalized.is_empty() {
        None
    } else {
        Some(normalized)
    }
}
/// Registers a user-defined property under `name` with the given `ranges`.
///
/// The name is normalized with `normalize_property_name` before it is stored,
/// and `ranges` is copied into the registry.
///
/// # Errors
/// * `ONIGERR_INVALID_CHAR_PROPERTY_NAME` - the name cannot be normalized, or a
///   user-defined property with the same normalized name already exists.
/// * `ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS` - the registry already holds
///   `USER_DEFINED_PROPERTY_MAX_NUM` entries.
///
/// # Panics
/// Panics if the registry mutex is poisoned.
#[cfg_attr(coverage_nightly, coverage(off))]
pub fn onig_unicode_define_user_property(name: &[u8], ranges: &[OnigCodePoint]) -> Result<(), i32> {
    let normalized = match normalize_property_name(name) {
        Some(normalized) => normalized,
        None => return Err(ONIGERR_INVALID_CHAR_PROPERTY_NAME),
    };

    let mut registry = USER_DEFINED_PROPERTIES.lock().unwrap();
    if registry.iter().any(|existing| existing.name == normalized) {
        return Err(ONIGERR_INVALID_CHAR_PROPERTY_NAME);
    }
    if registry.len() >= USER_DEFINED_PROPERTY_MAX_NUM {
        return Err(ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS);
    }

    let ranges = ranges.to_vec();
    registry.push(UserProperty {
        name: normalized,
        ranges,
    });
    Ok(())
}
/// Resolves a property name to its ctype value.
///
/// The name is normalized first: spaces, hyphens and underscores are dropped
/// and ASCII letters are lowercased. Built-in names in `PROPERTY_NAMES` are
/// checked first; if there is no match, user-defined properties are searched
/// and a hit at registry position `i` yields `CODE_RANGES_NUM + i`.
///
/// Returns `ONIGERR_INVALID_CHAR_PROPERTY_NAME` if the name contains a
/// non-ASCII byte, is longer than 128 bytes after normalization, or is unknown.
/// A poisoned registry lock is treated as "no user-defined properties".
pub fn onigenc_unicode_property_name_to_ctype(p: &[u8]) -> i32 {
    let mut normalized = [0u8; 128];
    let mut used = 0usize;
    for &byte in p {
        if matches!(byte, b' ' | b'-' | b'_') {
            continue;
        }
        if !byte.is_ascii() || used >= normalized.len() {
            return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
        }
        normalized[used] = byte.to_ascii_lowercase();
        used += 1;
    }
    let key = &normalized[..used];

    if let Ok(idx) = PROPERTY_NAMES.binary_search_by(|(name, _)| name.as_bytes().cmp(key)) {
        return PROPERTY_NAMES[idx].1 as i32;
    }

    if let Ok(props) = USER_DEFINED_PROPERTIES.lock() {
        if let Some(position) = props.iter().position(|prop| prop.name == key) {
            return (CODE_RANGES_NUM + position) as i32;
        }
    }
    ONIGERR_INVALID_CHAR_PROPERTY_NAME
}
/// Reports whether `code` lies inside one of the inclusive `(low, high)` pairs
/// stored back to back in `ranges`.
///
/// The pairs must be sorted in ascending order and must not overlap. A trailing
/// unpaired element is ignored.
fn code_ranges_contain(ranges: &[OnigCodePoint], code: OnigCodePoint) -> bool {
    let pairs = ranges.len() / 2;
    let mut low = 0usize;
    let mut high = pairs;
    // Find the first pair whose upper bound is not below `code`.
    while low < high {
        let mid = low + (high - low) / 2;
        if code > ranges[mid * 2 + 1] {
            low = mid + 1;
        } else {
            high = mid;
        }
    }
    low < pairs && code >= ranges[low * 2]
}

/// Reports whether `code` belongs to the character type / property `ctype`.
///
/// * Standard ctypes (`ctype <= ONIGENC_MAX_STD_CTYPE`) for `code < 256` are
///   answered from `ENC_UNICODE_ISO_8859_1_CTYPE_TABLE`.
/// * `ctype >= CODE_RANGES_NUM` addresses a user-defined property; an unknown
///   index or a poisoned registry lock yields `false`.
/// * Everything else is answered from the built-in `CODE_RANGES` table.
pub fn onigenc_unicode_is_code_ctype(code: OnigCodePoint, ctype: u32) -> bool {
    if ctype <= ONIGENC_MAX_STD_CTYPE && code < 256 {
        let bit = ctype_to_bit(ctype) as u16;
        return (ENC_UNICODE_ISO_8859_1_CTYPE_TABLE[code as usize] & bit) != 0;
    }

    let index = ctype as usize;
    if index >= CODE_RANGES_NUM {
        let user_idx = index - CODE_RANGES_NUM;
        return match USER_DEFINED_PROPERTIES.lock() {
            Ok(props) => props
                .get(user_idx)
                .map_or(false, |prop| code_ranges_contain(&prop.ranges, code)),
            Err(_) => false,
        };
    }

    code_ranges_contain(CODE_RANGES[index], code)
}
/// Returns the built-in code range table for `ctype`, or `None` when `ctype` is
/// not a built-in index (`ctype >= CODE_RANGES_NUM`, which includes every
/// user-defined property).
pub fn onigenc_unicode_ctype_code_range(ctype: u32) -> Option<&'static [OnigCodePoint]> {
    let index = ctype as usize;
    (index < CODE_RANGES_NUM).then(|| CODE_RANGES[index])
}
/// Outcome of comparing the two code points around a candidate grapheme
/// cluster boundary (see `unicode_egcb_is_break_2code`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EgcbBreakType {
    /// The two code points belong to the same cluster.
    NotBreak,
    /// There is a boundary between the two code points.
    Break,
    /// ZWJ followed by an Extended_Pictographic code point: the caller must scan
    /// backwards to decide (see `onigenc_egcb_is_break_position`).
    BreakUndefGB11,
    /// Two Regional_Indicator code points: the caller must count the preceding
    /// regional indicators to decide.
    BreakUndefRiRi,
}
/// Returns the grapheme-cluster-break class of `code`.
///
/// `EGCB_RANGES` is searched as a sorted list of inclusive `start..=end`
/// ranges; code points not covered by any range are `EgcbType::Other`.
fn egcb_get_type(code: u32) -> EgcbType {
    // First range whose end is not below `code`; it contains `code` only if it
    // also starts at or before it.
    let candidate = EGCB_RANGES.partition_point(|range| range.end < code);
    match EGCB_RANGES.get(candidate) {
        Some(range) if code >= range.start => range.prop,
        _ => EgcbType::Other,
    }
}
/// True for the classes CR, LF and Control.
#[inline]
fn is_control_cr_lf(t: EgcbType) -> bool {
    match t {
        EgcbType::CR | EgcbType::LF | EgcbType::Control => true,
        _ => false,
    }
}
/// True for the Hangul syllable classes L, LV, LVT, T and V.
#[inline]
fn is_hangul(t: EgcbType) -> bool {
    match t {
        EgcbType::L | EgcbType::LV | EgcbType::LVT | EgcbType::T | EgcbType::V => true,
        _ => false,
    }
}
/// Ctype passed to `onigenc_unicode_is_code_ctype` to test for the
/// Extended_Pictographic property.
// NOTE(review): presumably this must equal the index of Extended_Pictographic in
// `CODE_RANGES` - verify against `property_data` whenever the tables are regenerated.
const PROP_INDEX_EXTENDEDPICTOGRAPHIC: u32 = 81;
fn unicode_egcb_is_break_2code(from_code: u32, to_code: u32) -> EgcbBreakType {
let from = egcb_get_type(from_code);
let to = egcb_get_type(to_code);
if from == EgcbType::Other && to == EgcbType::Other {
return EgcbBreakType::Break; }
if from == EgcbType::CR && to == EgcbType::LF {
return EgcbBreakType::NotBreak;
}
if is_control_cr_lf(from) {
return EgcbBreakType::Break;
}
if is_control_cr_lf(to) {
return EgcbBreakType::Break;
}
if is_hangul(from) && is_hangul(to) {
if from == EgcbType::L && to != EgcbType::T {
return EgcbBreakType::NotBreak;
}
if (from == EgcbType::LV || from == EgcbType::V) && (to == EgcbType::V || to == EgcbType::T)
{
return EgcbBreakType::NotBreak;
}
if to == EgcbType::T && (from == EgcbType::LVT || from == EgcbType::T) {
return EgcbBreakType::NotBreak;
}
return EgcbBreakType::Break; }
if to == EgcbType::Extend || to == EgcbType::ZWJ {
return EgcbBreakType::NotBreak;
}
if to == EgcbType::SpacingMark {
return EgcbBreakType::NotBreak;
}
if from == EgcbType::Prepend {
return EgcbBreakType::NotBreak;
}
if from == EgcbType::ZWJ {
if onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC) {
return EgcbBreakType::BreakUndefGB11;
}
return EgcbBreakType::Break; }
if from == EgcbType::RegionalIndicator && to == EgcbType::RegionalIndicator {
return EgcbBreakType::BreakUndefRiRi;
}
EgcbBreakType::Break
}
/// Reports whether byte offset `s` of `str_data` is an extended grapheme
/// cluster boundary.
///
/// * `enc` - encoding used to step backwards and decode characters.
/// * `s` - candidate position.
/// * `start`, `end` - bounds of the text; positions at or outside them are
///   always boundaries.
///
/// Rule labels refer to UAX #29.
// NOTE(review): `left_adjust_char_head(start, pos, data)` is assumed to return
// the offset of the first byte of the character that contains `pos`.
pub fn onigenc_egcb_is_break_position(
    enc: OnigEncoding,
    str_data: &[u8],
    s: usize,
    start: usize,
    end: usize,
) -> bool {
    // GB1, GB2
    if s <= start || s >= end {
        return true;
    }

    let mut prev = enc.left_adjust_char_head(start, s - 1, str_data);
    if prev < start {
        return true;
    }

    let from = enc.mbc_to_code(&str_data[prev..], end);
    let to = enc.mbc_to_code(&str_data[s..], end);

    match unicode_egcb_is_break_2code(from, to) {
        EgcbBreakType::NotBreak => false,
        EgcbBreakType::Break => true,
        EgcbBreakType::BreakUndefGB11 => {
            // GB11: no break if, before the ZWJ, only Extend characters separate
            // it from an Extended_Pictographic character.
            while prev > start {
                prev = enc.left_adjust_char_head(start, prev - 1, str_data);
                if prev < start {
                    break;
                }
                let code = enc.mbc_to_code(&str_data[prev..], end);
                if onigenc_unicode_is_code_ctype(code, PROP_INDEX_EXTENDEDPICTOGRAPHIC) {
                    return false;
                }
                if egcb_get_type(code) != EgcbType::Extend {
                    break;
                }
            }
            true
        }
        EgcbBreakType::BreakUndefRiRi => {
            // GB12, GB13: regional indicators pair up from the left, so count how
            // many precede the one just before `s`.
            let mut preceding = 0usize;
            while prev > start {
                prev = enc.left_adjust_char_head(start, prev - 1, str_data);
                if prev < start {
                    break;
                }
                let code = enc.mbc_to_code(&str_data[prev..], end);
                if egcb_get_type(code) != EgcbType::RegionalIndicator {
                    break;
                }
                preceding += 1;
            }
            preceding % 2 != 0
        }
    }
}
/// Returns the word-break class of `code`.
///
/// `WB_RANGES` is searched as a sorted list of inclusive `start..=end` ranges;
/// code points not covered by any range are `WbType::Any`.
fn wb_get_type(code: u32) -> WbType {
    // First range whose end is not below `code`; it contains `code` only if it
    // also starts at or before it.
    let candidate = WB_RANGES.partition_point(|range| range.end < code);
    match WB_RANGES.get(candidate) {
        Some(range) if code >= range.start => range.prop,
        _ => WbType::Any,
    }
}
/// True for the classes that the word-break rules skip over when they follow
/// another character: Extend, Format and ZWJ.
#[inline]
fn is_wb_ignore_tail(t: WbType) -> bool {
    match t {
        WbType::Extend | WbType::Format | WbType::ZWJ => true,
        _ => false,
    }
}
/// True for ALetter and HebrewLetter.
#[inline]
fn is_wb_ahletter(t: WbType) -> bool {
    match t {
        WbType::ALetter | WbType::HebrewLetter => true,
        _ => false,
    }
}
/// True for MidNumLet and SingleQuote.
#[inline]
fn is_wb_midnumletq(t: WbType) -> bool {
    match t {
        WbType::MidNumLet | WbType::SingleQuote => true,
        _ => false,
    }
}
/// Finds the next character after the one at `pos` that is not Extend, Format
/// or ZWJ.
///
/// Returns its code point and word-break class, or `None` if `end` is reached
/// first.
fn wb_get_next_main_code(
    enc: OnigEncoding,
    str_data: &[u8],
    mut pos: usize,
    end: usize,
) -> Option<(u32, WbType)> {
    loop {
        // Step over the current character.
        pos += enc.mbc_enc_len(&str_data[pos..]);
        if pos >= end {
            return None;
        }
        let code = enc.mbc_to_code(&str_data[pos..], end);
        match wb_get_type(code) {
            skipped if is_wb_ignore_tail(skipped) => continue,
            main => return Some((code, main)),
        }
    }
}
pub fn onigenc_wb_is_break_position(
enc: OnigEncoding,
str_data: &[u8],
s: usize,
start: usize,
end: usize,
) -> bool {
if s <= start {
return true;
}
if s >= end {
return true;
}
let mut prev = enc.left_adjust_char_head(start, s - 1, str_data);
if prev < start {
return true;
}
let cfrom = enc.mbc_to_code(&str_data[prev..], end);
let cto = enc.mbc_to_code(&str_data[s..], end);
let mut from = wb_get_type(cfrom);
let to = wb_get_type(cto);
if from == WbType::Any && to == WbType::Any {
return true; }
if from == WbType::CR && to == WbType::LF {
return false;
}
if matches!(from, WbType::Newline | WbType::CR | WbType::LF) {
return true;
}
if matches!(to, WbType::Newline | WbType::CR | WbType::LF) {
return true;
}
if from == WbType::ZWJ {
if onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC) {
return false;
}
}
if from == WbType::WSegSpace && to == WbType::WSegSpace {
return false;
}
if is_wb_ignore_tail(to) {
return false;
}
if is_wb_ignore_tail(from) {
loop {
if prev <= start {
break;
}
let pp = enc.left_adjust_char_head(start, prev - 1, str_data);
if pp < start {
break;
}
prev = pp;
let cf = enc.mbc_to_code(&str_data[prev..], end);
from = wb_get_type(cf);
if !is_wb_ignore_tail(from) {
break;
}
}
}
if is_wb_ahletter(from) {
if is_wb_ahletter(to) {
return false;
}
if to == WbType::MidLetter || is_wb_midnumletq(to) {
if let Some((_cto2, to2)) = wb_get_next_main_code(enc, str_data, s, end) {
if is_wb_ahletter(to2) {
return false;
}
}
}
}
if from == WbType::MidLetter || is_wb_midnumletq(from) {
if is_wb_ahletter(to) {
let mut from2 = WbType::Any;
let mut pp = prev;
loop {
if pp <= start {
break;
}
pp = enc.left_adjust_char_head(start, pp - 1, str_data);
if pp < start {
break;
}
let cf2 = enc.mbc_to_code(&str_data[pp..], end);
from2 = wb_get_type(cf2);
if !is_wb_ignore_tail(from2) {
break;
}
}
if is_wb_ahletter(from2) {
return false;
}
}
}
if from == WbType::HebrewLetter {
if to == WbType::SingleQuote {
return false;
}
if to == WbType::DoubleQuote {
if let Some((_cto2, to2)) = wb_get_next_main_code(enc, str_data, s, end) {
if to2 == WbType::HebrewLetter {
return false;
}
}
}
}
if from == WbType::DoubleQuote {
if to == WbType::HebrewLetter {
let mut from2 = WbType::Any;
let mut pp = prev;
loop {
if pp <= start {
break;
}
pp = enc.left_adjust_char_head(start, pp - 1, str_data);
if pp < start {
break;
}
let cf2 = enc.mbc_to_code(&str_data[pp..], end);
from2 = wb_get_type(cf2);
if !is_wb_ignore_tail(from2) {
break;
}
}
if from2 == WbType::HebrewLetter {
return false;
}
}
}
if to == WbType::Numeric {
if from == WbType::Numeric {
return false;
}
if is_wb_ahletter(from) {
return false;
}
if from == WbType::MidNum || is_wb_midnumletq(from) {
let mut from2 = WbType::Any;
let mut pp = prev;
loop {
if pp <= start {
break;
}
pp = enc.left_adjust_char_head(start, pp - 1, str_data);
if pp < start {
break;
}
let cf2 = enc.mbc_to_code(&str_data[pp..], end);
from2 = wb_get_type(cf2);
if !is_wb_ignore_tail(from2) {
break;
}
}
if from2 == WbType::Numeric {
return false;
}
}
}
if from == WbType::Numeric {
if is_wb_ahletter(to) {
return false;
}
if to == WbType::MidNum || is_wb_midnumletq(to) {
if let Some((_cto2, to2)) = wb_get_next_main_code(enc, str_data, s, end) {
if to2 == WbType::Numeric {
return false;
}
}
}
}
if from == WbType::Katakana && to == WbType::Katakana {
return false;
}
if to == WbType::ExtendNumLet {
if is_wb_ahletter(from)
|| from == WbType::Numeric
|| from == WbType::Katakana
|| from == WbType::ExtendNumLet
{
return false;
}
}
if from == WbType::ExtendNumLet {
if is_wb_ahletter(to) || to == WbType::Numeric || to == WbType::Katakana {
return false;
}
}
if from == WbType::RegionalIndicator && to == WbType::RegionalIndicator {
let mut n: usize = 0;
let mut pp = prev;
loop {
if pp <= start {
break;
}
pp = enc.left_adjust_char_head(start, pp - 1, str_data);
if pp < start {
break;
}
let cf2 = enc.mbc_to_code(&str_data[pp..], end);
let from2 = wb_get_type(cf2);
if from2 != WbType::RegionalIndicator {
break;
}
n += 1;
}
if (n % 2) == 0 {
return false;
}
}
true
}