use crate::codepointset::{CodePointSet, Interval};
use crate::unicodetables::{
FOLDS, TO_UPPERCASE, binary_property_ranges, general_category_property_value_ranges,
script_extensions_value_ranges, script_value_ranges, string_property_sets,
unicode_property_binary_from_str, unicode_property_value_general_category_from_str,
unicode_property_value_script_from_str, unicode_string_property_from_str,
};
use crate::util::SliceHelp;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
use core::cmp::Ordering;
const CODE_POINT_BITS: u32 = 20;
const LENGTH_BITS: u32 = 32 - CODE_POINT_BITS;
#[derive(Copy, Clone, Debug)]
pub struct CodePointRange(u32);
macro_rules! const_assert_true {
($x:expr $(,)*) => {
[()][!$x as usize];
};
}
impl CodePointRange {
#[inline(always)]
pub const fn from(start: u32, len: u32) -> Self {
const_assert_true!(start < (1 << CODE_POINT_BITS));
const_assert_true!(len > 0 && len <= (1 << LENGTH_BITS));
const_assert_true!((start + len - 1) < ((1 << CODE_POINT_BITS) - 1));
CodePointRange((start << LENGTH_BITS) | (len - 1))
}
#[inline(always)]
const fn len_minus_1(self) -> u32 {
self.0 & ((1 << LENGTH_BITS) - 1)
}
#[inline(always)]
pub const fn first(self) -> u32 {
self.0 >> LENGTH_BITS
}
#[inline(always)]
pub const fn last(self) -> u32 {
self.first() + self.len_minus_1()
}
}
const PREDICATE_MASK_BITS: u32 = 4;
pub(crate) struct FoldRange {
pub(crate) range: CodePointRange,
pub(crate) extra: i32,
}
impl FoldRange {
#[inline(always)]
pub const fn from(start: u32, length: u32, delta: i32, modulo: u8) -> Self {
const_assert_true!(modulo.is_power_of_two());
let mask = (modulo - 1) as i32;
const_assert_true!(mask < (1 << PREDICATE_MASK_BITS));
const_assert_true!(((delta << PREDICATE_MASK_BITS) >> PREDICATE_MASK_BITS) == delta);
let extra = mask | (delta << PREDICATE_MASK_BITS);
FoldRange {
range: CodePointRange::from(start, length),
extra,
}
}
#[inline(always)]
fn first(&self) -> u32 {
self.range.first()
}
#[inline(always)]
fn last(&self) -> u32 {
self.range.last()
}
#[inline(always)]
fn delta(&self) -> i32 {
self.extra >> PREDICATE_MASK_BITS
}
#[inline(always)]
fn predicate_mask(&self) -> u32 {
(self.extra as u32) & ((1 << PREDICATE_MASK_BITS) - 1)
}
fn add_delta(&self, cu: u32) -> u32 {
let cs = (cu as i32) + self.delta();
core::debug_assert!(0 <= cs && cs <= 0x10FFFF);
cs as u32
}
fn transformed_to(&self) -> Interval {
Interval {
first: self.add_delta(self.first()),
last: self.add_delta(self.last()),
}
}
fn transformed_from(&self) -> Interval {
Interval {
first: self.first(),
last: self.last(),
}
}
fn can_apply(&self, cu: u32) -> bool {
self.transformed_from().contains(cu)
}
fn apply(&self, cu: u32) -> u32 {
debug_assert!(self.can_apply(cu), "Cannot apply to this code point");
let offset = cu - self.first();
if (offset & self.predicate_mask()) == 0 {
self.add_delta(cu)
} else {
cu
}
}
}
pub(crate) fn fold_code_point(cu: u32, unicode: bool) -> u32 {
if unicode {
return fold(cu);
}
uppercase(cu)
}
pub fn fold(cu: u32) -> u32 {
let searched = FOLDS.binary_search_by(|fr| {
if fr.first() > cu {
Ordering::Greater
} else if fr.last() < cu {
Ordering::Less
} else {
Ordering::Equal
}
});
if let Ok(index) = searched {
let fr: &FoldRange = if cfg!(feature = "prohibit-unsafe") {
unsafe { FOLDS.get_unchecked(index) }
} else {
FOLDS.get(index).expect("Invalid index")
};
fr.apply(cu)
} else {
cu
}
}
fn uppercase(cu: u32) -> u32 {
let searched = TO_UPPERCASE.binary_search_by(|fr| {
if fr.first() > cu {
Ordering::Greater
} else if fr.last() < cu {
Ordering::Less
} else {
Ordering::Equal
}
});
if let Ok(index) = searched {
let fr: &FoldRange = if cfg!(feature = "prohibit-unsafe") {
unsafe { TO_UPPERCASE.get_unchecked(index) }
} else {
TO_UPPERCASE.get(index).expect("Invalid index")
};
fr.apply(cu)
} else {
cu
}
}
fn fold_interval(iv: Interval, recv: &mut CodePointSet) {
let overlaps = FOLDS.equal_range_by(|tr| {
if tr.first() > iv.last {
Ordering::Greater
} else if tr.last() < iv.first {
Ordering::Less
} else {
Ordering::Equal
}
});
for fr in &FOLDS[overlaps] {
debug_assert!(
fr.transformed_from().overlaps(iv),
"Interval does not overlap transform"
);
let first_trans = core::cmp::max(fr.first(), iv.first);
let last_trans = core::cmp::min(fr.last(), iv.last);
let modulo = fr.predicate_mask() + 1;
if modulo == 1 {
for cu in first_trans..(last_trans + 1) {
let cs = fr.add_delta(cu);
if cs != cu {
recv.add_one(cs);
}
}
} else {
let offset_start = first_trans - fr.first();
let start_aligned = first_trans + ((modulo - (offset_start % modulo)) % modulo);
let mut cu = start_aligned;
while cu <= last_trans {
let cs = fr.add_delta(cu);
recv.add_one(cs);
cu += modulo;
}
}
}
}
fn unfold_interval(iv: Interval, recv: &mut CodePointSet) {
for tr in FOLDS.iter() {
if !iv.overlaps(tr.transformed_to()) {
continue;
}
let modulo = tr.predicate_mask() + 1;
let first_source = tr.first();
let last_source = tr.last();
let mut process_cp = |cp| {
let tcp = tr.apply(cp);
if tcp != cp && iv.contains(tcp) {
recv.add_one(cp);
}
};
if modulo == 1 {
for cp in first_source..(last_source + 1) {
process_cp(cp);
}
} else {
let mut cp = first_source;
while cp <= last_source {
process_cp(cp);
cp += modulo;
}
}
}
}
pub fn unfold_char(c: u32) -> Vec<u32> {
let mut res = vec![c];
let fcp = fold(c);
if fcp != c {
res.push(fcp);
}
for tr in FOLDS.iter() {
if !tr.transformed_to().contains(fcp) {
continue;
}
for cp in tr.transformed_from().codepoints() {
let tcp = tr.apply(cp);
if tcp == fcp {
res.push(cp);
}
}
}
res.sort_unstable();
res.dedup();
res
}
pub(crate) fn unfold_uppercase_char(c: u32) -> Vec<u32> {
let mut res = vec![c];
let fcp = uppercase(c);
if fcp != c {
res.push(fcp);
}
for tr in TO_UPPERCASE.iter() {
if !tr.transformed_to().contains(fcp) {
continue;
}
for cp in tr.transformed_from().codepoints() {
let tcp = tr.apply(cp);
if tcp == fcp {
res.push(cp);
}
}
}
res.sort_unstable();
res.dedup();
res
}
pub fn add_icase_code_points(mut input: CodePointSet) -> CodePointSet {
let mut folded = input.clone();
for iv in input.intervals() {
fold_interval(*iv, &mut folded)
}
input.clone_from(&folded);
for iv in folded.intervals() {
unfold_interval(*iv, &mut input);
}
input
}
pub(crate) enum PropertyEscapeKind {
CharacterClass(&'static [Interval]),
StringSet(&'static [&'static [u32]]),
}
#[derive(Debug, Copy, Clone)]
pub(crate) enum UnicodePropertyName {
GeneralCategory,
Script,
ScriptExtensions,
}
pub(crate) fn unicode_property_name_from_str(s: &str) -> Option<UnicodePropertyName> {
use UnicodePropertyName::*;
match s {
"General_Category" | "gc" => Some(GeneralCategory),
"Script" | "sc" => Some(Script),
"Script_Extensions" | "scx" => Some(ScriptExtensions),
_ => None,
}
}
pub(crate) fn unicode_property_from_str(
s: &str,
name: Option<UnicodePropertyName>,
unicode_sets: bool,
) -> Option<PropertyEscapeKind> {
match name {
Some(UnicodePropertyName::GeneralCategory) => Some(PropertyEscapeKind::CharacterClass(
general_category_property_value_ranges(
&unicode_property_value_general_category_from_str(s)?,
),
)),
Some(UnicodePropertyName::Script) => Some(PropertyEscapeKind::CharacterClass(
script_value_ranges(&unicode_property_value_script_from_str(s)?),
)),
Some(UnicodePropertyName::ScriptExtensions) => Some(PropertyEscapeKind::CharacterClass(
script_extensions_value_ranges(&unicode_property_value_script_from_str(s)?),
)),
None => {
if let Some(value) = unicode_property_binary_from_str(s) {
return Some(PropertyEscapeKind::CharacterClass(binary_property_ranges(
&value,
)));
}
if unicode_sets && let Some(value) = unicode_string_property_from_str(s) {
return Some(PropertyEscapeKind::StringSet(string_property_sets(&value)));
}
Some(PropertyEscapeKind::CharacterClass(
general_category_property_value_ranges(
&unicode_property_value_general_category_from_str(s)?,
),
))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
fn get_unfold_map() -> HashMap<u32, Vec<u32>> {
let mut unfold_map: HashMap<u32, Vec<u32>> = HashMap::new();
for c in 0..=0x10FFFF {
let fc = fold(c);
if fc != c {
unfold_map.entry(fc).or_default().push(c);
}
}
for (&k, v) in unfold_map.iter_mut() {
assert_eq!(k, fold(k), "folds should be idempotent");
v.push(k);
v.sort_unstable();
}
unfold_map
}
#[test]
fn test_folds() {
for c in 0..0x41 {
assert_eq!(fold(c), c);
}
for c in 0x41..=0x5A {
assert_eq!(fold(c), c + 0x20);
}
assert_eq!(fold(0xB5), 0x3BC);
assert_eq!(fold(0xC0), 0xE0);
assert_eq!(fold(0x1B8), 0x1B9);
assert_eq!(fold(0x1B9), 0x1B9);
assert_eq!(fold(0x1BA), 0x1BA);
assert_eq!(fold(0x1BB), 0x1BB);
assert_eq!(fold(0x1BC), 0x1BD);
assert_eq!(fold(0x1BD), 0x1BD);
for c in 0x1F8..0x21F {
if c % 2 == 0 {
assert_eq!(fold(c), c + 1);
} else {
assert_eq!(fold(c), c);
}
}
assert_eq!(fold(0x37F), 0x3F3);
assert_eq!(fold(0x380), 0x380);
assert_eq!(fold(0x16E40), 0x16E60);
assert_eq!(fold(0x16E41), 0x16E61);
assert_eq!(fold(0x16E42), 0x16E62);
assert_eq!(fold(0x1E900), 0x1E922);
assert_eq!(fold(0x1E901), 0x1E923);
for c in 0xF0000..=0x10FFFF {
assert_eq!(fold(c), c);
}
}
#[test]
fn test_fold_idempotent() {
for c in 0..=0x10FFFF {
let fc = fold(c);
let ffc = fold(fc);
assert_eq!(ffc, fc);
}
}
#[test]
fn test_unfolds_refold() {
for c in 0..=0x10FFFF {
let fc = fold(c);
let unfolds = unfold_char(c);
for uc in unfolds {
assert_eq!(fold(uc), fc);
}
}
}
#[test]
fn test_unfold_chars() {
let unfold_map = get_unfold_map();
for c in 0..=0x10FFFF {
let mut unfolded = unfold_char(c);
unfolded.sort_unstable();
let fc = fold(c);
if let Some(expected) = unfold_map.get(&fc) {
assert_eq!(&unfolded, expected);
} else {
assert_eq!(&unfolded, &[c]);
}
}
}
#[test]
fn test_add_icase_code_points() {
let unfold_map = get_unfold_map();
let locs = [
0x0, 0x42, 0x100, 0xdeba, 0x11419, 0x278f8, 0x2e000, 0x35df7, 0x462d6, 0x4bc29,
0x4f4c0, 0x58a9b, 0x5bafc, 0x62383, 0x66d60, 0x6974a, 0x77628, 0x87804, 0x9262b,
0x931e4, 0xaa08c, 0xad7a8, 0xca6b0, 0xcce27, 0xcd897, 0xcf5e7, 0xe2802, 0xe561b,
0xe5f43, 0xf4339, 0xfb78c, 0xfc5ee, 0x104fa9, 0x10e402, 0x10e6cf, 0x10FFFF,
];
for (idx, &first) in locs.iter().enumerate() {
let mut expected = CodePointSet::default();
let mut from = first;
for &last in &locs[idx..] {
for c in from..=last {
let fc = fold(c);
if let Some(unfolded) = unfold_map.get(&fc) {
for &ufc in unfolded {
expected.add_one(ufc);
}
} else {
expected.add_one(fc);
}
}
let mut input = CodePointSet::new();
input.add(Interval { first, last });
let folded = add_icase_code_points(input);
assert_eq!(folded, expected);
from = last;
}
}
}
#[test]
fn test_fold_interval() {
let locs = [
0, 0x894, 0x59ac, 0xfa64, 0x10980, 0x12159, 0x16b8d, 0x1aaa2, 0x1f973, 0x1fcd4,
0x20c35, 0x23d8a, 0x276af, 0x2c6b8, 0x2fb25, 0x30b9b, 0x338ad, 0x35ab3, 0x38d37,
0x3bfa7, 0x3fba6, 0x404c9, 0x44572, 0x480c9, 0x4b5c4, 0x4f371, 0x5a9fa, 0x5ad6c,
0x5e395, 0x5f103, 0x5fa98, 0x617fa, 0x6500e, 0x68890, 0x6a3fc, 0x6eab3, 0x704a6,
0x70c22, 0x72efb, 0x737cc, 0x76796, 0x79da8, 0x7a450, 0x7b023, 0x7cc5c, 0x82027,
0x84ef4, 0x8ac66, 0x8b898, 0x8bd1a, 0x95841, 0x98a48, 0x9e6cd, 0xa035a, 0xa41fb,
0xa50e3, 0xa6387, 0xa7ba1, 0xaad9a, 0xabed8, 0xacc88, 0xb2737, 0xb31b1, 0xb6daf,
0xb7ff4, 0xba2b4, 0xbde4f, 0xbe38b, 0xbe7a5, 0xc4eb2, 0xc5670, 0xc7703, 0xc995d,
0xccb72, 0xcdfe3, 0xcfc99, 0xd09eb, 0xd2773, 0xd357d, 0xd6696, 0xd9aec, 0xdc3fa,
0xdc8ae, 0xdc9d5, 0xde31d, 0xe2edb, 0xe652b, 0xe92d5, 0xebf2d, 0xee335, 0xef45f,
0xf4280, 0xf74b1, 0xf9ac4, 0xfafca, 0x10208d, 0x107d63, 0x10821e, 0x108818, 0x10911f,
0x10b6fd, 0x10FFFF,
];
for (idx, &first) in locs.iter().enumerate() {
let mut expected = CodePointSet::default();
let mut from = first;
for &last in &locs[idx..] {
for c in from..=last {
let fc = fold(c);
if fc != c {
expected.add_one(fc);
}
}
let mut cps = CodePointSet::default();
fold_interval(Interval { first, last }, &mut cps);
assert_eq!(cps.intervals(), expected.intervals());
from = last;
}
}
}
}