pub(crate) mod casefold;
pub(crate) mod ccc;
pub(crate) mod ccc_qc;
pub(crate) mod composition;
pub(crate) mod confusable;
pub(crate) mod decomposition;
pub(crate) mod qc;
pub(crate) mod trie;
use trie::CodePointTrie;
#[allow(dead_code)]
const BACKWARD_COMBINING: u32 = 1 << 31;
#[allow(dead_code)]
const NON_ROUND_TRIP: u32 = 1 << 30;
const HAS_DECOMPOSITION: u32 = 1 << 29;
const IS_EXPANSION: u32 = 1 << 24;
const CCC_SHIFT: u32 = 16;
const CCC_MASK: u32 = 0xFF << CCC_SHIFT;
const DECOMP_INFO_MASK: u32 = 0xFFFF;
pub(crate) const NEEDS_STARTER_SHADOW: u32 = 1 << 28;
pub(crate) const EXPANSION_CCC_SHIFT: u32 = 21;
pub(crate) const EXPANSION_CP_MASK: u32 = 0x1FFFFF;
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum DecompResult {
None,
Singleton(char),
Expansion { offset: usize, length: usize },
}
#[inline]
pub(crate) fn canonical_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: decomposition::CANONICAL_BMP_INDEX,
data: decomposition::CANONICAL_TRIE_DATA,
supp_index1: decomposition::CANONICAL_SUPP_INDEX1,
supp_index2: decomposition::CANONICAL_SUPP_INDEX2,
default_value: 0,
}
}
#[inline]
pub(crate) fn compat_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: decomposition::COMPAT_BMP_INDEX,
data: decomposition::COMPAT_TRIE_DATA,
supp_index1: decomposition::COMPAT_SUPP_INDEX1,
supp_index2: decomposition::COMPAT_SUPP_INDEX2,
default_value: 0,
}
}
#[inline]
pub(crate) fn ccc_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: ccc::CCC_BMP_INDEX,
data: ccc::CCC_TRIE_DATA,
supp_index1: ccc::CCC_SUPP_INDEX1,
supp_index2: ccc::CCC_SUPP_INDEX2,
default_value: 0,
}
}
#[allow(dead_code)]
#[inline]
pub(crate) fn nfc_qc_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: qc::NFC_QC_BMP_INDEX,
data: qc::NFC_QC_TRIE_DATA,
supp_index1: qc::NFC_QC_SUPP_INDEX1,
supp_index2: qc::NFC_QC_SUPP_INDEX2,
default_value: 0,
}
}
#[allow(dead_code)]
#[inline]
pub(crate) fn nfd_qc_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: qc::NFD_QC_BMP_INDEX,
data: qc::NFD_QC_TRIE_DATA,
supp_index1: qc::NFD_QC_SUPP_INDEX1,
supp_index2: qc::NFD_QC_SUPP_INDEX2,
default_value: 0,
}
}
#[allow(dead_code)]
#[inline]
pub(crate) fn nfkc_qc_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: qc::NFKC_QC_BMP_INDEX,
data: qc::NFKC_QC_TRIE_DATA,
supp_index1: qc::NFKC_QC_SUPP_INDEX1,
supp_index2: qc::NFKC_QC_SUPP_INDEX2,
default_value: 0,
}
}
#[allow(dead_code)]
#[inline]
pub(crate) fn nfkd_qc_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: qc::NFKD_QC_BMP_INDEX,
data: qc::NFKD_QC_TRIE_DATA,
supp_index1: qc::NFKD_QC_SUPP_INDEX1,
supp_index2: qc::NFKD_QC_SUPP_INDEX2,
default_value: 0,
}
}
#[inline]
pub(crate) fn ccc_qc_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: ccc_qc::CCC_QC_BMP_INDEX,
data: ccc_qc::CCC_QC_TRIE_DATA,
supp_index1: ccc_qc::CCC_QC_SUPP_INDEX1,
supp_index2: ccc_qc::CCC_QC_SUPP_INDEX2,
default_value: 0,
}
}
#[inline]
pub(crate) fn ccc_from_trie_value(v: u32) -> u8 {
((v & CCC_MASK) >> CCC_SHIFT) as u8
}
#[inline(always)]
pub(crate) fn has_decomposition(trie_value: u32) -> bool {
trie_value & HAS_DECOMPOSITION != 0
}
#[inline(always)]
pub(crate) fn needs_starter_shadow(trie_value: u32) -> bool {
trie_value & NEEDS_STARTER_SHADOW != 0
}
#[inline]
pub(crate) fn raw_decomp_trie_value(c: char, form: crate::decompose::DecompForm) -> u32 {
match form {
crate::decompose::DecompForm::Canonical => canonical_trie().get(c as u32),
crate::decompose::DecompForm::Compatible => compat_trie().get(c as u32),
}
}
#[inline(always)]
pub(crate) unsafe fn raw_decomp_trie_value_supplementary(
cp: u32,
form: crate::decompose::DecompForm,
) -> u32 {
match form {
crate::decompose::DecompForm::Canonical => unsafe {
canonical_trie().get_supplementary_unchecked(cp)
},
crate::decompose::DecompForm::Compatible => unsafe {
compat_trie().get_supplementary_unchecked(cp)
},
}
}
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn raw_decomp_trie_values_pipelined<const N: usize>(
cps: &[u32; N],
form: crate::decompose::DecompForm,
) -> [u32; N] {
debug_assert!(N == 2, "only 2-way pipelining is implemented");
debug_assert!(cps[0] < 0x10000 && cps[1] < 0x10000);
let trie = match form {
crate::decompose::DecompForm::Canonical => canonical_trie(),
crate::decompose::DecompForm::Compatible => compat_trie(),
};
let pair = unsafe { trie.get_two_bmp_pipelined_unchecked(cps[0], cps[1]) };
let mut out = [0u32; N];
out[0] = pair[0];
out[1] = pair[1];
out
}
#[inline]
pub(crate) fn decode_trie_value(
trie_value: u32,
form: crate::decompose::DecompForm,
) -> (DecompResult, u8) {
let ccc = ccc_from_trie_value(trie_value);
let expansion_table = match form {
crate::decompose::DecompForm::Canonical => decomposition::CANONICAL_EXPANSIONS,
crate::decompose::DecompForm::Compatible => decomposition::COMPAT_EXPANSIONS,
};
let decomp = decode_decomp(trie_value, expansion_table);
(decomp, ccc)
}
#[inline]
pub(crate) fn decode_decomp(trie_value: u32, expansion_table: &[u32]) -> DecompResult {
if trie_value & HAS_DECOMPOSITION == 0 {
return DecompResult::None;
}
let info = trie_value & DECOMP_INFO_MASK;
if trie_value & IS_EXPANSION != 0 {
let offset = info as usize;
let length = expansion_table[offset] as usize;
DecompResult::Expansion {
offset: offset + 1,
length,
}
} else {
debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
let ch = unsafe { char::from_u32_unchecked(info) };
DecompResult::Singleton(ch)
}
}
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_canonical_decomp(c: char) -> (DecompResult, u8) {
let trie = canonical_trie();
let v = trie.get(c as u32);
let ccc = ccc_from_trie_value(v);
let decomp = decode_decomp(v, decomposition::CANONICAL_EXPANSIONS);
(decomp, ccc)
}
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_compat_decomp(c: char) -> (DecompResult, u8) {
let trie = compat_trie();
let v = trie.get(c as u32);
let ccc = ccc_from_trie_value(v);
let decomp = decode_decomp(v, decomposition::COMPAT_EXPANSIONS);
(decomp, ccc)
}
#[inline]
pub(crate) fn lookup_ccc(c: char) -> u8 {
let trie = ccc_trie();
trie.get(c as u32) as u8
}
#[inline]
pub(crate) fn compose_pair(a: char, b: char) -> Option<char> {
let key = ((a as u64) << 21) | (b as u64);
let pairs = composition::COMPOSITION_PAIRS;
let mut len = pairs.len();
let mut base = 0usize;
while len > 1 {
let half = len / 2;
base += (pairs[base + half].0 <= key) as usize * half;
len -= half;
}
if base < pairs.len() && pairs[base].0 == key {
debug_assert!(pairs[base].1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&(pairs[base].1)));
Some(unsafe { char::from_u32_unchecked(pairs[base].1) })
} else {
None
}
}
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfc_qc(c: char) -> u8 {
nfc_qc_trie().get(c as u32) as u8
}
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfd_qc(c: char) -> u8 {
nfd_qc_trie().get(c as u32) as u8
}
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfkc_qc(c: char) -> u8 {
nfkc_qc_trie().get(c as u32) as u8
}
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfkd_qc(c: char) -> u8 {
nfkd_qc_trie().get(c as u32) as u8
}
pub(crate) const CCC_QC_NFC_SHIFT: u32 = 0;
pub(crate) const CCC_QC_NFD_SHIFT: u32 = 2;
pub(crate) const CCC_QC_NFKC_SHIFT: u32 = 4;
pub(crate) const CCC_QC_NFKD_SHIFT: u32 = 6;
pub(crate) const CCC_QC_CCC_SHIFT: u32 = 8;
#[inline]
pub(crate) fn lookup_ccc_qc(c: char, qc_shift: u32) -> (u8, u8) {
let packed = ccc_qc_trie().get(c as u32);
let ccc = (packed >> CCC_QC_CCC_SHIFT) as u8;
let qc = ((packed >> qc_shift) & 0x3) as u8;
(ccc, qc)
}
#[inline]
pub(crate) fn canonical_expansion_data(offset: usize, length: usize) -> &'static [u32] {
&decomposition::CANONICAL_EXPANSIONS[offset..offset + length]
}
#[inline]
pub(crate) fn compat_expansion_data(offset: usize, length: usize) -> &'static [u32] {
&decomposition::COMPAT_EXPANSIONS[offset..offset + length]
}
#[inline(always)]
pub(crate) fn expansion_data_from_trie_value(
trie_value: u32,
form: crate::decompose::DecompForm,
) -> Option<&'static [u32]> {
if trie_value & IS_EXPANSION == 0 {
return None;
}
let offset = (trie_value & DECOMP_INFO_MASK) as usize;
let table = match form {
crate::decompose::DecompForm::Canonical => decomposition::CANONICAL_EXPANSIONS,
crate::decompose::DecompForm::Compatible => decomposition::COMPAT_EXPANSIONS,
};
let length = table[offset] as usize;
Some(&table[offset + 1..offset + 1 + length])
}
#[inline]
pub(crate) fn casefold_trie() -> CodePointTrie {
CodePointTrie {
bmp_index: casefold::CASEFOLD_BMP_INDEX,
data: casefold::CASEFOLD_TRIE_DATA,
supp_index1: casefold::CASEFOLD_SUPP_INDEX1,
supp_index2: casefold::CASEFOLD_SUPP_INDEX2,
default_value: 0,
}
}
#[inline]
pub(crate) fn lookup_casefold(c: char) -> Option<char> {
let trie = casefold_trie();
let v = trie.get(c as u32);
if v == 0 {
None
} else {
debug_assert!(v <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&v));
Some(unsafe { char::from_u32_unchecked(v) })
}
}
#[inline]
pub(crate) fn turkish_casefold(c: char) -> Option<char> {
let cp = c as u32;
for &(src, tgt) in casefold::TURKISH_FOLDS {
if src == cp {
return Some(unsafe { char::from_u32_unchecked(tgt) });
}
}
None
}
const CONFUSABLE_EXPANSION_FLAG: u32 = 0x80000000;
pub(crate) enum ConfusableResult {
None,
Single(char),
Expansion { offset: usize, length: usize },
}
#[inline]
pub(crate) fn lookup_confusable(c: char) -> ConfusableResult {
let cp = c as u32;
let pairs = confusable::CONFUSABLE_MAPPINGS;
let mut len = pairs.len();
let mut base = 0usize;
while len > 1 {
let half = len / 2;
base += (pairs[base + half].0 <= cp) as usize * half;
len -= half;
}
if base < pairs.len() && pairs[base].0 == cp {
let value = pairs[base].1;
if value & CONFUSABLE_EXPANSION_FLAG != 0 {
let length = ((value >> 16) & 0xFF) as usize;
let offset = (value & 0xFFFF) as usize;
ConfusableResult::Expansion { offset, length }
} else {
debug_assert!(value <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&value));
ConfusableResult::Single(unsafe { char::from_u32_unchecked(value) })
}
} else {
ConfusableResult::None
}
}
#[inline]
pub(crate) fn confusable_expansion_data(offset: usize, length: usize) -> &'static [u32] {
&confusable::CONFUSABLE_EXPANSIONS[offset..offset + length]
}
#[inline(always)]
pub(crate) const fn confusable_bloom_hash(cp: u32) -> (usize, u8) {
let h = (((cp >> 8) ^ (cp & 0xFF)) & 0xFF) as usize;
let bit = (((cp >> 16) ^ (cp >> 4)) & 0x07) as u8;
(h, bit)
}
const fn build_confusable_bloom() -> [u8; 256] {
let mut bloom = [0u8; 256];
let mappings = confusable::CONFUSABLE_MAPPINGS;
let mut i = 0;
while i < mappings.len() {
let cp = mappings[i].0;
let (idx, bit) = confusable_bloom_hash(cp);
bloom[idx] |= 1u8 << bit;
i += 1;
}
bloom
}
pub(crate) const CONFUSABLE_BLOOM: [u8; 256] = build_confusable_bloom();
#[inline(always)]
pub(crate) fn confusable_bloom_might_contain(cp: u32) -> bool {
let (idx, bit) = confusable_bloom_hash(cp);
(CONFUSABLE_BLOOM[idx] >> bit) & 1 != 0
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ccc_extraction() {
assert_eq!(ccc_from_trie_value(0), 0);
let v = 0xE6 << 16;
assert_eq!(ccc_from_trie_value(v), 230);
let v = 0x01 << 16;
assert_eq!(ccc_from_trie_value(v), 1);
let v = 0xFE << 16;
assert_eq!(ccc_from_trie_value(v), 254);
}
#[test]
fn test_decode_decomp_none() {
let dummy: [u32; 0] = [];
assert_eq!(decode_decomp(0, &dummy), DecompResult::None);
assert_eq!(
decode_decomp(BACKWARD_COMBINING | NON_ROUND_TRIP | 0x00FF_FFFF, &dummy),
DecompResult::None
);
}
#[test]
fn test_decode_decomp_singleton() {
let v = HAS_DECOMPOSITION | 0x0041; let dummy: [u32; 0] = [];
assert_eq!(decode_decomp(v, &dummy), DecompResult::Singleton('A'));
}
#[test]
fn test_decode_decomp_expansion() {
let v = HAS_DECOMPOSITION | IS_EXPANSION | 0x0003;
let expansion_table: [u32; 6] = [0, 0, 0, 2, 0x0000_0041, 0x0000_0042];
assert_eq!(
decode_decomp(v, &expansion_table),
DecompResult::Expansion {
offset: 4,
length: 2
}
);
}
#[test]
fn test_ccc_from_trie_value_with_other_bits() {
let v = HAS_DECOMPOSITION | (202u32 << 16) | 0x1234;
assert_eq!(ccc_from_trie_value(v), 202);
}
#[test]
fn test_compose_pair_a_grave() {
let result = compose_pair('A', '\u{0300}');
assert_eq!(result, Some('\u{00C0}'));
}
#[test]
fn test_compose_pair_e_acute() {
let result = compose_pair('E', '\u{0301}');
assert_eq!(result, Some('\u{00C9}'));
}
#[test]
fn test_compose_pair_nonexistent() {
let result = compose_pair('Z', '\u{0300}');
assert_eq!(result, Option::None);
}
#[test]
fn test_compose_pair_non_composable() {
assert_eq!(compose_pair('x', 'y'), Option::None);
}
#[test]
fn test_lookup_ccc_grave_accent() {
assert_eq!(lookup_ccc('\u{0300}'), 230);
}
#[test]
fn test_lookup_ccc_cedilla() {
assert_eq!(lookup_ccc('\u{0327}'), 202);
}
#[test]
fn test_lookup_ccc_ascii() {
assert_eq!(lookup_ccc('A'), 0);
}
#[test]
fn test_canonical_decomp_ascii() {
let (decomp, ccc) = lookup_canonical_decomp('A');
assert_eq!(decomp, DecompResult::None);
assert_eq!(ccc, 0);
}
#[test]
fn test_canonical_decomp_a_grave() {
let (decomp, _ccc) = lookup_canonical_decomp('\u{00C0}');
match decomp {
DecompResult::Expansion { offset, length } => {
let data = canonical_expansion_data(offset, length);
assert_eq!(data.len(), 2);
assert_eq!(data[0] & EXPANSION_CP_MASK, 0x0041); assert_eq!(data[0] >> EXPANSION_CCC_SHIFT, 0); assert_eq!(data[1] & EXPANSION_CP_MASK, 0x0300); assert_eq!(data[1] >> EXPANSION_CCC_SHIFT, 230); },
DecompResult::Singleton(ch) => {
panic!("Expected Expansion for U+00C0, got Singleton({ch:?})");
},
DecompResult::None => {
panic!("Expected decomposition for U+00C0, got None");
},
}
}
#[test]
fn test_nfc_qc_ascii() {
assert_eq!(lookup_nfc_qc('A'), 0);
assert_eq!(lookup_nfc_qc('z'), 0);
}
#[test]
fn test_nfd_qc_ascii() {
assert_eq!(lookup_nfd_qc('A'), 0);
}
#[test]
fn test_nfd_qc_precomposed() {
let v = lookup_nfd_qc('\u{00C0}');
assert_eq!(v, 2);
}
#[test]
fn test_nfc_qc_combining_mark() {
let v = lookup_nfc_qc('\u{0300}');
assert_ne!(v, 0, "U+0300 should not be NFC_QC=Yes");
}
#[test]
fn test_nfkd_qc_ascii() {
assert_eq!(lookup_nfkd_qc('a'), 0);
}
#[test]
fn test_nfkc_qc_ascii() {
assert_eq!(lookup_nfkc_qc('a'), 0);
}
#[test]
fn test_trie_constructors_dont_panic() {
let _ = canonical_trie();
let _ = compat_trie();
let _ = ccc_trie();
let _ = nfc_qc_trie();
let _ = nfd_qc_trie();
let _ = nfkc_qc_trie();
let _ = nfkd_qc_trie();
}
#[test]
fn test_backward_combining_and_non_round_trip_bits() {
assert_eq!(BACKWARD_COMBINING & NON_ROUND_TRIP, 0);
assert_eq!(BACKWARD_COMBINING & HAS_DECOMPOSITION, 0);
assert_eq!(NON_ROUND_TRIP & HAS_DECOMPOSITION, 0);
assert_eq!(HAS_DECOMPOSITION & IS_EXPANSION, 0);
assert_eq!(IS_EXPANSION & CCC_MASK, 0);
assert_eq!(CCC_MASK & DECOMP_INFO_MASK, 0);
assert_eq!(NEEDS_STARTER_SHADOW & BACKWARD_COMBINING, 0);
assert_eq!(NEEDS_STARTER_SHADOW & NON_ROUND_TRIP, 0);
assert_eq!(NEEDS_STARTER_SHADOW & HAS_DECOMPOSITION, 0);
assert_eq!(NEEDS_STARTER_SHADOW & IS_EXPANSION, 0);
assert_eq!(NEEDS_STARTER_SHADOW & CCC_MASK, 0);
assert_eq!(NEEDS_STARTER_SHADOW & DECOMP_INFO_MASK, 0);
}
#[test]
fn pipelined_trie_walk_matches_serial() {
use crate::decompose::DecompForm;
for cp0 in (0u32..=0xFFFE).step_by(2) {
let cp1 = cp0 + 1;
if (0xD800..=0xDFFF).contains(&cp0) || (0xD800..=0xDFFF).contains(&cp1) {
continue;
}
let ch0 = char::from_u32(cp0).unwrap();
let ch1 = char::from_u32(cp1).unwrap();
let serial = [
raw_decomp_trie_value(ch0, DecompForm::Canonical),
raw_decomp_trie_value(ch1, DecompForm::Canonical),
];
let pipelined =
raw_decomp_trie_values_pipelined::<2>(&[cp0, cp1], DecompForm::Canonical);
assert_eq!(pipelined, serial, "mismatch at cp=U+{:04X}", cp0);
}
}
}