use super::normalization_data::{
CCC_TABLE, COMPOSITION_PAIRS, NFD_INDEX, NFD_POOL, NFKD_INDEX, NFKD_POOL,
};
// Parameters of the arithmetic Hangul syllable <-> jamo mapping used by
// `hangul_decompose_into` and `hangul_compose`.

/// Code point of the first precomposed Hangul syllable.
pub const HANGUL_S_BASE: u32 = 0xAC00;
/// Code point of the first leading-consonant (L) jamo.
pub const HANGUL_L_BASE: u32 = 0x1100;
/// Code point of the first vowel (V) jamo.
pub const HANGUL_V_BASE: u32 = 0x1161;
/// Base for trailing-consonant (T) jamo offsets. Offset 0 means "no trailing
/// consonant", so the first real T jamo is `HANGUL_T_BASE + 1`.
pub const HANGUL_T_BASE: u32 = 0x11A7;

/// Number of leading-consonant jamo.
pub const HANGUL_L_COUNT: u32 = 19;
/// Number of vowel jamo.
pub const HANGUL_V_COUNT: u32 = 21;
/// Number of trailing-consonant slots, including the "no trailing consonant" slot.
pub const HANGUL_T_COUNT: u32 = 28;

/// Number of syllables that share one leading consonant (`V_COUNT * T_COUNT`).
pub const HANGUL_N_COUNT: u32 = HANGUL_V_COUNT * HANGUL_T_COUNT;
/// Total number of precomposed syllables (`L_COUNT * N_COUNT`).
pub const HANGUL_S_COUNT: u32 = HANGUL_L_COUNT * HANGUL_N_COUNT;
/// Returns the combining class recorded for `cp` in `CCC_TABLE`.
///
/// The table is probed with a binary search on the code point, so it has to be
/// sorted by that field. Code points without an entry report class `0`.
#[inline]
pub fn ccc(cp: u32) -> u8 {
    CCC_TABLE
        .binary_search_by_key(&cp, |entry| entry.0)
        .map_or(0, |pos| CCC_TABLE[pos].1)
}
/// Looks up the canonical decomposition stored for `cp`.
///
/// `NFD_INDEX` is binary-searched by code point; a hit yields an
/// `(offset, length)` window into `NFD_POOL`, which is returned as a slice.
/// Returns `None` when `cp` has no entry in the index.
///
/// # Panics
///
/// Panics if an index entry points outside `NFD_POOL`.
#[inline]
pub fn nfd_lookup(cp: u32) -> Option<&'static [u32]> {
    match NFD_INDEX.binary_search_by_key(&cp, |entry| entry.0) {
        Ok(pos) => {
            let (_, offset, count) = NFD_INDEX[pos];
            let begin = offset as usize;
            Some(&NFD_POOL[begin..begin + count as usize])
        }
        Err(_) => None,
    }
}
/// Looks up the compatibility decomposition stored for `cp`.
///
/// `NFKD_INDEX` is binary-searched by code point; a hit yields an
/// `(offset, length)` window into `NFKD_POOL`, which is returned as a slice.
/// Returns `None` when `cp` has no entry in the index.
///
/// # Panics
///
/// Panics if an index entry points outside `NFKD_POOL`.
#[inline]
pub fn nfkd_lookup(cp: u32) -> Option<&'static [u32]> {
    match NFKD_INDEX.binary_search_by_key(&cp, |entry| entry.0) {
        Ok(pos) => {
            let (_, offset, count) = NFKD_INDEX[pos];
            let begin = offset as usize;
            Some(&NFKD_POOL[begin..begin + count as usize])
        }
        Err(_) => None,
    }
}
/// Returns the composite stored in `COMPOSITION_PAIRS` for `(first, second)`.
///
/// The table is binary-searched on the `(first, second)` key, so it has to be
/// sorted by that pair. Returns `None` when the pair has no entry.
#[inline]
pub fn compose_pair(first: u32, second: u32) -> Option<u32> {
    let wanted = (first, second);
    COMPOSITION_PAIRS
        .binary_search_by(|entry| (entry.0, entry.1).cmp(&wanted))
        .ok()
        .map(|pos| COMPOSITION_PAIRS[pos].2)
}
/// Appends the jamo decomposition of a precomposed Hangul syllable to `out`.
///
/// Returns `false`, leaving `out` untouched, when `cp` lies outside the
/// syllable block (`HANGUL_S_BASE .. HANGUL_S_BASE + HANGUL_S_COUNT`).
/// Otherwise pushes the leading consonant and the vowel, followed by the
/// trailing consonant if the syllable has one, and returns `true`.
#[inline]
pub fn hangul_decompose_into(cp: u32, out: &mut Vec<u32>) -> bool {
    // Offset of `cp` inside the syllable block, or bail out if it is not in it.
    let s_index = match cp.checked_sub(HANGUL_S_BASE) {
        Some(offset) if offset < HANGUL_S_COUNT => offset,
        _ => return false,
    };

    let l_index = s_index / HANGUL_N_COUNT;
    let v_index = (s_index % HANGUL_N_COUNT) / HANGUL_T_COUNT;
    let t_index = s_index % HANGUL_T_COUNT;

    out.extend_from_slice(&[HANGUL_L_BASE + l_index, HANGUL_V_BASE + v_index]);
    // A trailing index of 0 means the syllable has no trailing consonant.
    if t_index > 0 {
        out.push(HANGUL_T_BASE + t_index);
    }
    true
}
/// Arithmetically composes two Hangul code points, if they form a syllable.
///
/// Two shapes are recognised:
/// * leading consonant + vowel, which yields the syllable without a trailing
///   consonant;
/// * a syllable without a trailing consonant + trailing consonant, which
///   yields the full syllable.
///
/// Any other pair returns `None`.
#[inline]
pub fn hangul_compose(first: u32, second: u32) -> Option<u32> {
    // Case 1: L jamo followed by V jamo.
    let l_index = first
        .checked_sub(HANGUL_L_BASE)
        .filter(|&i| i < HANGUL_L_COUNT);
    let v_index = second
        .checked_sub(HANGUL_V_BASE)
        .filter(|&i| i < HANGUL_V_COUNT);
    if let (Some(l), Some(v)) = (l_index, v_index) {
        return Some(HANGUL_S_BASE + (l * HANGUL_V_COUNT + v) * HANGUL_T_COUNT);
    }

    // Case 2: syllable followed by a T jamo. Trailing offset 0 is the
    // "no trailing consonant" slot and therefore not a valid `second`.
    let s_index = first
        .checked_sub(HANGUL_S_BASE)
        .filter(|&i| i < HANGUL_S_COUNT)?;
    let t_index = second
        .checked_sub(HANGUL_T_BASE)
        .filter(|i| (1..HANGUL_T_COUNT).contains(i))?;
    // Only a syllable that has no trailing consonant yet can take one.
    if s_index.is_multiple_of(HANGUL_T_COUNT) {
        Some(first + t_index)
    } else {
        None
    }
}
/// Selects which decomposition table `decompose_to_buffer` consults.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DecompKind {
/// Use `nfd_lookup`; this is what `to_nfd` and `to_nfc` request.
Canonical,
/// Use `nfkd_lookup`; this is what `to_nfkd` and `to_nfkc` request.
Compatibility,
}
/// Decomposes `input` code point by code point and appends the result to `out`.
///
/// For every character, in order:
/// * a precomposed Hangul syllable is expanded arithmetically;
/// * otherwise the table selected by `kind` is consulted and its mapping is
///   appended as-is (no further expansion happens here, so the tables are
///   presumably stored fully expanded);
/// * a character without a mapping is appended unchanged.
///
/// The output is not reordered; see `canonical_reorder` for that step.
pub fn decompose_to_buffer(input: &str, kind: DecompKind, out: &mut Vec<u32>) {
    out.reserve(input.len() * 2);
    for ch in input.chars() {
        let cp = u32::from(ch);
        if hangul_decompose_into(cp, out) {
            continue;
        }
        let table_hit = match kind {
            DecompKind::Canonical => nfd_lookup(cp),
            DecompKind::Compatibility => nfkd_lookup(cp),
        };
        if let Some(expansion) = table_hit {
            out.extend_from_slice(expansion);
        } else {
            out.push(cp);
        }
    }
}
/// Sorts every maximal run of code points with a non-zero combining class by
/// that class, in place.
///
/// Code points with class 0 are never moved and delimit the runs. The sort is
/// stable, so code points of equal class keep their relative order.
pub fn canonical_reorder(buf: &mut [u32]) {
    let mut cursor = 0;
    // Jump to the next code point with a non-zero class; stop when none is left.
    while let Some(skip) = buf[cursor..].iter().position(|&cp| ccc(cp) != 0) {
        let run_start = cursor + skip;
        let run_len = buf[run_start..]
            .iter()
            .take_while(|&&cp| ccc(cp) != 0)
            .count();
        let run_end = run_start + run_len;
        buf[run_start..run_end].sort_by_key(|&cp| ccc(cp));
        cursor = run_end;
    }
}
/// Decomposes `input` according to `kind` and puts combining marks into
/// canonical order, returning the resulting code points.
pub fn decompose_and_reorder(input: &str, kind: DecompKind) -> Vec<u32> {
    let mut code_points: Vec<u32> = Vec::with_capacity(input.len() + 4);
    decompose_to_buffer(input, kind, &mut code_points);
    canonical_reorder(code_points.as_mut_slice());
    code_points
}
/// Builds a `String` from a sequence of code points.
///
/// Values that are not valid Unicode scalar values (surrogates, or anything
/// above `0x10FFFF`) are silently skipped.
pub fn encode(cps: &[u32]) -> String {
    let mut text = String::with_capacity(cps.len());
    text.extend(cps.iter().copied().filter_map(char::from_u32));
    text
}
/// Returns `input` canonically decomposed and reordered (NFD).
pub fn to_nfd(input: &str) -> String {
    let code_points = decompose_and_reorder(input, DecompKind::Canonical);
    encode(&code_points)
}
/// Returns `input` compatibility-decomposed and reordered (NFKD).
pub fn to_nfkd(input: &str) -> String {
    let code_points = decompose_and_reorder(input, DecompKind::Compatibility);
    encode(&code_points)
}
/// Canonically composes a decomposed, canonically ordered code point sequence.
///
/// Each code point is tried against the most recent starter (a code point
/// with combining class 0) that is still in the output. If the pair has a
/// composite (arithmetic Hangul composition first, then the composition
/// table) and the code point is not *blocked* from the starter, the starter
/// is replaced by the composite and the code point is consumed. Otherwise the
/// code point is appended unchanged.
///
/// A code point is blocked when something that was not composed away sits
/// between it and the starter with a combining class greater than or equal to
/// its own. In particular a starter (class 0) only composes with the starter
/// *immediately* before it: `<U+1100, U+0301, U+1161>` is left unchanged
/// rather than becoming `<U+AC00, U+0301>`.
///
/// `buf` is expected to be the output of `decompose_and_reorder`; the
/// blocking test relies on combining marks being in canonical order.
pub fn compose(buf: Vec<u32>) -> Vec<u32> {
    if buf.is_empty() {
        return buf;
    }
    let mut out: Vec<u32> = Vec::with_capacity(buf.len());
    // Index in `out` of the most recent starter, if one has been emitted.
    let mut last_starter: Option<usize> = None;
    // Combining class of the last code point pushed to `out`. It is 0 exactly
    // when that code point is the starter itself, i.e. nothing separates the
    // starter from the code point being examined.
    let mut last_ccc: u8 = 0;
    for cp in buf {
        let cur_ccc = ccc(cp);
        if let Some(starter_idx) = last_starter {
            // Blocked when an intervening mark has class >= the current one.
            // With `cur_ccc == 0` every intervening mark blocks, so starters
            // only combine when adjacent.
            let blocked = last_ccc != 0 && last_ccc >= cur_ccc;
            if !blocked {
                let starter_cp = out[starter_idx];
                let composed =
                    hangul_compose(starter_cp, cp).or_else(|| compose_pair(starter_cp, cp));
                if let Some(comp) = composed {
                    // `cp` is absorbed into the starter; `last_ccc` stays as
                    // it was because nothing new was appended.
                    out[starter_idx] = comp;
                    continue;
                }
            }
        }
        out.push(cp);
        if cur_ccc == 0 {
            last_starter = Some(out.len() - 1);
            last_ccc = 0;
        } else {
            last_ccc = cur_ccc;
        }
    }
    out
}
/// Returns `input` canonically decomposed, reordered and then recomposed (NFC).
pub fn to_nfc(input: &str) -> String {
    let composed = compose(decompose_and_reorder(input, DecompKind::Canonical));
    encode(&composed)
}
/// Returns `input` compatibility-decomposed, reordered and then recomposed (NFKC).
pub fn to_nfkc(input: &str) -> String {
    let composed = compose(decompose_and_reorder(input, DecompKind::Compatibility));
    encode(&composed)
}
/// Serialises a decomposition index and its pool as little-endian `u32` words.
///
/// Layout: entry count, then `(code point, offset, length)` per index entry
/// (the `u8` length is widened to a full word), then the pool length, then
/// every pool code point.
pub fn encode_decomp_table_bytes(index: &[(u32, u32, u8)], pool: &[u32]) -> Vec<u8> {
    const WORD: usize = 4;
    let capacity = WORD + index.len() * 3 * WORD + WORD + pool.len() * WORD;
    let mut bytes = Vec::with_capacity(capacity);

    bytes.extend_from_slice(&(index.len() as u32).to_le_bytes());
    for &(cp, off, len) in index {
        for word in [cp, off, u32::from(len)] {
            bytes.extend_from_slice(&word.to_le_bytes());
        }
    }

    bytes.extend_from_slice(&(pool.len() as u32).to_le_bytes());
    bytes.extend(pool.iter().flat_map(|cp| cp.to_le_bytes()));
    bytes
}
/// Serialises a combining-class table as little-endian `u32` words.
///
/// Layout: entry count, then `(code point, class)` per entry, with the `u8`
/// class widened to a full word.
pub fn encode_ccc_table_bytes(table: &[(u32, u8)]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(4 + table.len() * 8);
    bytes.extend_from_slice(&(table.len() as u32).to_le_bytes());
    bytes.extend(
        table
            .iter()
            .flat_map(|&(cp, class)| [cp, u32::from(class)])
            .flat_map(u32::to_le_bytes),
    );
    bytes
}
/// Serialises a composition table as little-endian `u32` words.
///
/// Layout: entry count, then `(first, second, composed)` per entry.
pub fn encode_composition_table_bytes(table: &[(u32, u32, u32)]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(4 + table.len() * 12);
    bytes.extend_from_slice(&(table.len() as u32).to_le_bytes());
    for &(first, second, composed) in table {
        for word in [first, second, composed] {
            bytes.extend_from_slice(&word.to_le_bytes());
        }
    }
    bytes
}
/// Unit tests for the normalization entry points, the table serialisers and
/// a few sanity checks on the generated tables.
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII has no decompositions or compositions, so every form is a no-op.
    #[test]
    fn ascii_roundtrips_unchanged() {
        for s in ["", "hello", "ABC 123", "the quick brown fox"] {
            assert_eq!(to_nfc(s), s);
            assert_eq!(to_nfd(s), s);
            assert_eq!(to_nfkc(s), s);
            assert_eq!(to_nfkd(s), s);
        }
    }

    /// `e` + combining acute composes to U+00E9; already composed text is stable.
    #[test]
    fn nfc_composes_combining_acute() {
        let decomposed = "cafe\u{0301}";
        let composed = "caf\u{00E9}";
        assert_eq!(to_nfc(decomposed), composed);
        assert_eq!(to_nfc(composed), composed);
    }

    /// U+00E9 decomposes to `e` + combining acute; decomposed text is stable.
    #[test]
    fn nfd_decomposes_precomposed_acute() {
        let composed = "caf\u{00E9}";
        let decomposed = "cafe\u{0301}";
        assert_eq!(to_nfd(composed), decomposed);
        assert_eq!(to_nfd(decomposed), decomposed);
    }

    /// A precomposed syllable is split into L, V and T jamo.
    #[test]
    fn hangul_nfd_uses_algorithmic_decomposition() {
        let composed = "\u{D55C}";
        let decomposed = "\u{1112}\u{1161}\u{11AB}";
        assert_eq!(to_nfd(composed), decomposed);
    }

    /// L, V and T jamo are merged back into the precomposed syllable.
    #[test]
    fn hangul_nfc_recomposes_jamos() {
        let composed = "\u{D55C}";
        let decomposed = "\u{1112}\u{1161}\u{11AB}";
        assert_eq!(to_nfc(decomposed), composed);
    }

    /// U+00BD only has a compatibility mapping: NFKD expands it, NFD does not.
    #[test]
    fn nfkd_expands_compatibility_form() {
        let input = "\u{00BD}";
        let expected = "1\u{2044}2";
        assert_eq!(to_nfkd(input), expected);
        assert_eq!(to_nfd(input), input);
    }

    /// Composition after a compatibility decomposition must not rebuild U+00BD.
    #[test]
    fn nfkc_does_not_recompose_compatibility_fraction() {
        assert_eq!(to_nfkc("\u{00BD}"), "1\u{2044}2");
    }

    /// Dot above followed by dot below is swapped so the marks end up sorted
    /// by combining class.
    #[test]
    fn canonical_reorder_sorts_combining_marks_by_ccc() {
        let input = "a\u{0307}\u{0323}";
        let expected = "a\u{0323}\u{0307}";
        assert_eq!(to_nfd(input), expected);
        assert_eq!(to_nfd(expected), expected);
    }

    /// Applying NFC twice gives the same result as applying it once.
    #[test]
    fn nfc_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "1\u{2044}2",
            "a\u{0307}\u{0323}b",
        ] {
            let once = to_nfc(s);
            assert_eq!(to_nfc(&once), once, "NFC idempotence fail on {s:?}");
        }
    }

    /// Applying NFD twice gives the same result as applying it once.
    #[test]
    fn nfd_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "a\u{0307}\u{0323}b",
        ] {
            let once = to_nfd(s);
            assert_eq!(to_nfd(&once), once, "NFD idempotence fail on {s:?}");
        }
    }

    /// Applying NFKC twice gives the same result as applying it once.
    #[test]
    fn nfkc_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "\u{00BD}",
            "\u{FB01}le",
        ] {
            let once = to_nfkc(s);
            assert_eq!(to_nfkc(&once), once, "NFKC idempotence fail on {s:?}");
        }
    }

    /// Applying NFKD twice gives the same result as applying it once.
    #[test]
    fn nfkd_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "\u{00BD}",
            "\u{FB01}le",
        ] {
            let once = to_nfkd(s);
            assert_eq!(to_nfkd(&once), once, "NFKD idempotence fail on {s:?}");
        }
    }

    /// The Kelvin sign normalises to `K` and `K` is never turned back into it.
    #[test]
    fn nfc_skips_full_composition_exclusion() {
        assert_eq!(to_nfc("K"), "K");
        assert_eq!(to_nfc("\u{212A}"), "K");
    }

    /// The Kelvin sign has a canonical mapping to ASCII `K`.
    #[test]
    fn nfd_decomposes_kelvin_to_ascii_k() {
        assert_eq!(to_nfd("\u{212A}"), "K");
    }

    /// The `fi` ligature is split by both compatibility forms.
    #[test]
    fn ligature_nfkc_splits_into_components() {
        assert_eq!(to_nfkd("\u{FB01}"), "fi");
        assert_eq!(to_nfkc("\u{FB01}"), "fi");
    }

    /// After `a` + diaeresis composes, the following acute stays separate.
    #[test]
    fn nfc_starter_blocking_prevents_invalid_composition() {
        assert_eq!(to_nfc("a\u{0308}\u{0301}"), "\u{00E4}\u{0301}");
    }

    /// Checks the byte layout produced for a decomposition index plus pool.
    #[test]
    fn encode_decomp_table_layout() {
        let index: &[(u32, u32, u8)] = &[(0x00C0, 0, 2), (0x00C1, 2, 2)];
        let pool: &[u32] = &[0x0041, 0x0300, 0x0041, 0x0301];
        let bytes = encode_decomp_table_bytes(index, pool);
        assert_eq!(bytes.len(), 4 + 2 * 12 + 4 + 4 * 4);
        assert_eq!(&bytes[0..4], &2u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x00C0u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &0u32.to_le_bytes());
        assert_eq!(&bytes[12..16], &2u32.to_le_bytes());
        assert_eq!(&bytes[28..32], &4u32.to_le_bytes());
        assert_eq!(&bytes[32..36], &0x0041u32.to_le_bytes());
    }

    /// Checks the byte layout produced for a combining-class table.
    #[test]
    fn encode_ccc_table_layout() {
        let table: &[(u32, u8)] = &[(0x0300, 230), (0x0301, 230)];
        let bytes = encode_ccc_table_bytes(table);
        assert_eq!(bytes.len(), 4 + 2 * 8);
        assert_eq!(&bytes[0..4], &2u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x0300u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &230u32.to_le_bytes());
    }

    /// Checks the byte layout produced for a composition table.
    #[test]
    fn encode_composition_table_layout() {
        let table: &[(u32, u32, u32)] = &[(0x0041, 0x0300, 0x00C0)];
        let bytes = encode_composition_table_bytes(table);
        assert_eq!(bytes.len(), 4 + 12);
        assert_eq!(&bytes[0..4], &1u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x0041u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &0x0300u32.to_le_bytes());
        assert_eq!(&bytes[12..16], &0x00C0u32.to_le_bytes());
    }

    /// Spot-checks the combining-class table: acute is 230, `A` is a starter.
    #[test]
    fn ccc_table_contains_combining_acute() {
        assert_eq!(ccc(0x0301), 230);
        assert_eq!(ccc(0x0041), 0);
    }

    /// `compose_pair` binary-searches the table, so it must be strictly sorted
    /// by `(first, second)`, and the Kelvin sign must never be a composite.
    #[test]
    fn composition_table_sorted_and_excludes_kelvin() {
        for w in COMPOSITION_PAIRS.windows(2) {
            let a = (w[0].0, w[0].1);
            let b = (w[1].0, w[1].1);
            assert!(a < b, "COMPOSITION_PAIRS must be sorted: {a:?} >= {b:?}");
        }
        // Catches a singleton mapping stored under the key `(K, 0)`.
        let kelvin_idx = COMPOSITION_PAIRS.binary_search_by(|t| (t.0, t.1).cmp(&(0x004B, 0)));
        assert!(
            kelvin_idx.is_err(),
            "U+212A should be excluded from COMPOSITION_PAIRS"
        );
        // The probe above covers a single key only; also make sure that no
        // pair at all composes to U+212A.
        assert!(
            COMPOSITION_PAIRS.iter().all(|t| t.2 != 0x212A),
            "U+212A should be excluded from COMPOSITION_PAIRS"
        );
    }
}