Skip to main content

relon_unicode/
normalization.rs

1//! Unicode normalization (UAX #15).
2//!
3//! v3++ b-5: implements the four standard normalization forms - NFC,
4//! NFD, NFKC, NFKD - directly against the embedded UCD 14.0.0 tables
5//! in [`super::normalization_data`]. The implementation is intentionally
6//! third-party-free so:
7//!
8//!   * Both the tree-walk evaluator and the wasm-AOT backend share
9//!     **one** dataset and one algorithm, avoiding silent drift
10//!     between executors.
11//!   * Bumping the Unicode version is a single regenerate-and-commit
12//!     step (see `tools/gen_normalization_tables.py`).
13//!
14//! The four entry points ([`to_nfd`], [`to_nfkd`], [`to_nfc`],
15//! [`to_nfkc`]) all return owned [`String`]s. Hangul syllables are
16//! decomposed / composed algorithmically per UAX #15 section 16 -
17//! keeping them in the data tables would cost ~88 KB for the syllable
18//! block alone with no performance gain.
19//!
20//! ### Algorithm sketch
21//!
22//! * **NFD**:  decode each `char` -> recursive canonical decomposition
23//!   (data table + Hangul algorithm) -> canonical reorder (stable sort
24//!   on CCC within each non-starter run) -> re-encode.
25//! * **NFKD**: same as NFD but using the compatibility table.
26//! * **NFC**:  run NFD, then a single left-to-right composition pass
27//!   that pairs each starter with subsequent characters via
28//!   `COMPOSITION_PAIRS` plus the algorithmic Hangul composer.
29//! * **NFKC**: run NFKD, then the same composition pass.
30//!
31//! Excluded composites (`Full_Composition_Exclusion` plus the explicit
32//! `CompositionExclusions.txt` list) are absent from
33//! `COMPOSITION_PAIRS` at generation time, so the composition pass
34//! never needs to consult an exclusion table at runtime.
35
36use super::normalization_data::{
37    CCC_TABLE, COMPOSITION_PAIRS, NFD_INDEX, NFD_POOL, NFKD_INDEX, NFKD_POOL,
38};
39
// Parameters of the arithmetic Hangul syllable <-> jamo mapping.

/// Code point of the first precomposed Hangul syllable, U+AC00.
pub const HANGUL_S_BASE: u32 = 0xAC00;
/// Code point of the first leading-consonant (L) jamo, U+1100.
pub const HANGUL_L_BASE: u32 = 0x1100;
/// Code point of the first vowel (V) jamo, U+1161.
pub const HANGUL_V_BASE: u32 = 0x1161;
/// Base for trailing-consonant (T) offsets, U+11A7. Offset 0 is the
/// "no trailing consonant" filler and never takes part in composition;
/// actual trailing jamos occupy `T_BASE + 1 ..= T_BASE + T_COUNT - 1`.
pub const HANGUL_T_BASE: u32 = 0x11A7;
/// Number of leading-consonant jamos.
pub const HANGUL_L_COUNT: u32 = 19;
/// Number of vowel jamos.
pub const HANGUL_V_COUNT: u32 = 21;
/// Number of trailing-consonant slots, counting the filler at offset 0.
pub const HANGUL_T_COUNT: u32 = 28;
/// Syllables per leading jamo: `HANGUL_V_COUNT * HANGUL_T_COUNT`.
pub const HANGUL_N_COUNT: u32 = HANGUL_V_COUNT * HANGUL_T_COUNT; // 588
/// Size of the whole precomposed syllable block.
pub const HANGUL_S_COUNT: u32 = HANGUL_L_COUNT * HANGUL_N_COUNT; // 11172
60
61/// Canonical_Combining_Class for `cp`. Returns 0 for any code point
62/// not present in [`CCC_TABLE`] - the table only stores non-zero
63/// classes (Not_Reordered is the default).
64#[inline]
65pub fn ccc(cp: u32) -> u8 {
66    match CCC_TABLE.binary_search_by_key(&cp, |entry| entry.0) {
67        Ok(idx) => CCC_TABLE[idx].1,
68        Err(_) => 0,
69    }
70}
71
72/// Look up the canonical decomposition of `cp` in [`NFD_INDEX`] /
73/// [`NFD_POOL`]. Returns `None` if `cp` has no canonical
74/// decomposition.
75#[inline]
76pub fn nfd_lookup(cp: u32) -> Option<&'static [u32]> {
77    let idx = NFD_INDEX.binary_search_by_key(&cp, |entry| entry.0).ok()?;
78    let (_, off, len) = NFD_INDEX[idx];
79    let start = off as usize;
80    let end = start + len as usize;
81    Some(&NFD_POOL[start..end])
82}
83
84/// Compatibility analog of [`nfd_lookup`]. Falls back to the canonical
85/// entry when no compatibility mapping exists (the generator script
86/// duplicates canonical-only entries into NFKD as well).
87#[inline]
88pub fn nfkd_lookup(cp: u32) -> Option<&'static [u32]> {
89    let idx = NFKD_INDEX.binary_search_by_key(&cp, |entry| entry.0).ok()?;
90    let (_, off, len) = NFKD_INDEX[idx];
91    let start = off as usize;
92    let end = start + len as usize;
93    Some(&NFKD_POOL[start..end])
94}
95
96/// Composition pair lookup: `(first, second) -> composed`. Returns
97/// `None` when no canonical composition exists or when the composite
98/// is on the exclusion list (filtered out at table-generation time,
99/// so the runtime never re-checks).
100#[inline]
101pub fn compose_pair(first: u32, second: u32) -> Option<u32> {
102    let idx = COMPOSITION_PAIRS
103        .binary_search_by(|entry| (entry.0, entry.1).cmp(&(first, second)))
104        .ok()?;
105    Some(COMPOSITION_PAIRS[idx].2)
106}
107
108/// Algorithmic Hangul decomposition. Returns the L / V (/ optional T)
109/// jamo sequence in `out` when `cp` is in the syllable block, or
110/// `false` if `cp` is not a precomposed Hangul syllable.
111#[inline]
112pub fn hangul_decompose_into(cp: u32, out: &mut Vec<u32>) -> bool {
113    if !(HANGUL_S_BASE..HANGUL_S_BASE + HANGUL_S_COUNT).contains(&cp) {
114        return false;
115    }
116    let s_index = cp - HANGUL_S_BASE;
117    let l = HANGUL_L_BASE + s_index / HANGUL_N_COUNT;
118    let v = HANGUL_V_BASE + (s_index % HANGUL_N_COUNT) / HANGUL_T_COUNT;
119    let t_offset = s_index % HANGUL_T_COUNT;
120    out.push(l);
121    out.push(v);
122    if t_offset != 0 {
123        out.push(HANGUL_T_BASE + t_offset);
124    }
125    true
126}
127
128/// Algorithmic Hangul composition. Tries L + V (and optionally + T)
129/// -> precomposed syllable. Returns `None` when the pair is not a
130/// valid jamo pairing.
131#[inline]
132pub fn hangul_compose(first: u32, second: u32) -> Option<u32> {
133    // L + V -> LV syllable.
134    if (HANGUL_L_BASE..HANGUL_L_BASE + HANGUL_L_COUNT).contains(&first)
135        && (HANGUL_V_BASE..HANGUL_V_BASE + HANGUL_V_COUNT).contains(&second)
136    {
137        let l_index = first - HANGUL_L_BASE;
138        let v_index = second - HANGUL_V_BASE;
139        return Some(HANGUL_S_BASE + (l_index * HANGUL_V_COUNT + v_index) * HANGUL_T_COUNT);
140    }
141    // LV + T -> LVT syllable. We detect "LV-shaped" by checking
142    // `(cp - S_BASE) % T_COUNT == 0` - that's exactly the precomposed
143    // LV syllables. T_BASE itself is the filler; skip it.
144    if (HANGUL_S_BASE..HANGUL_S_BASE + HANGUL_S_COUNT).contains(&first) {
145        let s_index = first - HANGUL_S_BASE;
146        if s_index.is_multiple_of(HANGUL_T_COUNT)
147            && (HANGUL_T_BASE + 1..HANGUL_T_BASE + HANGUL_T_COUNT).contains(&second)
148        {
149            return Some(first + (second - HANGUL_T_BASE));
150        }
151    }
152    None
153}
154
/// Selects which decomposition table [`decompose_to_buffer`] consults.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecompKind {
    /// Canonical mappings: the decomposition step behind NFD and NFC.
    Canonical,
    /// Compatibility mappings: the decomposition step behind NFKD and NFKC.
    Compatibility,
}
163
164/// Decompose `input` into `out` using the requested table. The payload
165/// tables are already fully expanded (the generator script flattens
166/// nested decompositions), so a single lookup per code point is
167/// sufficient - no recursion needed at runtime.
168pub fn decompose_to_buffer(input: &str, kind: DecompKind, out: &mut Vec<u32>) {
169    // Worst-case expansion factor across all Unicode 14.0 mappings is
170    // 18 (U+FDFA -> 18 cps). Reserve roughly that to keep reallocs
171    // off the hot path for compatibility decomposition; canonical
172    // expansion stays bounded around 4.
173    out.reserve(input.len() * 2);
174    for ch in input.chars() {
175        let cp = ch as u32;
176        if hangul_decompose_into(cp, out) {
177            continue;
178        }
179        let mapping = match kind {
180            DecompKind::Canonical => nfd_lookup(cp),
181            DecompKind::Compatibility => nfkd_lookup(cp),
182        };
183        match mapping {
184            Some(slice) => out.extend_from_slice(slice),
185            None => out.push(cp),
186        }
187    }
188}
189
190/// Canonical reorder pass (UAX #15 D109): within every run of
191/// non-starters (CCC > 0) sort code points by CCC ascending, stably.
192/// Starters (CCC == 0) are anchors that break runs.
193pub fn canonical_reorder(buf: &mut [u32]) {
194    let len = buf.len();
195    let mut i = 0;
196    while i < len {
197        if ccc(buf[i]) == 0 {
198            i += 1;
199            continue;
200        }
201        let start = i;
202        while i < len && ccc(buf[i]) != 0 {
203            i += 1;
204        }
205        // `sort_by_key` is stable in std, which matters: same-CCC code
206        // points must keep their original order or Quick_Check
207        // round-trips break.
208        buf[start..i].sort_by_key(|&cp| ccc(cp));
209    }
210}
211
212/// Common scaffold: decompose into a `Vec<u32>` then canonical-reorder.
213pub fn decompose_and_reorder(input: &str, kind: DecompKind) -> Vec<u32> {
214    let mut buf = Vec::with_capacity(input.len() + 4);
215    decompose_to_buffer(input, kind, &mut buf);
216    canonical_reorder(&mut buf);
217    buf
218}
219
/// Converts a sequence of code points back into a `String`.
///
/// Values rejected by `char::from_u32` (surrogates and anything above
/// U+10FFFF) are skipped without error. The tables are not expected to
/// produce such values; filtering them keeps `from_u32_unchecked` out
/// of the code entirely.
pub fn encode(cps: &[u32]) -> String {
    let mut text = String::with_capacity(cps.len());
    text.extend(cps.iter().copied().filter_map(char::from_u32));
    text
}
233
234/// Public: NFD.
235pub fn to_nfd(input: &str) -> String {
236    encode(&decompose_and_reorder(input, DecompKind::Canonical))
237}
238
239/// Public: NFKD.
240pub fn to_nfkd(input: &str) -> String {
241    encode(&decompose_and_reorder(input, DecompKind::Compatibility))
242}
243
244/// Canonical composition pass (UAX #15 section 16). Operates on a
245/// `Vec<u32>` that has already been decomposed and reordered.
246pub fn compose(buf: Vec<u32>) -> Vec<u32> {
247    if buf.is_empty() {
248        return buf;
249    }
250    let mut out: Vec<u32> = Vec::with_capacity(buf.len());
251    // Index in `out` of the most recent starter that can still absorb
252    // following non-starters. `usize::MAX` means "no live starter yet".
253    let mut last_starter: usize = usize::MAX;
254    // CCC of the last non-starter we've emitted since `last_starter`.
255    let mut last_ccc: u8 = 0;
256
257    for cp in buf {
258        let cur_ccc = ccc(cp);
259        if last_starter != usize::MAX {
260            let starter_cp = out[last_starter];
261            // Try Hangul composition first - pure algorithm, no table
262            // hit at all.
263            let composed = hangul_compose(starter_cp, cp).or_else(|| compose_pair(starter_cp, cp));
264            if let Some(comp) = composed {
265                // The composition is only valid if `cp` is not
266                // "blocked" by a preceding non-starter of equal or
267                // higher CCC. Starters (cur_ccc == 0) are never blocked
268                // but they also don't have last_ccc semantics until
269                // they become the new starter.
270                let blocked = cur_ccc != 0 && last_ccc >= cur_ccc;
271                if !blocked {
272                    out[last_starter] = comp;
273                    continue;
274                }
275            }
276        }
277        out.push(cp);
278        if cur_ccc == 0 {
279            last_starter = out.len() - 1;
280            last_ccc = 0;
281        } else {
282            last_ccc = cur_ccc;
283        }
284    }
285    out
286}
287
288/// Public: NFC.
289pub fn to_nfc(input: &str) -> String {
290    let decomposed = decompose_and_reorder(input, DecompKind::Canonical);
291    encode(&compose(decomposed))
292}
293
294/// Public: NFKC.
295pub fn to_nfkc(input: &str) -> String {
296    let decomposed = decompose_and_reorder(input, DecompKind::Compatibility);
297    encode(&compose(decomposed))
298}
299
300// -------------------------------------------------------------------
301// Table encoding helpers for the wasm-AOT backend.
302//
303// The wasm-AOT bodies embed the four normalization tables into the
304// const data section so the runtime can binary-search them via raw
305// memory loads. The helpers below produce the matching byte layouts.
306// -------------------------------------------------------------------
307
/// Serializes a decomposition index and its payload pool (for example
/// [`NFD_INDEX`] + [`NFD_POOL`]) into the byte layout read by the wasm
/// runtime.
///
/// Layout, all integers `u32` little-endian:
///
/// 1. `index_count`
/// 2. `index_count` records of `(cp, pool_off, pool_len)`, 12 bytes
///    each, so a record sits at `table_addr + 4 + mid * 12`
/// 3. `pool_count`
/// 4. `pool_count` payload code points
///
/// `pool_len` is stored as `u8` in memory but written as `u32` so that
/// every field keeps a 4-byte stride and can be read with `i32.load`.
pub fn encode_decomp_table_bytes(index: &[(u32, u32, u8)], pool: &[u32]) -> Vec<u8> {
    let total = 4 + index.len() * 12 + 4 + pool.len() * 4;
    let mut bytes: Vec<u8> = Vec::with_capacity(total);

    // Index section: count header, then fixed-width records.
    bytes.extend_from_slice(&(index.len() as u32).to_le_bytes());
    for &(cp, off, len) in index {
        for field in [cp, off, u32::from(len)] {
            bytes.extend_from_slice(&field.to_le_bytes());
        }
    }

    // Pool section: count header, then the raw code points.
    bytes.extend_from_slice(&(pool.len() as u32).to_le_bytes());
    bytes.extend(pool.iter().flat_map(|word| word.to_le_bytes()));
    bytes
}
334
/// Serializes the canonical-combining-class table for the wasm runtime.
///
/// Layout: `count` as `u32` LE, then `count` records of
/// `(cp: u32 LE, ccc: u32 LE)`. Each record is 8 bytes, so the runtime
/// can address one as `table_addr + 4 + mid * 8`, the same arithmetic
/// the existing case-folding helper uses. The class is widened from
/// `u8` to `u32` to keep every field on a 4-byte stride, as with the
/// decomposition table's `pool_len`.
pub fn encode_ccc_table_bytes(table: &[(u32, u8)]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(4 + table.len() * 8);
    bytes.extend_from_slice(&(table.len() as u32).to_le_bytes());
    bytes.extend(table.iter().flat_map(|&(cp, class)| {
        // Assemble one fixed-width record: code point, then class.
        let mut record = [0u8; 8];
        record[..4].copy_from_slice(&cp.to_le_bytes());
        record[4..].copy_from_slice(&u32::from(class).to_le_bytes());
        record
    }));
    bytes
}
352
/// Serializes the canonical composition pair table for the wasm runtime.
///
/// Layout: `count` as `u32` LE, then `count` records of
/// `(first, second, composed)`, each field `u32` LE, 12 bytes per
/// record. Records are written in the order given; the caller supplies
/// them sorted lexicographically by `(first, second)` so the runtime
/// can binary-search on the combined 64-bit key.
pub fn encode_composition_table_bytes(table: &[(u32, u32, u32)]) -> Vec<u8> {
    let header = (table.len() as u32).to_le_bytes();
    let mut bytes = Vec::with_capacity(4 + table.len() * 12);
    bytes.extend_from_slice(&header);
    for &(first, second, composed) in table {
        for field in [first, second, composed] {
            bytes.extend_from_slice(&field.to_le_bytes());
        }
    }
    bytes
}
369
#[cfg(test)]
mod tests {
    //! Unit tests for the four normalization forms, the Hangul
    //! arithmetic, and the wasm table encoders.

    use super::*;

    #[test]
    fn ascii_roundtrips_unchanged() {
        for s in ["", "hello", "ABC 123", "the quick brown fox"] {
            assert_eq!(to_nfc(s), s);
            assert_eq!(to_nfd(s), s);
            assert_eq!(to_nfkc(s), s);
            assert_eq!(to_nfkd(s), s);
        }
    }

    #[test]
    fn nfc_composes_combining_acute() {
        // "café" written as 'e' + U+0301 should compose to the
        // precomposed e-acute (U+00E9).
        let decomposed = "cafe\u{0301}";
        let composed = "caf\u{00E9}";
        assert_eq!(to_nfc(decomposed), composed);
        assert_eq!(to_nfc(composed), composed);
    }

    #[test]
    fn nfd_decomposes_precomposed_acute() {
        let composed = "caf\u{00E9}";
        let decomposed = "cafe\u{0301}";
        assert_eq!(to_nfd(composed), decomposed);
        assert_eq!(to_nfd(decomposed), decomposed);
    }

    #[test]
    fn hangul_nfd_uses_algorithmic_decomposition() {
        // U+D55C -> U+1112 U+1161 U+11AB
        let composed = "\u{D55C}";
        let decomposed = "\u{1112}\u{1161}\u{11AB}";
        assert_eq!(to_nfd(composed), decomposed);
    }

    #[test]
    fn hangul_nfc_recomposes_jamos() {
        let composed = "\u{D55C}";
        let decomposed = "\u{1112}\u{1161}\u{11AB}";
        assert_eq!(to_nfc(decomposed), composed);
    }

    #[test]
    fn hangul_lv_syllable_roundtrips_without_trailing_jamo() {
        // U+AC00 is the first syllable: L index 0, V index 0, no T.
        let composed = "\u{AC00}";
        let decomposed = "\u{1100}\u{1161}";
        assert_eq!(to_nfd(composed), decomposed);
        assert_eq!(to_nfc(decomposed), composed);
    }

    #[test]
    fn nfkd_expands_compatibility_form() {
        // U+00BD (1/2 fraction) -> "1" + U+2044 + "2"
        let input = "\u{00BD}";
        let expected = "1\u{2044}2";
        assert_eq!(to_nfkd(input), expected);
        // NFD leaves U+00BD untouched.
        assert_eq!(to_nfd(input), input);
    }

    #[test]
    fn nfkc_does_not_recompose_compatibility_fraction() {
        assert_eq!(to_nfkc("\u{00BD}"), "1\u{2044}2");
    }

    #[test]
    fn canonical_reorder_sorts_combining_marks_by_ccc() {
        // U+0307 (CCC 230) followed by U+0323 (CCC 220) reorders to
        // (U+0323, U+0307) under NFD.
        let input = "a\u{0307}\u{0323}";
        let expected = "a\u{0323}\u{0307}";
        assert_eq!(to_nfd(input), expected);
        assert_eq!(to_nfd(expected), expected);
    }

    #[test]
    fn nfc_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "1\u{2044}2",
            "a\u{0307}\u{0323}b",
        ] {
            let once = to_nfc(s);
            assert_eq!(to_nfc(&once), once, "NFC idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfd_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "a\u{0307}\u{0323}b",
        ] {
            let once = to_nfd(s);
            assert_eq!(to_nfd(&once), once, "NFD idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfkc_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "\u{00BD}",
            "\u{FB01}le",
        ] {
            let once = to_nfkc(s);
            assert_eq!(to_nfkc(&once), once, "NFKC idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfkd_idempotence() {
        for s in [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "\u{00BD}",
            "\u{FB01}le",
        ] {
            let once = to_nfkd(s);
            assert_eq!(to_nfkd(&once), once, "NFKD idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfc_skips_full_composition_exclusion() {
        // U+212A (KELVIN SIGN) decomposes canonically to U+004B ('K').
        // Full_Composition_Exclusion = True, so NFC must NOT recompose
        // 'K' back to U+212A. The generator filters U+212A out of
        // COMPOSITION_PAIRS, making the exclusion automatic at runtime.
        assert_eq!(to_nfc("K"), "K");
        assert_eq!(to_nfc("\u{212A}"), "K");
    }

    #[test]
    fn nfd_decomposes_kelvin_to_ascii_k() {
        assert_eq!(to_nfd("\u{212A}"), "K");
    }

    #[test]
    fn ligature_nfkc_splits_into_components() {
        assert_eq!(to_nfkd("\u{FB01}"), "fi");
        assert_eq!(to_nfkc("\u{FB01}"), "fi");
    }

    #[test]
    fn nfc_starter_blocking_prevents_invalid_composition() {
        // In `a U+0308 U+0301` both marks have CCC 230. NFC first
        // composes `a + U+0308` -> U+00E4. U+0301 is then tried against
        // U+00E4, but no canonical composite exists for that pair, so
        // it is emitted unchanged after the composed starter.
        assert_eq!(to_nfc("a\u{0308}\u{0301}"), "\u{00E4}\u{0301}");
    }

    #[test]
    fn encode_drops_non_scalar_values() {
        // Surrogates and values above U+10FFFF are skipped silently.
        assert_eq!(encode(&[0x41, 0xD800, 0x42, 0x110000]), "AB");
    }

    #[test]
    fn encode_decomp_table_layout() {
        let index: &[(u32, u32, u8)] = &[(0x00C0, 0, 2), (0x00C1, 2, 2)];
        let pool: &[u32] = &[0x0041, 0x0300, 0x0041, 0x0301];
        let bytes = encode_decomp_table_bytes(index, pool);
        assert_eq!(bytes.len(), 4 + 2 * 12 + 4 + 4 * 4);
        assert_eq!(&bytes[0..4], &2u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x00C0u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &0u32.to_le_bytes());
        assert_eq!(&bytes[12..16], &2u32.to_le_bytes());
        // pool header sits after the index
        assert_eq!(&bytes[28..32], &4u32.to_le_bytes());
        assert_eq!(&bytes[32..36], &0x0041u32.to_le_bytes());
    }

    #[test]
    fn encode_ccc_table_layout() {
        let table: &[(u32, u8)] = &[(0x0300, 230), (0x0301, 230)];
        let bytes = encode_ccc_table_bytes(table);
        assert_eq!(bytes.len(), 4 + 2 * 8);
        assert_eq!(&bytes[0..4], &2u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x0300u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &230u32.to_le_bytes());
    }

    #[test]
    fn encode_composition_table_layout() {
        let table: &[(u32, u32, u32)] = &[(0x0041, 0x0300, 0x00C0)];
        let bytes = encode_composition_table_bytes(table);
        assert_eq!(bytes.len(), 4 + 12);
        assert_eq!(&bytes[0..4], &1u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x0041u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &0x0300u32.to_le_bytes());
        assert_eq!(&bytes[12..16], &0x00C0u32.to_le_bytes());
    }

    #[test]
    fn ccc_table_contains_combining_acute() {
        assert_eq!(ccc(0x0301), 230);
        assert_eq!(ccc(0x0041), 0);
    }

    #[test]
    fn composition_table_sorted_and_excludes_kelvin() {
        // Sanity: sorted by (first, second).
        for w in COMPOSITION_PAIRS.windows(2) {
            let a = (w[0].0, w[0].1);
            let b = (w[1].0, w[1].1);
            assert!(a < b, "COMPOSITION_PAIRS must be sorted: {a:?} >= {b:?}");
        }
        // Sanity: U+212A is excluded. It is a singleton decomposition,
        // so it could never appear as a (first, second) key; what has
        // to be checked is that no pair *produces* it.
        assert!(
            COMPOSITION_PAIRS.iter().all(|t| t.2 != 0x212A),
            "U+212A should be excluded from COMPOSITION_PAIRS"
        );
    }
}