Skip to main content

simd_normalizer/
normalizer.rs

1//! Single-pass SIMD-guided normalizer implementations (NFC, NFD, NFKC, NFKD).
2//!
3//! The core loop scans 64-byte chunks via SIMD to identify passthrough regions
4//! (all bytes below a form-dependent bound), copying them directly.  Non-passthrough
5//! bytes trigger scalar decode + decompose + CCC sort + optional recomposition.
6
7use alloc::borrow::Cow;
8use alloc::string::String;
9
10use crate::ccc::CccBuffer;
11use crate::compose;
12use crate::decompose::{self, DecompForm};
13use crate::hangul;
14use crate::quick_check;
15use crate::simd;
16use crate::simd::prefetch;
17use crate::tables;
18use crate::utf8;
19
20// ---------------------------------------------------------------------------
21// Form enum
22// ---------------------------------------------------------------------------
23
/// The Unicode normalization form to produce.
///
/// The canonical forms (`Nfc`, `Nfd`) use canonical decomposition, the
/// compatibility forms (`Nfkc`, `Nfkd`) use compatibility decomposition, and
/// the two `*c` forms recompose canonically after decomposing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Form {
    /// NFC: canonical decomposition, then canonical composition.
    Nfc,
    /// NFD: canonical decomposition only.
    Nfd,
    /// NFKC: compatibility decomposition, then canonical composition.
    Nfkc,
    /// NFKD: compatibility decomposition only.
    Nfkd,
}
36
37impl Form {
38    /// The SIMD passthrough byte bound for this form.
39    ///
40    /// Any byte below this value is guaranteed to not require normalization
41    /// processing: it is either ASCII or a continuation byte of a character
42    /// that does not need decomposition.
43    ///
44    /// - NFD/NFKD: 0xC0  (first byte of U+00C0, which decomposes)
45    /// - NFC/NFKC: 0xC0  (same: characters >= U+00C0 may need processing)
46    #[inline]
47    fn passthrough_bound(self) -> u8 {
48        match self {
49            Form::Nfc | Form::Nfkc => 0xC0,
50            Form::Nfd | Form::Nfkd => 0xC0,
51        }
52    }
53
54    /// Whether this form applies canonical composition after decomposition.
55    #[inline]
56    fn composes(self) -> bool {
57        matches!(self, Form::Nfc | Form::Nfkc)
58    }
59
60    /// Which decomposition form to use.
61    #[inline]
62    fn decomp_form(self) -> DecompForm {
63        match self {
64            Form::Nfc | Form::Nfd => DecompForm::Canonical,
65            Form::Nfkc | Form::Nfkd => DecompForm::Compatible,
66        }
67    }
68
69    /// Estimated output capacity for a given input length.
70    #[inline]
71    fn estimated_capacity(self, input_len: usize) -> usize {
72        match self {
73            Form::Nfc | Form::Nfkc => input_len,
74            Form::Nfd | Form::Nfkd => input_len + input_len / 2,
75        }
76    }
77
78    /// Run quick_check for this normalization form.
79    #[inline]
80    fn quick_check(self, input: &str) -> quick_check::IsNormalized {
81        match self {
82            Form::Nfc => quick_check::quick_check_nfc(input),
83            Form::Nfd => quick_check::quick_check_nfd(input),
84            Form::Nfkc => quick_check::quick_check_nfkc(input),
85            Form::Nfkd => quick_check::quick_check_nfkd(input),
86        }
87    }
88}
89
90// ---------------------------------------------------------------------------
91// NormState -- accumulation state for a starter + its combining marks
92// ---------------------------------------------------------------------------
93
94struct NormState {
95    /// The current starter character (CCC == 0) being accumulated.
96    current_starter: Option<char>,
97    /// Combining marks (CCC > 0) following the current starter, not yet sorted.
98    ccc_buf: CccBuffer,
99}
100
101impl NormState {
102    #[inline]
103    fn new() -> Self {
104        NormState {
105            current_starter: None,
106            ccc_buf: CccBuffer::new(),
107        }
108    }
109
110    /// Flush the current accumulation (starter + combining marks) to `out`.
111    ///
112    /// If `composes` is true, applies canonical composition.
113    #[inline]
114    fn flush(&mut self, out: &mut String, composes: bool) {
115        let starter = match self.current_starter.take() {
116            Some(s) => s,
117            None => {
118                // No starter -- flush any orphan combining marks (leading combiners).
119                if !self.ccc_buf.is_empty() {
120                    self.ccc_buf.sort_in_place();
121                    for entry in self.ccc_buf.as_slice() {
122                        out.push(entry.ch);
123                    }
124                    self.ccc_buf.clear();
125                }
126                return;
127            },
128        };
129
130        if self.ccc_buf.is_empty() {
131            // Starter with no combining marks -- just emit it.
132            out.push(starter);
133            return;
134        }
135
136        // Sort combining marks by CCC in place.
137        self.ccc_buf.sort_in_place();
138
139        if composes {
140            compose::compose_combining_sequence_into(starter, self.ccc_buf.as_slice(), out);
141        } else {
142            // Decomposition only: emit starter + sorted marks.
143            out.push(starter);
144            for entry in self.ccc_buf.as_slice() {
145                out.push(entry.ch);
146            }
147        }
148        self.ccc_buf.clear();
149    }
150
151    /// Process a single character (after decomposition) into the accumulation state.
152    ///
153    /// Characters with CCC == 0 are starters. When a new starter arrives, the
154    /// previous accumulation is flushed. In composition mode, starter-to-starter
155    /// composition is attempted first (required for Hangul jamo L+V, LV+T).
156    #[inline]
157    fn feed_entry(&mut self, ch: char, ccc: u8, out: &mut String, composes: bool) {
158        if ccc == 0 {
159            // New starter.
160            if composes && self.ccc_buf.is_empty() {
161                // No intervening combining marks -- try starter-to-starter composition.
162                if let Some(prev) = self.current_starter
163                    && let Some(composed) = compose::compose(prev, ch)
164                {
165                    self.current_starter = Some(composed);
166                    return;
167                }
168            }
169            // Either not composing, has intervening marks, or composition failed.
170            self.flush(out, composes);
171            self.current_starter = Some(ch);
172        } else {
173            // Combining mark: add to buffer.
174            self.ccc_buf.push(ch, ccc);
175        }
176    }
177
178    /// NFD-specialized flush: no composition logic.
179    #[inline]
180    fn flush_nfd(&mut self, out: &mut String) {
181        let starter = match self.current_starter.take() {
182            Some(s) => s,
183            None => {
184                if !self.ccc_buf.is_empty() {
185                    self.ccc_buf.sort_in_place();
186                    for entry in self.ccc_buf.as_slice() {
187                        out.push(entry.ch);
188                    }
189                    self.ccc_buf.clear();
190                }
191                return;
192            },
193        };
194
195        // Fast path: single combining mark (most common for precomposed Latin).
196        // Skip sort (unnecessary for 1 element) and avoid as_slice/clear overhead.
197        if let Some(entry) = self.ccc_buf.take_single_inline() {
198            out.push(starter);
199            out.push(entry.ch);
200            return;
201        }
202
203        if self.ccc_buf.is_empty() {
204            out.push(starter);
205            return;
206        }
207
208        // Multiple marks: sort and emit.
209        self.ccc_buf.sort_in_place();
210        out.push(starter);
211        for entry in self.ccc_buf.as_slice() {
212            out.push(entry.ch);
213        }
214        self.ccc_buf.clear();
215    }
216
217    /// NFD-specialized feed_entry: no composition checks.
218    #[inline]
219    fn feed_entry_nfd(&mut self, ch: char, ccc: u8, out: &mut String) {
220        if ccc == 0 {
221            self.flush_nfd(out);
222            self.current_starter = Some(ch);
223        } else {
224            self.ccc_buf.push(ch, ccc);
225        }
226    }
227}
228
229// ---------------------------------------------------------------------------
230// process_char -- decompose a char and feed entries to NormState
231// ---------------------------------------------------------------------------
232
/// Report whether `cp` falls in one of the two CJK ideograph ranges handled
/// by the fast path: U+3400..=U+4DBF or U+4E00..=U+9FFF.
///
/// Callers treat these code points as starters with no decomposition and no
/// composition, so they skip the decompose pipeline entirely.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
239
/// Report whether a supplementary code point can skip the trie lookup.
///
/// Intended for `cp >= 0x10000`. Returns `true` for:
/// - everything from U+20000 upward except U+2F800..=U+2FA1F
///   (CJK Compatibility Ideographs Supplement), and
/// - the plane-1 emoji/symbol span U+1F252..=U+1FBEF.
///
/// Every other value, including anything below U+10000, yields `false`, so
/// the caller falls back to a table lookup.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        // The one exception carved out of plane 2 and above.
        0x2F800..=0x2FA1F => false,
        0x20000..=u32::MAX => true,
        // Plane 1: only the core emoji and symbols span is whitelisted.
        0x1F252..=0x1FBEF => true,
        _ => false,
    }
}
253
254/// Decompose a character and feed each resulting entry into the accumulation state.
255///
256/// Uses a single trie lookup with passthrough fast-paths for non-decomposing
257/// characters, avoiding the full decomposition pipeline for the common case.
258#[inline]
259fn process_char(
260    ch: char,
261    state: &mut NormState,
262    out: &mut String,
263    form: Form,
264    decomp_buf: &mut CccBuffer,
265) {
266    let cp = ch as u32;
267
268    // Fast path: CJK ideographs never decompose, have CCC=0, and never
269    // participate in canonical composition. No trie lookup needed.
270    if cp >= 0x3400 && is_cjk_unified(cp) {
271        state.flush(out, form.composes());
272        state.current_starter = Some(ch);
273        return;
274    }
275
276    // Hangul syllables: algorithmic decomposition, no trie lookup needed.
277    if hangul::is_hangul_syllable(ch) {
278        let (l, v, t) = hangul::decompose_hangul(ch);
279        state.feed_entry(l, 0, out, form.composes());
280        state.feed_entry(v, 0, out, form.composes());
281        if let Some(t_char) = t {
282            state.feed_entry(t_char, 0, out, form.composes());
283        }
284        return;
285    }
286
287    // Single trie lookup for both passthrough check and decomposition.
288    let trie_value = tables::raw_decomp_trie_value(ch, form.decomp_form());
289
290    // Non-decomposing character: extract CCC and feed directly.
291    // This covers both starters (CCC=0) and combining marks (CCC>0)
292    // that map to themselves, skipping the full decompose pipeline.
293    if !tables::has_decomposition(trie_value) {
294        let ccc = tables::ccc_from_trie_value(trie_value);
295        state.feed_entry(ch, ccc, out, form.composes());
296        return;
297    }
298
299    // Character has a decomposition: decode from the pre-looked-up trie value.
300    decomp_buf.clear();
301    decompose::decompose_from_trie_value(ch, trie_value, decomp_buf, form.decomp_form());
302    for entry in decomp_buf.as_slice() {
303        state.feed_entry(entry.ch, entry.ccc, out, form.composes());
304    }
305}
306
307/// Process a non-CJK, non-Hangul character using a pre-computed trie value.
308///
309/// Used by the NFC/NFKC passthrough path in the SIMD loop to avoid a redundant
310/// trie lookup (the caller already looked up the trie value to decide whether
311/// the character is passthrough).
312#[allow(dead_code)]
313#[inline(always)]
314fn process_from_trie(
315    ch: char,
316    tv: u32,
317    state: &mut NormState,
318    out: &mut String,
319    form: Form,
320    decomp_buf: &mut CccBuffer,
321) {
322    if !tables::has_decomposition(tv) {
323        let ccc = tables::ccc_from_trie_value(tv);
324        state.feed_entry(ch, ccc, out, form.composes());
325    } else {
326        decomp_buf.clear();
327        decompose::decompose_from_trie_value(ch, tv, decomp_buf, form.decomp_form());
328        for entry in decomp_buf.as_slice() {
329            state.feed_entry(entry.ch, entry.ccc, out, form.composes());
330        }
331    }
332}
333
334/// Process a non-CJK, non-Hangul character for NFD/NFKD using a pre-computed
335/// trie value. Avoids `DecompResult` enum construction by inlining the expansion
336/// path and specializing the common 2-entry case (starter + single combining mark).
337#[inline(always)]
338fn process_from_trie_nfd(
339    ch: char,
340    tv: u32,
341    state: &mut NormState,
342    out: &mut String,
343    decomp_form: DecompForm,
344) {
345    if !tables::has_decomposition(tv) {
346        // Non-decomposing character (e.g. combining mark): extract CCC and feed.
347        let ccc = tables::ccc_from_trie_value(tv);
348        state.feed_entry_nfd(ch, ccc, out);
349        return;
350    }
351
352    // Fast path: expansion (the vast majority of decomposing BMP characters).
353    if let Some(data) = tables::expansion_data_from_trie_value(tv, decomp_form) {
354        // Specialize 2-entry expansion: starter + single combining mark.
355        // This is the most common case (precomposed Latin, Greek, Cyrillic, etc.)
356        // and avoids one feed_entry_nfd call per character.
357        if data.len() == 2 {
358            let e0 = data[0];
359            let ccc0 = (e0 >> tables::EXPANSION_CCC_SHIFT) as u8;
360            if ccc0 == 0 {
361                // First entry is a starter: flush previous state, set new starter.
362                state.flush_nfd(out);
363                let cp0 = e0 & tables::EXPANSION_CP_MASK;
364                debug_assert!(cp0 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp0));
365                state.current_starter = Some(unsafe { char::from_u32_unchecked(cp0) });
366                // Second entry: combine directly without feed_entry_nfd overhead.
367                let e1 = data[1];
368                let cp1 = e1 & tables::EXPANSION_CP_MASK;
369                let ccc1 = (e1 >> tables::EXPANSION_CCC_SHIFT) as u8;
370                debug_assert!(cp1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp1));
371                let ch1 = unsafe { char::from_u32_unchecked(cp1) };
372                if ccc1 != 0 {
373                    state.ccc_buf.push(ch1, ccc1);
374                } else {
375                    // Both starters (rare): use general path for second entry.
376                    state.feed_entry_nfd(ch1, 0, out);
377                }
378                return;
379            }
380        }
381        // General expansion loop (3+ entries or first entry is non-starter).
382        for &entry in data {
383            let cp = entry & tables::EXPANSION_CP_MASK;
384            let ccc = (entry >> tables::EXPANSION_CCC_SHIFT) as u8;
385            debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
386            let exp_ch = unsafe { char::from_u32_unchecked(cp) };
387            state.feed_entry_nfd(exp_ch, ccc, out);
388        }
389        return;
390    }
391
392    // Singleton decomposition: the trie value's lower 16 bits are the BMP code point.
393    let info = tv & 0xFFFF;
394    debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
395    let decomposed = unsafe { char::from_u32_unchecked(info) };
396    let ccc = if info <= 0x7F {
397        0
398    } else {
399        tables::lookup_ccc(decomposed)
400    };
401    state.feed_entry_nfd(decomposed, ccc, out);
402}
403
404/// Compose-mode passthrough flush. Called from both the chunk loop and scalar
405/// tail after `state.flush(out, true)` when `composes == true`. Peeks at the
406/// upcoming codepoint `ch` (whose bytes have not yet been consumed) and decides
407/// whether to copy the whole `pass` run verbatim or feed the final ASCII
408/// starter through `NormState` so subsequent combining marks can still see it.
409#[inline(always)]
410fn flush_compose_passthrough(
411    pass: &str,
412    ch: char,
413    form: Form,
414    state: &mut NormState,
415    out: &mut String,
416) {
417    let cp = ch as u32;
418    // Safety: `cp >= 0x10000` proves `ch` is a supplementary code point, which
419    // is the precondition of `raw_decomp_trie_value_supplementary`.
420    let next_tv = if cp >= 0x10000 {
421        unsafe { tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form()) }
422    } else {
423        tables::raw_decomp_trie_value(ch, form.decomp_form())
424    };
425    if tables::needs_starter_shadow(next_tv) {
426        let n = pass.len();
427        if n > 1 {
428            out.push_str(&pass[..n - 1]);
429        }
430        let last_ch = pass.as_bytes()[n - 1] as char;
431        state.feed_entry(last_ch, 0, out, true);
432    } else {
433        out.push_str(pass);
434    }
435}
436
437// ---------------------------------------------------------------------------
438// normalize_scalar -- fallback for short inputs
439// ---------------------------------------------------------------------------
440
441/// Normalize a string using pure scalar processing (no SIMD).
442fn normalize_scalar<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
443    if input.is_empty() {
444        return Cow::Borrowed(input);
445    }
446
447    // Quick-check: if the string is definitely already normalized, return early.
448    if form.quick_check(input) == quick_check::IsNormalized::Yes {
449        return Cow::Borrowed(input);
450    }
451
452    let mut out = String::with_capacity(input.len());
453    let mut state = NormState::new();
454    let mut decomp_buf = CccBuffer::new();
455
456    for ch in input.chars() {
457        process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
458    }
459
460    // Flush any remaining state.
461    state.flush(&mut out, form.composes());
462
463    if out == input {
464        Cow::Borrowed(input)
465    } else {
466        Cow::Owned(out)
467    }
468}
469
470// ---------------------------------------------------------------------------
471// normalize_impl -- main SIMD-accelerated loop
472// ---------------------------------------------------------------------------
473
/// Core normalization function.
///
/// Inputs shorter than 64 bytes are delegated to `normalize_scalar`. Longer
/// inputs get one upfront quick-check and are then walked in 64-byte chunks:
/// `simd::scan_and_prefetch` produces a bitmask for the chunk, each set bit is
/// handled as the start of a character, and everything in between is copied
/// verbatim in runs tracked by `last_written`. Bytes after the last full
/// chunk go through an equivalent scalar loop.
///
/// Returns `Cow::Borrowed(input)` when the quick-check answers `Yes`, or when
/// it answers `Maybe` and the produced text equals the input. Otherwise the
/// normalized text is returned as `Cow::Owned`.
fn normalize_impl<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
    let bytes = input.as_bytes();
    let len = bytes.len();

    // Short inputs: use scalar path directly (includes quick_check).
    if len < 64 {
        return normalize_scalar(input, form);
    }

    // Single upfront quick-check. If definitely normalized, return early.
    let qc = form.quick_check(input);
    if qc == quick_check::IsNormalized::Yes {
        return Cow::Borrowed(input);
    }

    // QC = No or Maybe: allocate and normalize.
    let bound = form.passthrough_bound();
    let composes = form.composes();
    let mut out = String::with_capacity(form.estimated_capacity(len));
    // Byte offset into `input` up to which output has been produced: every
    // byte before it was either copied verbatim or fed through `state`.
    let mut last_written: usize = 0;
    let mut state = NormState::new();
    let mut decomp_buf = CccBuffer::new();

    let mut pos: usize = 0;
    let ptr = bytes.as_ptr();

    // SIMD chunk loop.
    while pos + 64 <= len {
        let chunk_start = pos;

        // SAFETY: pos + 64 <= len, so ptr.add(pos) is valid for 64 bytes.
        // Prefetch pointers use wrapping_add because they may exceed the
        // allocation; prefetch is a non-faulting hint on all architectures.
        let mask = unsafe {
            let prefetch_l1 =
                ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
            let prefetch_l2 =
                ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
            simd::scan_and_prefetch(ptr.add(pos), prefetch_l1, prefetch_l2, bound)
        };

        // Prefetch the output buffer write-head to overlap write-allocate
        // fills with the SIMD scanner read on the source. Guarded against the
        // reallocation boundary: if the prefetched line would land past the
        // current capacity, skip it (the next push_str will realloc anyway).
        //
        // SAFETY: the pointer is built with `wrapping_add` and is only passed
        // to `prefetch_write`; nothing here dereferences it, and the guard
        // keeps it within `out`'s current capacity.
        // NOTE(review): presumes `prefetch_write` is a pure hint like the
        // read prefetch above -- confirm against its `# Safety` contract.
        unsafe {
            let write_head = out.len();
            let distance = prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE;
            if write_head + distance <= out.capacity() {
                prefetch::prefetch_write(out.as_ptr().wrapping_add(write_head + distance));
            }
        }

        if mask == 0 {
            // All passthrough: no bytes >= bound in this chunk.
            // `last_written` is left alone, so the chunk stays part of the
            // pending verbatim run and is copied later in one `push_str`.
            pos += 64;
            continue;
        }

        // Walk set bits in the mask.
        let mut chunk_mask = mask;
        while chunk_mask != 0 {
            let bit_pos = chunk_mask.trailing_zeros() as usize;
            chunk_mask &= chunk_mask.wrapping_sub(1); // clear lowest set bit

            let byte_pos = chunk_start + bit_pos;

            // Skip if we already processed past this position (multi-byte char from previous bit).
            if byte_pos < last_written {
                continue;
            }

            // Skip continuation bytes -- they belong to a char whose leading byte
            // was already processed.
            if utf8::is_continuation_byte(bytes[byte_pos]) {
                continue;
            }

            // Decode the character at this position.
            // The character may extend past the end of this 64-byte chunk;
            // decoding reads from the full `bytes` slice, not from the chunk.
            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);

            // Extended passthrough for decomposition-only forms (NFD/NFKD):
            // Non-decomposing starters (CCC=0) produce identical output, so
            // they can be bulk-copied with surrounding passthrough bytes,
            // avoiding per-character NormState flush + push overhead.
            //
            // Every path through this block ends in `continue`, so the code
            // after it only runs for the composing forms.
            if !composes {
                let cp = ch as u32;
                // CJK ideographs: guaranteed non-decomposing starters, no trie needed.
                // `continue` without touching `last_written` keeps the
                // character inside the pending verbatim run.
                if (cp >= 0x3400 && is_cjk_unified(cp)) || (cp >= 0x10000 && is_supp_safe(cp)) {
                    continue;
                }
                // Hangul syllables: algorithmic decomposition, write jamo directly
                // to output bypassing per-entry NormState overhead.
                if hangul::is_hangul_syllable(ch) {
                    if byte_pos > last_written {
                        state.flush_nfd(&mut out);
                        out.push_str(&input[last_written..byte_pos]);
                    }
                    last_written = byte_pos + width;
                    // Second flush covers the case with no verbatim run in
                    // between; it does nothing if the flush above already ran.
                    state.flush_nfd(&mut out);
                    let (l, v, t) = hangul::decompose_hangul(ch);
                    out.push(l);
                    out.push(v);
                    if let Some(t_char) = t {
                        out.push(t_char);
                    }
                    continue;
                }
                // Non-CJK, non-Hangul: single trie lookup for both the
                // passthrough check and (if needed) decomposition processing.
                // Use unchecked supplementary path for cp >= 0x10000 (emoji etc).
                let tv = if cp >= 0x10000 {
                    // SAFETY: cp is a valid supplementary code point from a valid char.
                    unsafe { tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form()) }
                } else {
                    tables::raw_decomp_trie_value(ch, form.decomp_form())
                };
                if !tables::has_decomposition(tv) && tables::ccc_from_trie_value(tv) == 0 {
                    continue; // non-decomposing starter → passthrough
                }
                // Needs work: copy passthrough, then process with inline NFD path.
                // The flush must precede the copy so a held starter and its
                // marks land in the output before the verbatim run.
                if byte_pos > last_written {
                    state.flush_nfd(&mut out);
                    out.push_str(&input[last_written..byte_pos]);
                }
                last_written = byte_pos + width;
                process_from_trie_nfd(ch, tv, &mut state, &mut out, form.decomp_form());
                continue;
            }

            // Copy any passthrough bytes between last_written and this position.
            // Flush NormState first: it may hold a buffered starter that must
            // appear *before* the passthrough run in the output.
            //
            // In composition mode, keep the last passthrough character as a
            // potential starter for the following combining mark. In this
            // mode every flagged lead byte is consumed through `process_char`
            // (which advances `last_written` past the whole character), so
            // the pending run holds only bytes below the bound that belong to
            // no processed character, i.e. single-byte ASCII starters with
            // CCC 0.
            if byte_pos > last_written {
                state.flush(&mut out, composes);
                let pass = &input[last_written..byte_pos];
                if composes {
                    flush_compose_passthrough(pass, ch, form, &mut state, &mut out);
                } else {
                    out.push_str(pass);
                }
            }

            last_written = byte_pos + width;

            // Process through decomposition + accumulation.
            process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
        }

        pos += 64;
    }

    // Scalar tail: remaining bytes after the last full chunk.
    if pos < len {
        // Check if the tail has any non-passthrough bytes.
        // If there are none, the tail is left to the trailing verbatim copy
        // at the end of the function.
        let tail_has_work = bytes[pos..].iter().any(|&b| b >= bound);

        if tail_has_work {
            // Process remaining bytes character-by-character.
            let mut tail_pos = pos;
            while tail_pos < len {
                // Bytes of a character that started in the last chunk and
                // was already consumed there.
                if tail_pos < last_written {
                    tail_pos += 1;
                    continue;
                }

                if utf8::is_continuation_byte(bytes[tail_pos]) {
                    tail_pos += 1;
                    continue;
                }

                let (ch, width) = utf8::decode_char_at(bytes, tail_pos);

                // Extended passthrough (NFD/NFKD): skip non-decomposing starters.
                // Mirrors the chunk-loop branch above; the only difference is
                // that `tail_pos` has to be advanced by hand.
                if !composes {
                    let cp = ch as u32;
                    if (cp >= 0x3400 && is_cjk_unified(cp)) || (cp >= 0x10000 && is_supp_safe(cp)) {
                        tail_pos += width;
                        continue;
                    }
                    // Hangul syllables: algorithmic decomposition, write directly.
                    if hangul::is_hangul_syllable(ch) {
                        if tail_pos > last_written {
                            state.flush_nfd(&mut out);
                            out.push_str(&input[last_written..tail_pos]);
                        }
                        last_written = tail_pos + width;
                        state.flush_nfd(&mut out);
                        let (l, v, t) = hangul::decompose_hangul(ch);
                        out.push(l);
                        out.push(v);
                        if let Some(t_char) = t {
                            out.push(t_char);
                        }
                        tail_pos += width;
                        continue;
                    }
                    let tv = if cp >= 0x10000 {
                        // SAFETY: cp is a valid supplementary code point from a valid char.
                        unsafe {
                            tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form())
                        }
                    } else {
                        tables::raw_decomp_trie_value(ch, form.decomp_form())
                    };
                    if !tables::has_decomposition(tv) && tables::ccc_from_trie_value(tv) == 0 {
                        tail_pos += width;
                        continue;
                    }
                    // Needs work: copy passthrough, process with inline NFD path.
                    if tail_pos > last_written {
                        state.flush_nfd(&mut out);
                        out.push_str(&input[last_written..tail_pos]);
                    }
                    last_written = tail_pos + width;
                    process_from_trie_nfd(ch, tv, &mut state, &mut out, form.decomp_form());
                    tail_pos += width;
                    continue;
                }

                // Copy passthrough bytes before this char.
                // Only reached for composing forms (see the `!composes`
                // block above); same flush-then-copy order as the chunk loop.
                if tail_pos > last_written {
                    state.flush(&mut out, composes);
                    let pass = &input[last_written..tail_pos];
                    if composes {
                        flush_compose_passthrough(pass, ch, form, &mut state, &mut out);
                    } else {
                        out.push_str(pass);
                    }
                }

                last_written = tail_pos + width;

                process_char(ch, &mut state, &mut out, form, &mut decomp_buf);

                tail_pos += width;
            }
        }
    }

    // Flush any remaining state.
    // This has to happen before the trailing copy below: whatever `state`
    // holds precedes the remaining verbatim bytes in the input.
    if composes {
        state.flush(&mut out, true);
    } else {
        state.flush_nfd(&mut out);
    }

    // Copy any trailing passthrough bytes.
    if last_written < len {
        out.push_str(&input[last_written..len]);
    }

    // For the Maybe case (NFC/NFKC only), normalization might not have changed
    // anything. Check and return Borrowed if so.
    // A `No` verdict skips the comparison and always returns the owned text.
    if qc == quick_check::IsNormalized::Maybe && out == input {
        Cow::Borrowed(input)
    } else {
        Cow::Owned(out)
    }
}
743
744// ---------------------------------------------------------------------------
745// Public normalizer types
746// ---------------------------------------------------------------------------
747
/// NFC normalizer: Canonical Decomposition, followed by Canonical Composition.
///
/// A stateless unit type; `Default::default()` yields the same value as
/// `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfcNormalizer;

/// NFD normalizer: Canonical Decomposition.
///
/// A stateless unit type; `Default::default()` yields the same value as
/// `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfdNormalizer;

/// NFKC normalizer: Compatibility Decomposition, followed by Canonical Composition.
///
/// A stateless unit type; `Default::default()` yields the same value as
/// `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfkcNormalizer;

/// NFKD normalizer: Compatibility Decomposition.
///
/// A stateless unit type; `Default::default()` yields the same value as
/// `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfkdNormalizer;
783
784impl NfcNormalizer {
785    /// Create a new NFC normalizer.
786    pub fn new() -> Self {
787        NfcNormalizer
788    }
789
790    /// Run the NFC quick-check algorithm on `input`.
791    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
792        quick_check::quick_check_nfc(input)
793    }
794
795    /// Normalize the input string to NFC form.
796    ///
797    /// Returns `Cow::Borrowed` if the input is already in NFC.
798    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
799        normalize_impl(input, Form::Nfc)
800    }
801
802    /// Normalize the input string to NFC form, appending to `out`.
803    ///
804    /// Returns `true` if the input was already normalized (nothing was modified).
805    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
806        let result = normalize_impl(input, Form::Nfc);
807        let already_normalized = matches!(&result, Cow::Borrowed(_));
808        out.push_str(&result);
809        already_normalized
810    }
811
812    /// Check if the input is already in NFC form.
813    pub fn is_normalized(&self, input: &str) -> bool {
814        quick_check::is_normalized_nfc(input)
815    }
816}
817
818impl NfdNormalizer {
819    /// Create a new NFD normalizer.
820    pub fn new() -> Self {
821        NfdNormalizer
822    }
823
824    /// Run the NFD quick-check algorithm on `input`.
825    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
826        quick_check::quick_check_nfd(input)
827    }
828
829    /// Normalize the input string to NFD form.
830    ///
831    /// Returns `Cow::Borrowed` if the input is already in NFD.
832    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
833        normalize_impl(input, Form::Nfd)
834    }
835
836    /// Normalize the input string to NFD form, appending to `out`.
837    ///
838    /// Returns `true` if the input was already normalized (nothing was modified).
839    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
840        let result = normalize_impl(input, Form::Nfd);
841        let already_normalized = matches!(&result, Cow::Borrowed(_));
842        out.push_str(&result);
843        already_normalized
844    }
845
846    /// Check if the input is already in NFD form.
847    pub fn is_normalized(&self, input: &str) -> bool {
848        quick_check::is_normalized_nfd(input)
849    }
850}
851
852impl NfkcNormalizer {
853    /// Create a new NFKC normalizer.
854    pub fn new() -> Self {
855        NfkcNormalizer
856    }
857
858    /// Run the NFKC quick-check algorithm on `input`.
859    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
860        quick_check::quick_check_nfkc(input)
861    }
862
863    /// Normalize the input string to NFKC form.
864    ///
865    /// Returns `Cow::Borrowed` if the input is already in NFKC.
866    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
867        normalize_impl(input, Form::Nfkc)
868    }
869
870    /// Normalize the input string to NFKC form, appending to `out`.
871    ///
872    /// Returns `true` if the input was already normalized (nothing was modified).
873    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
874        let result = normalize_impl(input, Form::Nfkc);
875        let already_normalized = matches!(&result, Cow::Borrowed(_));
876        out.push_str(&result);
877        already_normalized
878    }
879
880    /// Check if the input is already in NFKC form.
881    pub fn is_normalized(&self, input: &str) -> bool {
882        quick_check::is_normalized_nfkc(input)
883    }
884}
885
886impl NfkdNormalizer {
887    /// Create a new NFKD normalizer.
888    pub fn new() -> Self {
889        NfkdNormalizer
890    }
891
892    /// Run the NFKD quick-check algorithm on `input`.
893    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
894        quick_check::quick_check_nfkd(input)
895    }
896
897    /// Normalize the input string to NFKD form.
898    ///
899    /// Returns `Cow::Borrowed` if the input is already in NFKD.
900    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
901        normalize_impl(input, Form::Nfkd)
902    }
903
904    /// Normalize the input string to NFKD form, appending to `out`.
905    ///
906    /// Returns `true` if the input was already normalized (nothing was modified).
907    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
908        let result = normalize_impl(input, Form::Nfkd);
909        let already_normalized = matches!(&result, Cow::Borrowed(_));
910        out.push_str(&result);
911        already_normalized
912    }
913
914    /// Check if the input is already in NFKD form.
915    pub fn is_normalized(&self, input: &str) -> bool {
916        quick_check::is_normalized_nfkd(input)
917    }
918}
919
920// ---------------------------------------------------------------------------
921// Unit tests
922// ---------------------------------------------------------------------------
923
#[cfg(test)]
mod tests {
    //! Unit tests for the `Form` helpers, the `NormState` state machine,
    //! the `normalize_impl` borrowed/owned return paths, and the code point
    //! range predicates.

    use super::*;
    use alloc::borrow::Cow;
    use alloc::string::String;
    use alloc::vec::Vec;

    // ===================================================================
    // 1. Form enum methods
    // ===================================================================

    #[test]
    fn passthrough_bound_all_forms_return_0xc0() {
        assert_eq!(Form::Nfc.passthrough_bound(), 0xC0);
        assert_eq!(Form::Nfd.passthrough_bound(), 0xC0);
        assert_eq!(Form::Nfkc.passthrough_bound(), 0xC0);
        assert_eq!(Form::Nfkd.passthrough_bound(), 0xC0);
    }

    #[test]
    fn composes_nfc_nfkc_true_nfd_nfkd_false() {
        assert!(Form::Nfc.composes());
        assert!(Form::Nfkc.composes());
        assert!(!Form::Nfd.composes());
        assert!(!Form::Nfkd.composes());
    }

    #[test]
    fn decomp_form_canonical_vs_compatible() {
        assert_eq!(Form::Nfc.decomp_form(), DecompForm::Canonical);
        assert_eq!(Form::Nfd.decomp_form(), DecompForm::Canonical);
        assert_eq!(Form::Nfkc.decomp_form(), DecompForm::Compatible);
        assert_eq!(Form::Nfkd.decomp_form(), DecompForm::Compatible);
    }

    #[test]
    fn estimated_capacity_nfc_nfkc_same_nfd_nfkd_larger() {
        let input_len = 100;
        assert_eq!(Form::Nfc.estimated_capacity(input_len), 100);
        assert_eq!(Form::Nfkc.estimated_capacity(input_len), 100);
        assert_eq!(Form::Nfd.estimated_capacity(input_len), 150);
        assert_eq!(Form::Nfkd.estimated_capacity(input_len), 150);
    }

    #[test]
    fn estimated_capacity_zero_length() {
        assert_eq!(Form::Nfc.estimated_capacity(0), 0);
        assert_eq!(Form::Nfd.estimated_capacity(0), 0);
    }

    #[test]
    fn quick_check_ascii_is_yes_for_all_forms() {
        let ascii = "Hello, World!";
        assert_eq!(Form::Nfc.quick_check(ascii), quick_check::IsNormalized::Yes);
        assert_eq!(Form::Nfd.quick_check(ascii), quick_check::IsNormalized::Yes);
        assert_eq!(
            Form::Nfkc.quick_check(ascii),
            quick_check::IsNormalized::Yes
        );
        assert_eq!(
            Form::Nfkd.quick_check(ascii),
            quick_check::IsNormalized::Yes
        );
    }

    // ===================================================================
    // 2. NormState state machine
    // ===================================================================

    #[test]
    fn normstate_new_has_no_starter_empty_ccc_buf() {
        let state = NormState::new();
        assert!(state.current_starter.is_none());
        assert!(state.ccc_buf.is_empty());
    }

    #[test]
    fn feed_entry_single_starter_sets_current_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        // Feed a starter (CCC=0)
        state.feed_entry('A', 0, &mut out, false);
        assert_eq!(state.current_starter, Some('A'));
        assert!(state.ccc_buf.is_empty());
        assert!(out.is_empty()); // No flush yet
    }

    #[test]
    fn feed_entry_combining_mark_buffers_in_ccc_buf() {
        let mut state = NormState::new();
        let mut out = String::new();
        // Set up a starter first
        state.feed_entry('e', 0, &mut out, false);
        // Feed combining acute (CCC=230)
        state.feed_entry('\u{0301}', 230, &mut out, false);
        assert_eq!(state.current_starter, Some('e'));
        assert!(!state.ccc_buf.is_empty());
        assert_eq!(state.ccc_buf.len(), 1);
        assert_eq!(state.ccc_buf.as_slice()[0].ch, '\u{0301}');
        assert_eq!(state.ccc_buf.as_slice()[0].ccc, 230);
    }

    #[test]
    fn feed_entry_two_starters_first_gets_flushed() {
        let mut state = NormState::new();
        let mut out = String::new();
        // Feed first starter
        state.feed_entry('A', 0, &mut out, false);
        assert!(out.is_empty());
        // Feed second starter -- first should be flushed to `out`
        state.feed_entry('B', 0, &mut out, false);
        assert_eq!(out, "A");
        assert_eq!(state.current_starter, Some('B'));
    }

    #[test]
    fn feed_entry_starter_to_starter_composition_hangul_lv() {
        let mut state = NormState::new();
        let mut out = String::new();
        // Hangul L
        state.feed_entry('\u{1100}', 0, &mut out, true);
        // Hangul V -- should compose with L in compose mode
        state.feed_entry('\u{1161}', 0, &mut out, true);
        // The composed syllable should be the current starter
        assert_eq!(state.current_starter, Some('\u{AC00}'));
        // Nothing flushed yet
        assert!(out.is_empty());
    }

    #[test]
    fn feed_entry_starter_to_starter_composition_e_acute() {
        let mut state = NormState::new();
        let mut out = String::new();
        // Despite the test name, this is NOT a starter-to-starter case:
        // combining acute has CCC=230. It covers the compose path for a
        // starter followed by a combining mark that composes with it.
        state.feed_entry('e', 0, &mut out, true);
        state.feed_entry('\u{0301}', 230, &mut out, true);
        // Now flush to get the composed result
        state.flush(&mut out, true);
        assert_eq!(out, "\u{00E9}"); // e-acute
    }

    #[test]
    fn feed_entry_nfd_starters_and_combining_marks() {
        let mut state = NormState::new();
        let mut out = String::new();
        // Feed starter
        state.feed_entry_nfd('A', 0, &mut out);
        assert_eq!(state.current_starter, Some('A'));
        // Feed combining grave (CCC=230)
        state.feed_entry_nfd('\u{0300}', 230, &mut out);
        assert_eq!(state.ccc_buf.len(), 1);
        // Feed new starter -- flushes A + combining grave
        state.feed_entry_nfd('B', 0, &mut out);
        assert_eq!(out, "A\u{0300}");
        assert_eq!(state.current_starter, Some('B'));
    }

    // ===================================================================
    // 3. NormState flush() and flush_nfd()
    // ===================================================================

    #[test]
    fn flush_no_starter_no_marks_nothing_emitted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.flush(&mut out, false);
        assert!(out.is_empty());
        state.flush(&mut out, true);
        assert!(out.is_empty());
    }

    #[test]
    fn flush_starter_only_emits_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('X');
        state.flush(&mut out, false);
        assert_eq!(out, "X");
    }

    #[test]
    fn flush_starter_one_combining_mark_no_compose() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230); // combining acute
        state.flush(&mut out, false);
        assert_eq!(out, "e\u{0301}");
    }

    #[test]
    fn flush_starter_one_combining_mark_with_compose() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230); // combining acute
        state.flush(&mut out, true);
        assert_eq!(out, "\u{00E9}"); // e-acute composed
    }

    #[test]
    fn flush_starter_multiple_ccc_disordered_marks_emits_sorted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('a');
        // Push marks in wrong CCC order: 230, 220, 202
        state.ccc_buf.push('\u{0301}', 230); // combining acute, CCC=230
        state.ccc_buf.push('\u{0323}', 220); // combining dot below, CCC=220
        state.ccc_buf.push('\u{0327}', 202); // combining cedilla, CCC=202
        state.flush(&mut out, false);
        // Should emit exactly the starter + marks sorted by CCC: 202, 220, 230.
        // Comparing the whole sequence also rejects missing or extra output.
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars, ['a', '\u{0327}', '\u{0323}', '\u{0301}']);
    }

    #[test]
    fn flush_orphan_combining_marks_no_starter_emits_sorted() {
        let mut state = NormState::new();
        let mut out = String::new();
        // No starter set, just orphan combining marks
        state.ccc_buf.push('\u{0301}', 230); // CCC=230
        state.ccc_buf.push('\u{0327}', 202); // CCC=202
        state.flush(&mut out, false);
        // CCC=202 first, CCC=230 second, and nothing else.
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars, ['\u{0327}', '\u{0301}']);
    }

    #[test]
    fn flush_nfd_no_starter_no_marks_nothing_emitted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.flush_nfd(&mut out);
        assert!(out.is_empty());
    }

    #[test]
    fn flush_nfd_starter_only_emits_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('Z');
        state.flush_nfd(&mut out);
        assert_eq!(out, "Z");
    }

    #[test]
    fn flush_nfd_single_mark_fast_path_take_single_inline() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230); // single combining mark
        // This should hit the take_single_inline fast path in flush_nfd
        state.flush_nfd(&mut out);
        assert_eq!(out, "e\u{0301}");
        // Buffer should be cleared
        assert!(state.ccc_buf.is_empty());
    }

    #[test]
    fn flush_nfd_multiple_marks_sorted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('o');
        state.ccc_buf.push('\u{0301}', 230); // CCC=230
        state.ccc_buf.push('\u{0327}', 202); // CCC=202
        state.flush_nfd(&mut out);
        // Starter, then CCC=202, then CCC=230, and nothing else.
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars, ['o', '\u{0327}', '\u{0301}']);
    }

    #[test]
    fn flush_nfd_orphan_combining_marks_no_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0323}', 220);
        state.flush_nfd(&mut out);
        // CCC=220 first, CCC=230 second, and nothing else.
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars, ['\u{0323}', '\u{0301}']);
    }

    // ===================================================================
    // 4. normalize_impl() Cow::Borrowed path
    // ===================================================================

    #[test]
    fn normalize_impl_nfc_already_normalized_returns_borrowed() {
        // U+00C5 (A with ring) followed by U+0300 (combining grave).
        // This is already in NFC -- the quick check should return Maybe
        // (because U+0300 has NFC_QC=Maybe), but after normalization,
        // the output equals input, so Cow::Borrowed is returned.
        let input = "\u{00C5}\u{0300}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfc_maybe_borrowed_simd_path() {
        // Exercise the Maybe->Borrowed exit at the end of normalize_impl
        // (quick check says Maybe, yet the normalized output equals the input)
        // on an input long enough for the SIMD path.
        // Input must be >= 64 bytes and trigger QC=Maybe but produce identical output.
        // 60 bytes of ASCII padding + "\u{00C5}\u{0300}" (already NFC, QC=Maybe).
        let mut input = String::new();
        input.push_str(&"a".repeat(60));
        input.push_str("\u{00C5}\u{0300}"); // Å + combining grave, already NFC
        assert!(input.len() >= 64, "input must be >= 64 bytes for SIMD path");
        let result = normalize_impl(&input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for >=64 byte already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, &*input);
    }

    #[test]
    fn normalize_impl_ascii_returns_borrowed() {
        let input = "Hello, world!";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfd_already_decomposed_returns_borrowed() {
        // "e" + combining acute is already NFD
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfd);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFD input"
        );
        // The borrowed text must be the input itself, not merely "some" borrow.
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfc_not_normalized_returns_owned() {
        // NFD form of e-acute: "e" + combining acute -- not NFC
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(&*result, "\u{00E9}");
    }

    // ===================================================================
    // 5. is_cjk_unified() boundary tests
    // ===================================================================

    #[test]
    fn cjk_unified_extension_a_start() {
        assert!(is_cjk_unified(0x3400));
    }

    #[test]
    fn cjk_unified_extension_a_end() {
        assert!(is_cjk_unified(0x4DBF));
    }

    #[test]
    fn cjk_unified_main_start() {
        assert!(is_cjk_unified(0x4E00));
    }

    #[test]
    fn cjk_unified_main_end() {
        assert!(is_cjk_unified(0x9FFF));
    }

    #[test]
    fn cjk_unified_just_before_extension_a() {
        assert!(!is_cjk_unified(0x33FF));
    }

    #[test]
    fn cjk_unified_gap_between_extension_a_and_main() {
        assert!(!is_cjk_unified(0x4DC0));
    }

    #[test]
    fn cjk_unified_just_after_main() {
        assert!(!is_cjk_unified(0xA000));
    }

    // ===================================================================
    // 6. is_supp_safe() boundary tests
    // ===================================================================

    #[test]
    fn supp_safe_plane2_start() {
        // 0x20000 is Plane 2 start, not in compat range -> true
        assert!(is_supp_safe(0x20000));
    }

    #[test]
    fn supp_safe_cjk_compat_supplement_start() {
        assert!(!is_supp_safe(0x2F800));
    }

    #[test]
    fn supp_safe_cjk_compat_supplement_end() {
        assert!(!is_supp_safe(0x2FA1F));
    }

    #[test]
    fn supp_safe_just_after_compat_supplement() {
        assert!(is_supp_safe(0x2FA20));
    }

    #[test]
    fn supp_safe_plane1_safe_range_start() {
        assert!(is_supp_safe(0x1F252));
    }

    #[test]
    fn supp_safe_plane1_safe_range_end() {
        assert!(is_supp_safe(0x1FBEF));
    }

    #[test]
    fn supp_safe_just_before_plane1_safe_range() {
        assert!(!is_supp_safe(0x1F251));
    }

    #[test]
    fn supp_safe_just_after_plane1_safe_range() {
        assert!(!is_supp_safe(0x1FBF0));
    }

    #[test]
    fn supp_safe_smp_start_before_safe_range() {
        // 0x10000 is SMP start, before the safe range
        assert!(!is_supp_safe(0x10000));
    }
}