simd_normalizer/
normalizer.rs

1//! Single-pass SIMD-guided normalizer implementations (NFC, NFD, NFKC, NFKD).
2//!
3//! The core loop scans 64-byte chunks via SIMD to identify passthrough regions
4//! (all bytes below a form-dependent bound), copying them directly.  Non-passthrough
5//! bytes trigger scalar decode + decompose + CCC sort + optional recomposition.
6
7use alloc::borrow::Cow;
8use alloc::string::String;
9
10use crate::ccc::CccBuffer;
11use crate::compose;
12use crate::decompose::{self, DecompForm};
13use crate::hangul;
14use crate::quick_check;
15use crate::simd;
16use crate::simd::prefetch;
17use crate::tables;
18use crate::utf8;
19
20// ---------------------------------------------------------------------------
21// Form enum
22// ---------------------------------------------------------------------------
23
24/// Unicode normalization form.
25#[derive(Clone, Copy, Debug, PartialEq, Eq)]
26pub enum Form {
27    /// Canonical Decomposition, followed by Canonical Composition (NFC).
28    Nfc,
29    /// Canonical Decomposition (NFD).
30    Nfd,
31    /// Compatibility Decomposition, followed by Canonical Composition (NFKC).
32    Nfkc,
33    /// Compatibility Decomposition (NFKD).
34    Nfkd,
35}
36
37impl Form {
38    /// The SIMD passthrough byte bound for this form.
39    ///
40    /// Any byte below this value is guaranteed to not require normalization
41    /// processing: it is either ASCII or a continuation byte of a character
42    /// that does not need decomposition.
43    ///
44    /// - NFD/NFKD: 0xC0  (first byte of U+00C0, which decomposes)
45    /// - NFC/NFKC: 0xC0  (same: characters >= U+00C0 may need processing)
46    #[inline]
47    fn passthrough_bound(self) -> u8 {
48        match self {
49            Form::Nfc | Form::Nfkc => 0xC0,
50            Form::Nfd | Form::Nfkd => 0xC0,
51        }
52    }
53
54    /// Whether this form applies canonical composition after decomposition.
55    #[inline]
56    fn composes(self) -> bool {
57        matches!(self, Form::Nfc | Form::Nfkc)
58    }
59
60    /// Which decomposition form to use.
61    #[inline]
62    fn decomp_form(self) -> DecompForm {
63        match self {
64            Form::Nfc | Form::Nfd => DecompForm::Canonical,
65            Form::Nfkc | Form::Nfkd => DecompForm::Compatible,
66        }
67    }
68
69    /// Estimated output capacity for a given input length.
70    #[inline]
71    fn estimated_capacity(self, input_len: usize) -> usize {
72        match self {
73            Form::Nfc | Form::Nfkc => input_len,
74            Form::Nfd | Form::Nfkd => input_len + input_len / 2,
75        }
76    }
77
78    /// Run quick_check for this normalization form.
79    #[inline]
80    fn quick_check(self, input: &str) -> quick_check::IsNormalized {
81        match self {
82            Form::Nfc => quick_check::quick_check_nfc(input),
83            Form::Nfd => quick_check::quick_check_nfd(input),
84            Form::Nfkc => quick_check::quick_check_nfkc(input),
85            Form::Nfkd => quick_check::quick_check_nfkd(input),
86        }
87    }
88}
89
90// ---------------------------------------------------------------------------
91// NormState -- accumulation state for a starter + its combining marks
92// ---------------------------------------------------------------------------
93
94struct NormState {
95    /// The current starter character (CCC == 0) being accumulated.
96    current_starter: Option<char>,
97    /// Combining marks (CCC > 0) following the current starter, not yet sorted.
98    ccc_buf: CccBuffer,
99}
100
101impl NormState {
102    #[inline]
103    fn new() -> Self {
104        NormState {
105            current_starter: None,
106            ccc_buf: CccBuffer::new(),
107        }
108    }
109
110    /// Flush the current accumulation (starter + combining marks) to `out`.
111    ///
112    /// If `composes` is true, applies canonical composition.
113    #[inline]
114    fn flush(&mut self, out: &mut String, composes: bool) {
115        let starter = match self.current_starter.take() {
116            Some(s) => s,
117            None => {
118                // No starter -- flush any orphan combining marks (leading combiners).
119                if !self.ccc_buf.is_empty() {
120                    self.ccc_buf.sort_in_place();
121                    for entry in self.ccc_buf.as_slice() {
122                        out.push(entry.ch);
123                    }
124                    self.ccc_buf.clear();
125                }
126                return;
127            },
128        };
129
130        if self.ccc_buf.is_empty() {
131            // Starter with no combining marks -- just emit it.
132            out.push(starter);
133            return;
134        }
135
136        // Sort combining marks by CCC in place.
137        self.ccc_buf.sort_in_place();
138
139        if composes {
140            compose::compose_combining_sequence_into(starter, self.ccc_buf.as_slice(), out);
141        } else {
142            // Decomposition only: emit starter + sorted marks.
143            out.push(starter);
144            for entry in self.ccc_buf.as_slice() {
145                out.push(entry.ch);
146            }
147        }
148        self.ccc_buf.clear();
149    }
150
151    /// Process a single character (after decomposition) into the accumulation state.
152    ///
153    /// Characters with CCC == 0 are starters. When a new starter arrives, the
154    /// previous accumulation is flushed. In composition mode, starter-to-starter
155    /// composition is attempted first (required for Hangul jamo L+V, LV+T).
156    #[inline]
157    fn feed_entry(&mut self, ch: char, ccc: u8, out: &mut String, composes: bool) {
158        if ccc == 0 {
159            // New starter.
160            if composes && self.ccc_buf.is_empty() {
161                // No intervening combining marks -- try starter-to-starter composition.
162                if let Some(prev) = self.current_starter
163                    && let Some(composed) = compose::compose(prev, ch)
164                {
165                    self.current_starter = Some(composed);
166                    return;
167                }
168            }
169            // Either not composing, has intervening marks, or composition failed.
170            self.flush(out, composes);
171            self.current_starter = Some(ch);
172        } else {
173            // Combining mark: add to buffer.
174            self.ccc_buf.push(ch, ccc);
175        }
176    }
177
178    /// NFD-specialized flush: no composition logic.
179    #[inline]
180    fn flush_nfd(&mut self, out: &mut String) {
181        let starter = match self.current_starter.take() {
182            Some(s) => s,
183            None => {
184                if !self.ccc_buf.is_empty() {
185                    self.ccc_buf.sort_in_place();
186                    for entry in self.ccc_buf.as_slice() {
187                        out.push(entry.ch);
188                    }
189                    self.ccc_buf.clear();
190                }
191                return;
192            },
193        };
194
195        // Fast path: single combining mark (most common for precomposed Latin).
196        // Skip sort (unnecessary for 1 element) and avoid as_slice/clear overhead.
197        if let Some(entry) = self.ccc_buf.take_single_inline() {
198            out.push(starter);
199            out.push(entry.ch);
200            return;
201        }
202
203        if self.ccc_buf.is_empty() {
204            out.push(starter);
205            return;
206        }
207
208        // Multiple marks: sort and emit.
209        self.ccc_buf.sort_in_place();
210        out.push(starter);
211        for entry in self.ccc_buf.as_slice() {
212            out.push(entry.ch);
213        }
214        self.ccc_buf.clear();
215    }
216
217    /// NFD-specialized feed_entry: no composition checks.
218    #[inline]
219    fn feed_entry_nfd(&mut self, ch: char, ccc: u8, out: &mut String) {
220        if ccc == 0 {
221            self.flush_nfd(out);
222            self.current_starter = Some(ch);
223        } else {
224            self.ccc_buf.push(ch, ccc);
225        }
226    }
227}
228
229// ---------------------------------------------------------------------------
230// process_char -- decompose a char and feed entries to NormState
231// ---------------------------------------------------------------------------
232
233/// Check if a code point is a CJK Unified Ideograph (CCC=0, no decomposition,
234/// no composition). These can bypass the entire decompose pipeline.
235#[inline(always)]
236fn is_cjk_unified(cp: u32) -> bool {
237    (0x4E00..=0x9FFF).contains(&cp) || (0x3400..=0x4DBF).contains(&cp)
238}
239
240// ---------------------------------------------------------------------------
241// Latin-1 supplement (U+00C0..=U+00FF) decomposition fast path
242// ---------------------------------------------------------------------------
243//
244// NFD/NFKD on the precomposed Latin-1 supplement is by far the most common
245// "needs work" payload in real-world text (`é`, `ö`, `à`, …). Each codepoint
246// otherwise pays for: trie BMP lookup + has_decomposition mask + expansion
247// table indexing + length read + slice. We can short-circuit the entire
248// pipeline because the canonical *and* compatibility decompositions in this
249// 64-codepoint range are always either (a) self-mapping (e.g. `Æ`, `Ð`,
250// `×`) or (b) `(ASCII letter, single combining mark)` — never longer, never
251// case-dependent. NFD == NFKD here, so a single table covers both forms.
252//
253// The table is indexed by `cp - 0xC0`, where `cp` is the codepoint extracted
254// from the 2-byte UTF-8 sequence with `b0 == 0xC3`. Entry encoding:
255//
256//   * Self-mapping: `(0, 0, 0)`  (the caller falls through to the general
257//     "non-decomposing" path which just bulk-passthrough's the bytes)
258//   * Decomposing:  `(starter_ascii_byte, mark_cp_u16, mark_ccc_u8)`
259//
260// Storing only the ASCII byte for the starter (instead of a `u32` codepoint)
261// keeps the table at 8 bytes/entry (with packing), which fits 64 entries in
262// 8 cache lines.
263
264/// Sentinel meaning "U+00xx maps to itself (no decomposition); use the
265/// general path to bulk-passthrough this codepoint as-is".
266const LATIN1_SELF_MAPPING: (u8, u16, u8) = (0, 0, 0);
267
268/// NFD == NFKD decomposition table for U+00C0..=U+00FF.
269///
270/// Indexed by `cp - 0xC0`. Each `(starter_ascii, mark_cp, mark_ccc)` tuple
271/// describes the canonical decomposition. `starter_ascii == 0` means the
272/// codepoint does not decompose (use the general path).
273///
274/// Hand-derived from `UnicodeData.txt` and verified at module-init time by
275/// the `latin1_table_matches_runtime_lookup` test.
276#[rustfmt::skip]
277static LATIN1_NFD_TABLE: [(u8, u16, u8); 0x40] = [
278    // U+00C0..=U+00CF
279    (b'A', 0x0300, 230), (b'A', 0x0301, 230), (b'A', 0x0302, 230), (b'A', 0x0303, 230),
280    (b'A', 0x0308, 230), (b'A', 0x030A, 230), LATIN1_SELF_MAPPING,  (b'C', 0x0327, 202),
281    (b'E', 0x0300, 230), (b'E', 0x0301, 230), (b'E', 0x0302, 230), (b'E', 0x0308, 230),
282    (b'I', 0x0300, 230), (b'I', 0x0301, 230), (b'I', 0x0302, 230), (b'I', 0x0308, 230),
283    // U+00D0..=U+00DF
284    LATIN1_SELF_MAPPING,  (b'N', 0x0303, 230), (b'O', 0x0300, 230), (b'O', 0x0301, 230),
285    (b'O', 0x0302, 230), (b'O', 0x0303, 230), (b'O', 0x0308, 230), LATIN1_SELF_MAPPING,
286    LATIN1_SELF_MAPPING,  (b'U', 0x0300, 230), (b'U', 0x0301, 230), (b'U', 0x0302, 230),
287    (b'U', 0x0308, 230), (b'Y', 0x0301, 230), LATIN1_SELF_MAPPING,  LATIN1_SELF_MAPPING,
288    // U+00E0..=U+00EF
289    (b'a', 0x0300, 230), (b'a', 0x0301, 230), (b'a', 0x0302, 230), (b'a', 0x0303, 230),
290    (b'a', 0x0308, 230), (b'a', 0x030A, 230), LATIN1_SELF_MAPPING,  (b'c', 0x0327, 202),
291    (b'e', 0x0300, 230), (b'e', 0x0301, 230), (b'e', 0x0302, 230), (b'e', 0x0308, 230),
292    (b'i', 0x0300, 230), (b'i', 0x0301, 230), (b'i', 0x0302, 230), (b'i', 0x0308, 230),
293    // U+00F0..=U+00FF
294    LATIN1_SELF_MAPPING,  (b'n', 0x0303, 230), (b'o', 0x0300, 230), (b'o', 0x0301, 230),
295    (b'o', 0x0302, 230), (b'o', 0x0303, 230), (b'o', 0x0308, 230), LATIN1_SELF_MAPPING,
296    LATIN1_SELF_MAPPING,  (b'u', 0x0300, 230), (b'u', 0x0301, 230), (b'u', 0x0302, 230),
297    (b'u', 0x0308, 230), (b'y', 0x0301, 230), LATIN1_SELF_MAPPING,  (b'y', 0x0308, 230),
298];
299
300/// NFD/NFKD fast path for the Latin-1 supplement (U+00C0..=U+00FF).
301///
302/// Only valid when the byte at `byte_pos` is `0xC3` (the only UTF-8 leading
303/// byte that emits codepoints in this range). Reads `b1`, indexes the table,
304/// and either short-circuits the decode-or-passthrough decision or returns
305/// `None` for self-mapping codepoints (which the caller falls through to
306/// the general non-decomposing path for).
307///
308/// Returns `Some((starter, mark_cp, mark_ccc))` when the codepoint decomposes
309/// to a single ASCII starter + single combining mark.
310///
311/// # Safety
312///
313/// `byte_pos + 1 < len` and the input must be valid UTF-8 starting with
314/// `0xC3` at `byte_pos` — both guaranteed by the SIMD bit-walk's leading-byte
315/// filter and the original `&str` input invariant.
316#[inline(always)]
317unsafe fn latin1_supplement_nfd(bytes: *const u8, byte_pos: usize) -> Option<(u8, char, u8)> {
318    // SAFETY: caller guarantees `byte_pos + 1 < len`.
319    let b1 = unsafe { *bytes.add(byte_pos + 1) };
320    let idx = (b1 & 0x3F) as usize; // b1 = 0x80 | (cp & 0x3F); cp = 0xC0 | (b1 & 0x3F).
321    let entry = LATIN1_NFD_TABLE[idx];
322    if entry.0 == 0 {
323        return None;
324    }
325    // SAFETY: every non-self-mapping entry stores a valid combining-mark
326    // codepoint (all in the U+0300..=U+0327 range, well-formed scalars).
327    let mark = unsafe { char::from_u32_unchecked(entry.1 as u32) };
328    Some((entry.0, mark, entry.2))
329}
330
331/// Decompose a character and feed each resulting entry into the accumulation state.
332///
333/// Uses a single trie lookup with passthrough fast-paths for non-decomposing
334/// characters, avoiding the full decomposition pipeline for the common case.
335#[inline]
336fn process_char(
337    ch: char,
338    state: &mut NormState,
339    out: &mut String,
340    form: Form,
341    decomp_buf: &mut CccBuffer,
342) {
343    let cp = ch as u32;
344
345    // Fast path: CJK ideographs never decompose, have CCC=0, and never
346    // participate in canonical composition. No trie lookup needed.
347    if cp >= 0x3400 && is_cjk_unified(cp) {
348        state.flush(out, form.composes());
349        state.current_starter = Some(ch);
350        return;
351    }
352
353    // Hangul syllables: algorithmic decomposition, no trie lookup needed.
354    if hangul::is_hangul_syllable(ch) {
355        let (l, v, t) = hangul::decompose_hangul(ch);
356        state.feed_entry(l, 0, out, form.composes());
357        state.feed_entry(v, 0, out, form.composes());
358        if let Some(t_char) = t {
359            state.feed_entry(t_char, 0, out, form.composes());
360        }
361        return;
362    }
363
364    // Single trie lookup for both passthrough check and decomposition.
365    let trie_value = tables::raw_decomp_trie_value(ch, form.decomp_form());
366
367    // Non-decomposing character: extract CCC and feed directly.
368    // This covers both starters (CCC=0) and combining marks (CCC>0)
369    // that map to themselves, skipping the full decompose pipeline.
370    if !tables::has_decomposition(trie_value) {
371        let ccc = tables::ccc_from_trie_value(trie_value);
372        state.feed_entry(ch, ccc, out, form.composes());
373        return;
374    }
375
376    // Character has a decomposition: decode from the pre-looked-up trie value.
377    decomp_buf.clear();
378    decompose::decompose_from_trie_value(ch, trie_value, decomp_buf, form.decomp_form());
379    for entry in decomp_buf.as_slice() {
380        state.feed_entry(entry.ch, entry.ccc, out, form.composes());
381    }
382}
383
384// ---------------------------------------------------------------------------
385// Unified per-codepoint decode pipeline (Component D)
386// ---------------------------------------------------------------------------
387//
388// On dense scripts (CJK, Hangul, emoji, Arabic) every byte in a 64-byte SIMD
389// chunk is a "hit", and the per-codepoint decode/CCC/decompose calls dominate
390// runtime. The unified `decode_at` performs UTF-8 decode, CCC extraction, and
391// decomposition lookup in one pass, sharing the bounds check and table-pointer
392// derivation. `process_codepoint` then dispatches form-specifically on the
393// resulting `DecodedCodepoint` without re-doing any of that work.
394
395/// Discriminator describing what kind of decomposition (if any) was looked up
396/// for a codepoint. The variant determines which decomposition table the
397/// `decomp` slice came from (or whether it is empty / singleton-encoded).
398#[derive(Clone, Copy, Debug, PartialEq, Eq)]
399enum DecompKind {
400    /// Codepoint has no decomposition mapping in the relevant form. The
401    /// codepoint passes through with its own CCC.
402    None,
403    /// Decomposition was looked up via the canonical (NFC/NFD) trie.
404    Canonical,
405    /// Decomposition was looked up via the compatibility (NFKC/NFKD) trie.
406    Compat,
407}
408
409/// Result of a unified UTF-8 + CCC + decomposition lookup at a byte position.
410///
411/// `decomp` is the expansion slice from the canonical/compat table when the
412/// codepoint has an expansion-form decomposition; for singleton decompositions
413/// (which target a single BMP codepoint encoded inside the trie value), the
414/// slice is empty and `tv` carries the trie value so the singleton can be
415/// resolved without a second lookup.
416struct DecodedCodepoint {
417    /// The decoded codepoint.
418    cp: u32,
419    /// UTF-8 byte length, in `1..=4`.
420    cp_len: u8,
421    /// Canonical Combining Class for this codepoint (0 for starters).
422    ccc: u8,
423    /// Discriminator for the `decomp` field interpretation.
424    decomp_kind: DecompKind,
425    /// Expansion data slice (empty if `decomp_kind == None` or if the
426    /// decomposition is a singleton — see `tv` for the singleton case).
427    decomp: &'static [u32],
428    /// Raw trie value from the relevant decomposition trie. Used to fall back
429    /// to the existing singleton path without a second lookup.
430    tv: u32,
431}
432
433/// Single-pass UTF-8 decode + CCC + decomposition lookup at byte index `idx`.
434///
435/// Performs the bounds check and table-pointer derivation once for all four
436/// pieces of information, replacing four separate sub-calls each with their
437/// own checks.
438///
439/// # Safety
440///
441/// - `bytes` must point to at least `len` valid bytes.
442/// - `idx` must be a valid index `< len` and must point to a UTF-8 leading
443///   byte (not a continuation byte). Both invariants are guaranteed by the
444///   SIMD scanner contract on the bit-walk: bits in the chunk mask only
445///   correspond to bytes `>= bound`, and our bound rules out the
446///   `0x80..=0xBF` range from being a leading byte.
447/// - The bytes at `idx..idx + cp_len` must form a valid UTF-8 sequence (true
448///   because `bytes` originates from a valid `&str`).
449#[inline(always)]
450unsafe fn decode_at(bytes: *const u8, idx: usize, len: usize, form: Form) -> DecodedCodepoint {
451    debug_assert!(idx < len);
452    // SAFETY: `idx < len` and `bytes` is valid for `len` bytes.
453    let b0 = unsafe { *bytes.add(idx) };
454    let cp_len = utf8::utf8_char_width(b0);
455    debug_assert!(cp_len > 0, "decode_at called on continuation/invalid byte");
456    debug_assert!(idx + cp_len <= len, "UTF-8 sequence runs past end of input");
457
458    // Decode the codepoint. The branch arms correspond to the four possible
459    // UTF-8 widths; `cp_len == 0` is impossible because the SIMD scanner only
460    // surfaces leading bytes (any continuation byte is filtered out by the
461    // caller before invoking `decode_at`).
462    let cp = match cp_len {
463        1 => b0 as u32,
464        2 => {
465            // SAFETY: `idx + 1 < len` because `cp_len == 2` and the input is
466            // valid UTF-8 with at least `cp_len` bytes available.
467            let b1 = unsafe { *bytes.add(idx + 1) } as u32;
468            ((b0 as u32 & 0x1F) << 6) | (b1 & 0x3F)
469        },
470        3 => {
471            // SAFETY: as above with `cp_len == 3`.
472            let b1 = unsafe { *bytes.add(idx + 1) } as u32;
473            let b2 = unsafe { *bytes.add(idx + 2) } as u32;
474            ((b0 as u32 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F)
475        },
476        4 => {
477            // SAFETY: as above with `cp_len == 4`.
478            let b1 = unsafe { *bytes.add(idx + 1) } as u32;
479            let b2 = unsafe { *bytes.add(idx + 2) } as u32;
480            let b3 = unsafe { *bytes.add(idx + 3) } as u32;
481            ((b0 as u32 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
482        },
483        // SAFETY: `cp_len` is one of {1,2,3,4} for a valid UTF-8 leading byte;
484        // the SIMD bit-walk only surfaces leading bytes by contract.
485        _ => unsafe { core::hint::unreachable_unchecked() },
486    };
487
488    // Single trie lookup for both CCC and decomposition. Use the unchecked
489    // supplementary path for cp >= 0x10000 (emoji etc.) to skip the BMP
490    // branch in `CodePointTrie::get`.
491    let decomp_form = form.decomp_form();
492    let tv = if cp >= 0x10000 {
493        // SAFETY: `cp` is a valid supplementary code point from a valid char.
494        unsafe { tables::raw_decomp_trie_value_supplementary(cp, decomp_form) }
495    } else {
496        // SAFETY: BMP path — `cp` is a valid Unicode scalar value.
497        let ch = unsafe { char::from_u32_unchecked(cp) };
498        tables::raw_decomp_trie_value(ch, decomp_form)
499    };
500    let ccc = tables::ccc_from_trie_value(tv);
501
502    let (decomp_kind, decomp) = if !tables::has_decomposition(tv) {
503        (DecompKind::None, &[][..])
504    } else {
505        let kind = match decomp_form {
506            DecompForm::Canonical => DecompKind::Canonical,
507            DecompForm::Compatible => DecompKind::Compat,
508        };
509        // Expansion if present; empty slice marks a singleton (handled via tv).
510        let slice = tables::expansion_data_from_trie_value(tv, decomp_form).unwrap_or(&[]);
511        (kind, slice)
512    };
513
514    DecodedCodepoint {
515        cp,
516        cp_len: cp_len as u8,
517        ccc,
518        decomp_kind,
519        decomp,
520        tv,
521    }
522}
523
524/// Feed the entries of an expansion-form decomposition into `state`. Each
525/// entry packs `(ccc, codepoint)` per `EXPANSION_CCC_SHIFT` / `EXPANSION_CP_MASK`.
526///
527/// In NFD/NFKD mode (`!composes`), specializes the 2-entry "starter +
528/// combining mark" expansion (the common precomposed-Latin / Greek / Cyrillic
529/// case) to bypass per-entry `feed_entry_nfd` overhead.
530#[inline(always)]
531fn feed_expansion(decomp: &'static [u32], state: &mut NormState, out: &mut String, composes: bool) {
532    if !composes && decomp.len() == 2 {
533        let e0 = decomp[0];
534        let ccc0 = (e0 >> tables::EXPANSION_CCC_SHIFT) as u8;
535        if ccc0 == 0 {
536            // Starter + (any second entry). Hot path on precomposed Latin.
537            state.flush_nfd(out);
538            let cp0 = e0 & tables::EXPANSION_CP_MASK;
539            debug_assert!(cp0 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp0));
540            // SAFETY: expansion data targets are valid Unicode scalar values.
541            state.current_starter = Some(unsafe { char::from_u32_unchecked(cp0) });
542            let e1 = decomp[1];
543            let cp1 = e1 & tables::EXPANSION_CP_MASK;
544            let ccc1 = (e1 >> tables::EXPANSION_CCC_SHIFT) as u8;
545            debug_assert!(cp1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp1));
546            // SAFETY: expansion data targets are valid Unicode scalar values.
547            let ch1 = unsafe { char::from_u32_unchecked(cp1) };
548            if ccc1 != 0 {
549                state.ccc_buf.push(ch1, ccc1);
550            } else {
551                state.feed_entry_nfd(ch1, 0, out);
552            }
553            return;
554        }
555    }
556    for &entry in decomp {
557        let cp = entry & tables::EXPANSION_CP_MASK;
558        let ccc = (entry >> tables::EXPANSION_CCC_SHIFT) as u8;
559        debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
560        // SAFETY: expansion data is generated from valid Unicode scalar values.
561        let exp_ch = unsafe { char::from_u32_unchecked(cp) };
562        if composes {
563            state.feed_entry(exp_ch, ccc, out, true);
564        } else {
565            state.feed_entry_nfd(exp_ch, ccc, out);
566        }
567    }
568}
569
570/// Cold path: feed a singleton decomposition (one BMP codepoint embedded in
571/// the trie value) into `state`. Singletons are rare on dense scripts where
572/// the SIMD-hit branch dominates, so we mark this branch `#[cold]` to keep
573/// the hot path (passthrough + expansion) tighter.
574#[cold]
575#[inline(never)]
576fn feed_singleton(tv: u32, state: &mut NormState, out: &mut String, composes: bool) {
577    let info = tv & 0xFFFF;
578    debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
579    // SAFETY: singleton target is a valid BMP codepoint by table construction.
580    let decomposed = unsafe { char::from_u32_unchecked(info) };
581    let ccc = if info <= 0x7F {
582        0
583    } else {
584        tables::lookup_ccc(decomposed)
585    };
586    if composes {
587        state.feed_entry(decomposed, ccc, out, true);
588    } else {
589        state.feed_entry_nfd(decomposed, ccc, out);
590    }
591}
592
593/// Helper: feed a combining mark (CCC > 0) into `state`. Combining marks
594/// drive the CCC reorder chain inside `NormState`. On dense scripts (CJK /
595/// Hangul / emoji) where the SIMD-hit branch dominates this is rare, but on
596/// scripts dominated by combining marks (Arabic, the `worst_case` base+marks
597/// stream) it is hit on every codepoint, so we keep it `#[inline]` so it
598/// folds into the bit-walk and benefits from `feed_entry_nfd`'s own inlining.
599#[inline]
600fn feed_combining_mark(ch: char, ccc: u8, state: &mut NormState, out: &mut String, composes: bool) {
601    if composes {
602        state.feed_entry(ch, ccc, out, true);
603    } else {
604        state.feed_entry_nfd(ch, ccc, out);
605    }
606}
607
608/// Drive a decoded codepoint through the form-specific accumulation state.
609///
610/// Replaces the four-call sub-pipeline (decode → ccc → decompose → process)
611/// at each bit-walk position with a single dispatch on `DecodedCodepoint`.
612#[inline(always)]
613fn process_codepoint(dc: &DecodedCodepoint, state: &mut NormState, out: &mut String, form: Form) {
614    let composes = form.composes();
615    match dc.decomp_kind {
616        DecompKind::None => {
617            // No decomposition: feed the codepoint with its CCC.
618            // SAFETY: `cp` came from a valid `&str`, so it's a valid scalar.
619            let ch = unsafe { char::from_u32_unchecked(dc.cp) };
620            if dc.ccc == 0 {
621                if composes {
622                    state.feed_entry(ch, 0, out, true);
623                } else {
624                    state.feed_entry_nfd(ch, 0, out);
625                }
626            } else {
627                feed_combining_mark(ch, dc.ccc, state, out, composes);
628            }
629        },
630        DecompKind::Canonical | DecompKind::Compat => {
631            if !dc.decomp.is_empty() {
632                feed_expansion(dc.decomp, state, out, composes);
633            } else {
634                feed_singleton(dc.tv, state, out, composes);
635            }
636        },
637    }
638}
639
640/// Compose-mode passthrough flush. Called from both the chunk loop and scalar
641/// tail after `state.flush(out, true)` when `composes == true`. Peeks at the
642/// upcoming codepoint via its already-fetched trie value (`next_tv`) and
643/// decides whether to copy the whole `pass` run verbatim or feed the final
644/// ASCII starter through `NormState` so subsequent combining marks can still
645/// see it. The caller passes `next_tv` from the unified `decode_at` so we
646/// avoid a redundant trie lookup on every SIMD-hit codepoint (notably the
647/// dense emoji / Arabic / Hangul streams where this is hit per codepoint).
648#[inline(always)]
649fn flush_compose_passthrough(pass: &str, next_tv: u32, state: &mut NormState, out: &mut String) {
650    if tables::needs_starter_shadow(next_tv) {
651        let n = pass.len();
652        if n > 1 {
653            out.push_str(&pass[..n - 1]);
654        }
655        let last_ch = pass.as_bytes()[n - 1] as char;
656        state.feed_entry(last_ch, 0, out, true);
657    } else {
658        out.push_str(pass);
659    }
660}
661
662// ---------------------------------------------------------------------------
663// normalize_scalar -- fallback for short inputs
664// ---------------------------------------------------------------------------
665
666/// Normalize a string using pure scalar processing (no SIMD).
667fn normalize_scalar<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
668    if input.is_empty() {
669        return Cow::Borrowed(input);
670    }
671
672    // Quick-check: if the string is definitely already normalized, return early.
673    if form.quick_check(input) == quick_check::IsNormalized::Yes {
674        return Cow::Borrowed(input);
675    }
676
677    let mut out = String::with_capacity(input.len());
678    let mut state = NormState::new();
679    let mut decomp_buf = CccBuffer::new();
680
681    for ch in input.chars() {
682        process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
683    }
684
685    // Flush any remaining state.
686    state.flush(&mut out, form.composes());
687
688    if out == input {
689        Cow::Borrowed(input)
690    } else {
691        Cow::Owned(out)
692    }
693}
694
695// ---------------------------------------------------------------------------
696// normalize_impl -- main SIMD-accelerated loop
697// ---------------------------------------------------------------------------
698
699/// Core normalization function.
700///
701/// Uses SIMD scanning for inputs >= 64 bytes, with scalar fallback for shorter
702/// inputs and tails. Returns `Cow::Borrowed` if the input was already normalized.
703///
704/// `#[inline]` is intentional: `Form` is dispatched as a runtime parameter
705/// from four single-form public entry points (`Nfc`/`Nfd`/`Nfkc`/`Nfkd`), and
706/// inlining lets the compiler fold all `match form { ... }` branches inside
707/// `process_codepoint`, `feed_expansion`, `decode_at`, etc. into the constant
708/// chosen at the call site. This collapses the inner loop's per-codepoint
709/// `composes` / `decomp_form()` checks to a no-op. The compile-time cost is
710/// four monomorphic copies of the function (~2× current `.text`), which we
711/// accept for the per-codepoint speedup on combining-mark-dense fixtures
712/// (`worst_case`, `already_nfc`).
713#[inline]
714fn normalize_impl<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
715    let bytes = input.as_bytes();
716    let len = bytes.len();
717
718    // Short inputs: use scalar path directly (includes quick_check).
719    if len < 64 {
720        return normalize_scalar(input, form);
721    }
722
723    // Single upfront quick-check. If definitely normalized, return early.
724    let qc = form.quick_check(input);
725    if qc == quick_check::IsNormalized::Yes {
726        return Cow::Borrowed(input);
727    }
728
729    // QC = No or Maybe: allocate and normalize.
730    let bound = form.passthrough_bound();
731    let composes = form.composes();
732    let mut out = String::with_capacity(form.estimated_capacity(len));
733    let mut last_written: usize = 0;
734    let mut state = NormState::new();
735
736    let mut pos: usize = 0;
737    let ptr = bytes.as_ptr();
738
739    // Helper: prefetch the output buffer write-head so write-allocate fills
740    // overlap the SIMD scanner read on the source.
741    macro_rules! prefetch_write_head {
742        ($out:expr) => {
743            unsafe {
744                let write_head = $out.len();
745                let distance = prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE;
746                if write_head + distance <= $out.capacity() {
747                    prefetch::prefetch_write($out.as_ptr().wrapping_add(write_head + distance));
748                }
749            }
750        };
751    }
752
753    // Bit-walk for one already-scanned 64-byte chunk. Implemented as a macro
754    // so the body inlines verbatim at every call site -- this preserves the
755    // per-`form` monomorphization and keeps the bit-walk's many early-exit
756    // branches transparent to LLVM's loop-carried-state analysis. A regular
757    // closure here introduced a measurable regression on the latin1/worst_case
758    // rows because its `&mut`-parameter borrows hid the loop carries.
759    macro_rules! process_chunk {
760        ($chunk_start:expr, $mask:expr) => {{
761            let chunk_start: usize = $chunk_start;
762            let mask: u64 = $mask;
763            if mask != 0 {
764                let mut chunk_mask = mask;
765                while chunk_mask != 0 {
766                    let bit_pos = chunk_mask.trailing_zeros() as usize;
767                    chunk_mask &= chunk_mask.wrapping_sub(1);
768
769                    let byte_pos = chunk_start + bit_pos;
770
771                    if byte_pos < last_written {
772                        continue;
773                    }
774
775                    if utf8::is_continuation_byte(bytes[byte_pos]) {
776                        continue;
777                    }
778
779                    // Latin-1 supplement NFD/NFKD fast path.
780                    if !composes && bytes[byte_pos] == 0xC3 {
781                        if let Some((starter, mark, mark_ccc)) =
782                            unsafe { latin1_supplement_nfd(ptr, byte_pos) }
783                        {
784                            if byte_pos > last_written {
785                                state.flush_nfd(&mut out);
786                                out.push_str(&input[last_written..byte_pos]);
787                            }
788                            last_written = byte_pos + 2;
789                            state.flush_nfd(&mut out);
790                            out.push(starter as char);
791                            state.ccc_buf.push(mark, mark_ccc);
792                            continue;
793                        }
794                    }
795
796                    // SAFETY: byte_pos < len and points to a UTF-8 leading byte.
797                    let dc = unsafe { decode_at(ptr, byte_pos, len, form) };
798                    let width = dc.cp_len as usize;
799
800                    if !composes {
801                        // Hangul algorithmic decomposition.
802                        if (hangul::S_BASE..hangul::S_BASE + hangul::S_COUNT).contains(&dc.cp) {
803                            if byte_pos > last_written {
804                                state.flush_nfd(&mut out);
805                                out.push_str(&input[last_written..byte_pos]);
806                            }
807                            last_written = byte_pos + width;
808                            state.flush_nfd(&mut out);
809                            let ch = unsafe { char::from_u32_unchecked(dc.cp) };
810                            // Single fused UTF-8 write (6 or 9 bytes) instead
811                            // of three `String::push(char)` calls; saves the
812                            // per-push capacity check + per-char encode loop
813                            // on dense Hangul fixtures.
814                            hangul::push_decomposed_hangul(ch, &mut out);
815                            continue;
816                        }
817                        // Non-decomposing starter: bulk-passthrough.
818                        if dc.decomp_kind == DecompKind::None && dc.ccc == 0 {
819                            continue;
820                        }
821                        if byte_pos > last_written {
822                            state.flush_nfd(&mut out);
823                            out.push_str(&input[last_written..byte_pos]);
824                        }
825                        last_written = byte_pos + width;
826                        process_codepoint(&dc, &mut state, &mut out, form);
827                        continue;
828                    }
829
830                    // Compose mode.
831                    if byte_pos > last_written {
832                        state.flush(&mut out, composes);
833                        let pass = &input[last_written..byte_pos];
834                        flush_compose_passthrough(pass, dc.tv, &mut state, &mut out);
835                    }
836                    last_written = byte_pos + width;
837                    process_codepoint(&dc, &mut state, &mut out, form);
838                }
839            }
840        }};
841    }
842
843    // Software-pipelined pair loop: process two adjacent 64-byte chunks per
844    // iteration. The paired scanner issues all 8 NEON loads / 8 compares /
845    // 8 reductions in one block, letting the compiler schedule across NEON
846    // pipes so chunk B's load+compare overlaps chunk A's reduce. On dense
847    // (CJK / Arabic / Hangul / emoji) inputs this is a measurable win; on
848    // ASCII the coarse any-set check inside `scan_chunk_pair` preserves
849    // the empty-mask fast skip.
850    while pos + 128 <= len {
851        let chunk_a_start = pos;
852        let chunk_b_start = pos + 64;
853
854        let (mask_a, mask_b) = unsafe {
855            let prefetch_l1 =
856                ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
857            let prefetch_l2 =
858                ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
859            simd::scan_pair_and_prefetch(
860                ptr.add(chunk_a_start),
861                ptr.add(chunk_b_start),
862                prefetch_l1,
863                prefetch_l2,
864                bound,
865            )
866        };
867
868        prefetch_write_head!(out);
869        process_chunk!(chunk_a_start, mask_a);
870        process_chunk!(chunk_b_start, mask_b);
871
872        pos += 128;
873    }
874
875    // Trailing single chunk (when len % 128 in [64, 128)).
876    while pos + 64 <= len {
877        let chunk_start = pos;
878
879        let mask = unsafe {
880            let prefetch_l1 =
881                ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
882            let prefetch_l2 =
883                ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
884            simd::scan_and_prefetch(ptr.add(pos), prefetch_l1, prefetch_l2, bound)
885        };
886
887        prefetch_write_head!(out);
888        process_chunk!(chunk_start, mask);
889
890        pos += 64;
891    }
892
893    // Scalar tail: remaining bytes after the last full chunk.
894    if pos < len {
895        // Check if the tail has any non-passthrough bytes.
896        let tail_has_work = bytes[pos..].iter().any(|&b| b >= bound);
897
898        if tail_has_work {
899            // Process remaining bytes character-by-character via the unified
900            // `decode_at` + `process_codepoint` pipeline (same shape as the
901            // SIMD bit-walk above).
902            let mut tail_pos = pos;
903            while tail_pos < len {
904                if tail_pos < last_written {
905                    tail_pos += 1;
906                    continue;
907                }
908
909                if utf8::is_continuation_byte(bytes[tail_pos]) {
910                    tail_pos += 1;
911                    continue;
912                }
913
914                // Latin-1 supplement NFD/NFKD fast path (mirrors bit-walk).
915                if !composes && bytes[tail_pos] == 0xC3 {
916                    // SAFETY: tail_pos + 1 < len by UTF-8 validity.
917                    if let Some((starter, mark, mark_ccc)) =
918                        unsafe { latin1_supplement_nfd(ptr, tail_pos) }
919                    {
920                        if tail_pos > last_written {
921                            state.flush_nfd(&mut out);
922                            out.push_str(&input[last_written..tail_pos]);
923                        }
924                        last_written = tail_pos + 2;
925                        state.flush_nfd(&mut out);
926                        out.push(starter as char);
927                        state.ccc_buf.push(mark, mark_ccc);
928                        tail_pos += 2;
929                        continue;
930                    }
931                }
932
933                // SAFETY: `tail_pos < len` and the byte is a UTF-8 leading
934                // byte (continuation filter above). Input is a valid `&str`.
935                let dc = unsafe { decode_at(ptr, tail_pos, len, form) };
936                let width = dc.cp_len as usize;
937
938                // Extended passthrough (NFD/NFKD): bulk-copy non-decomposing
939                // starters; algorithmic Hangul written directly to output.
940                if !composes {
941                    if (hangul::S_BASE..hangul::S_BASE + hangul::S_COUNT).contains(&dc.cp) {
942                        if tail_pos > last_written {
943                            state.flush_nfd(&mut out);
944                            out.push_str(&input[last_written..tail_pos]);
945                        }
946                        last_written = tail_pos + width;
947                        state.flush_nfd(&mut out);
948                        // SAFETY: dc.cp is a valid Hangul syllable codepoint.
949                        let ch = unsafe { char::from_u32_unchecked(dc.cp) };
950                        hangul::push_decomposed_hangul(ch, &mut out);
951                        tail_pos += width;
952                        continue;
953                    }
954                    if dc.decomp_kind == DecompKind::None && dc.ccc == 0 {
955                        tail_pos += width;
956                        continue;
957                    }
958                    if tail_pos > last_written {
959                        state.flush_nfd(&mut out);
960                        out.push_str(&input[last_written..tail_pos]);
961                    }
962                    last_written = tail_pos + width;
963                    process_codepoint(&dc, &mut state, &mut out, form);
964                    tail_pos += width;
965                    continue;
966                }
967
968                // Compose mode: copy passthrough bytes before this char.
969                if tail_pos > last_written {
970                    state.flush(&mut out, composes);
971                    let pass = &input[last_written..tail_pos];
972                    // Reuse the trie value already fetched by `decode_at`.
973                    flush_compose_passthrough(pass, dc.tv, &mut state, &mut out);
974                }
975
976                last_written = tail_pos + width;
977                process_codepoint(&dc, &mut state, &mut out, form);
978
979                tail_pos += width;
980            }
981        }
982    }
983
984    // Flush any remaining state.
985    if composes {
986        state.flush(&mut out, true);
987    } else {
988        state.flush_nfd(&mut out);
989    }
990
991    // Copy any trailing passthrough bytes.
992    if last_written < len {
993        out.push_str(&input[last_written..len]);
994    }
995
996    // For the Maybe case (NFC/NFKC only), normalization might not have changed
997    // anything. Check and return Borrowed if so.
998    if qc == quick_check::IsNormalized::Maybe && out == input {
999        Cow::Borrowed(input)
1000    } else {
1001        Cow::Owned(out)
1002    }
1003}
1004
1005// ---------------------------------------------------------------------------
1006// Public normalizer types
1007// ---------------------------------------------------------------------------
1008
1009/// NFC normalizer: Canonical Decomposition, followed by Canonical Composition.
1010pub struct NfcNormalizer;
1011
1012/// NFD normalizer: Canonical Decomposition.
1013pub struct NfdNormalizer;
1014
1015/// NFKC normalizer: Compatibility Decomposition, followed by Canonical Composition.
1016pub struct NfkcNormalizer;
1017
1018/// NFKD normalizer: Compatibility Decomposition.
1019pub struct NfkdNormalizer;
1020
1021impl Default for NfcNormalizer {
1022    fn default() -> Self {
1023        Self::new()
1024    }
1025}
1026
1027impl Default for NfdNormalizer {
1028    fn default() -> Self {
1029        Self::new()
1030    }
1031}
1032
1033impl Default for NfkcNormalizer {
1034    fn default() -> Self {
1035        Self::new()
1036    }
1037}
1038
1039impl Default for NfkdNormalizer {
1040    fn default() -> Self {
1041        Self::new()
1042    }
1043}
1044
1045impl NfcNormalizer {
1046    /// Create a new NFC normalizer.
1047    pub fn new() -> Self {
1048        NfcNormalizer
1049    }
1050
1051    /// Run the NFC quick-check algorithm on `input`.
1052    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1053        quick_check::quick_check_nfc(input)
1054    }
1055
1056    /// Normalize the input string to NFC form.
1057    ///
1058    /// Returns `Cow::Borrowed` if the input is already in NFC.
1059    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1060        normalize_impl(input, Form::Nfc)
1061    }
1062
1063    /// Normalize the input string to NFC form, appending to `out`.
1064    ///
1065    /// Returns `true` if the input was already normalized (nothing was modified).
1066    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1067        let result = normalize_impl(input, Form::Nfc);
1068        let already_normalized = matches!(&result, Cow::Borrowed(_));
1069        out.push_str(&result);
1070        already_normalized
1071    }
1072
1073    /// Check if the input is already in NFC form.
1074    pub fn is_normalized(&self, input: &str) -> bool {
1075        quick_check::is_normalized_nfc(input)
1076    }
1077}
1078
1079impl NfdNormalizer {
1080    /// Create a new NFD normalizer.
1081    pub fn new() -> Self {
1082        NfdNormalizer
1083    }
1084
1085    /// Run the NFD quick-check algorithm on `input`.
1086    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1087        quick_check::quick_check_nfd(input)
1088    }
1089
1090    /// Normalize the input string to NFD form.
1091    ///
1092    /// Returns `Cow::Borrowed` if the input is already in NFD.
1093    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1094        normalize_impl(input, Form::Nfd)
1095    }
1096
1097    /// Normalize the input string to NFD form, appending to `out`.
1098    ///
1099    /// Returns `true` if the input was already normalized (nothing was modified).
1100    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1101        let result = normalize_impl(input, Form::Nfd);
1102        let already_normalized = matches!(&result, Cow::Borrowed(_));
1103        out.push_str(&result);
1104        already_normalized
1105    }
1106
1107    /// Check if the input is already in NFD form.
1108    pub fn is_normalized(&self, input: &str) -> bool {
1109        quick_check::is_normalized_nfd(input)
1110    }
1111}
1112
1113impl NfkcNormalizer {
1114    /// Create a new NFKC normalizer.
1115    pub fn new() -> Self {
1116        NfkcNormalizer
1117    }
1118
1119    /// Run the NFKC quick-check algorithm on `input`.
1120    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1121        quick_check::quick_check_nfkc(input)
1122    }
1123
1124    /// Normalize the input string to NFKC form.
1125    ///
1126    /// Returns `Cow::Borrowed` if the input is already in NFKC.
1127    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1128        normalize_impl(input, Form::Nfkc)
1129    }
1130
1131    /// Normalize the input string to NFKC form, appending to `out`.
1132    ///
1133    /// Returns `true` if the input was already normalized (nothing was modified).
1134    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1135        let result = normalize_impl(input, Form::Nfkc);
1136        let already_normalized = matches!(&result, Cow::Borrowed(_));
1137        out.push_str(&result);
1138        already_normalized
1139    }
1140
1141    /// Check if the input is already in NFKC form.
1142    pub fn is_normalized(&self, input: &str) -> bool {
1143        quick_check::is_normalized_nfkc(input)
1144    }
1145}
1146
1147impl NfkdNormalizer {
1148    /// Create a new NFKD normalizer.
1149    pub fn new() -> Self {
1150        NfkdNormalizer
1151    }
1152
1153    /// Run the NFKD quick-check algorithm on `input`.
1154    pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1155        quick_check::quick_check_nfkd(input)
1156    }
1157
1158    /// Normalize the input string to NFKD form.
1159    ///
1160    /// Returns `Cow::Borrowed` if the input is already in NFKD.
1161    pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1162        normalize_impl(input, Form::Nfkd)
1163    }
1164
1165    /// Normalize the input string to NFKD form, appending to `out`.
1166    ///
1167    /// Returns `true` if the input was already normalized (nothing was modified).
1168    pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1169        let result = normalize_impl(input, Form::Nfkd);
1170        let already_normalized = matches!(&result, Cow::Borrowed(_));
1171        out.push_str(&result);
1172        already_normalized
1173    }
1174
1175    /// Check if the input is already in NFKD form.
1176    pub fn is_normalized(&self, input: &str) -> bool {
1177        quick_check::is_normalized_nfkd(input)
1178    }
1179}
1180
1181// ---------------------------------------------------------------------------
1182// Unit tests
1183// ---------------------------------------------------------------------------
1184
1185#[cfg(test)]
1186mod tests {
1187    use super::*;
1188    use alloc::borrow::Cow;
1189    use alloc::string::String;
1190    use alloc::vec::Vec;
1191
1192    // ===================================================================
1193    // 0. Latin-1 NFD/NFKD fast-path table verification
1194    // ===================================================================
1195
1196    #[test]
1197    fn latin1_table_matches_runtime_lookup_nfd() {
1198        // For every codepoint in U+00C0..=U+00FF, verify that our hand-rolled
1199        // table entry produces the same NFD output as feeding the codepoint
1200        // through the general trie-driven pipeline.
1201        for cp in 0xC0u32..=0xFF {
1202            let ch = char::from_u32(cp).unwrap();
1203            let mut buf = String::new();
1204            buf.push(ch);
1205            let general: Cow<'_, str> = normalize_impl(&buf, Form::Nfd);
1206
1207            let entry = LATIN1_NFD_TABLE[(cp - 0xC0) as usize];
1208            let mut fast = String::new();
1209            if entry.0 == 0 {
1210                // Self-mapping: must equal input.
1211                fast.push(ch);
1212            } else {
1213                fast.push(entry.0 as char);
1214                fast.push(char::from_u32(entry.1 as u32).unwrap());
1215            }
1216            assert_eq!(
1217                &*general, fast,
1218                "NFD mismatch for U+{:04X}: trie={:?} table={:?}",
1219                cp, &*general, fast
1220            );
1221        }
1222    }
1223
1224    #[test]
1225    fn latin1_table_matches_runtime_lookup_nfkd() {
1226        // NFKD == NFD on this range; verify explicitly.
1227        for cp in 0xC0u32..=0xFF {
1228            let ch = char::from_u32(cp).unwrap();
1229            let mut buf = String::new();
1230            buf.push(ch);
1231            let nfd: Cow<'_, str> = normalize_impl(&buf, Form::Nfd);
1232            let nfkd: Cow<'_, str> = normalize_impl(&buf, Form::Nfkd);
1233            assert_eq!(
1234                &*nfd, &*nfkd,
1235                "NFD/NFKD diverge for U+{:04X}: nfd={:?} nfkd={:?}",
1236                cp, &*nfd, &*nfkd
1237            );
1238        }
1239    }
1240
1241    // ===================================================================
1242    // 1. Form enum methods
1243    // ===================================================================
1244
1245    #[test]
1246    fn passthrough_bound_all_forms_return_0xc0() {
1247        assert_eq!(Form::Nfc.passthrough_bound(), 0xC0);
1248        assert_eq!(Form::Nfd.passthrough_bound(), 0xC0);
1249        assert_eq!(Form::Nfkc.passthrough_bound(), 0xC0);
1250        assert_eq!(Form::Nfkd.passthrough_bound(), 0xC0);
1251    }
1252
1253    #[test]
1254    fn composes_nfc_nfkc_true_nfd_nfkd_false() {
1255        assert!(Form::Nfc.composes());
1256        assert!(Form::Nfkc.composes());
1257        assert!(!Form::Nfd.composes());
1258        assert!(!Form::Nfkd.composes());
1259    }
1260
1261    #[test]
1262    fn decomp_form_canonical_vs_compatible() {
1263        assert_eq!(Form::Nfc.decomp_form(), DecompForm::Canonical);
1264        assert_eq!(Form::Nfd.decomp_form(), DecompForm::Canonical);
1265        assert_eq!(Form::Nfkc.decomp_form(), DecompForm::Compatible);
1266        assert_eq!(Form::Nfkd.decomp_form(), DecompForm::Compatible);
1267    }
1268
1269    #[test]
1270    fn estimated_capacity_nfc_nfkc_same_nfd_nfkd_larger() {
1271        let input_len = 100;
1272        assert_eq!(Form::Nfc.estimated_capacity(input_len), 100);
1273        assert_eq!(Form::Nfkc.estimated_capacity(input_len), 100);
1274        assert_eq!(Form::Nfd.estimated_capacity(input_len), 150);
1275        assert_eq!(Form::Nfkd.estimated_capacity(input_len), 150);
1276    }
1277
1278    #[test]
1279    fn estimated_capacity_zero_length() {
1280        assert_eq!(Form::Nfc.estimated_capacity(0), 0);
1281        assert_eq!(Form::Nfd.estimated_capacity(0), 0);
1282    }
1283
1284    #[test]
1285    fn quick_check_ascii_is_yes_for_all_forms() {
1286        let ascii = "Hello, World!";
1287        assert_eq!(Form::Nfc.quick_check(ascii), quick_check::IsNormalized::Yes);
1288        assert_eq!(Form::Nfd.quick_check(ascii), quick_check::IsNormalized::Yes);
1289        assert_eq!(
1290            Form::Nfkc.quick_check(ascii),
1291            quick_check::IsNormalized::Yes
1292        );
1293        assert_eq!(
1294            Form::Nfkd.quick_check(ascii),
1295            quick_check::IsNormalized::Yes
1296        );
1297    }
1298
1299    // ===================================================================
1300    // 2. NormState state machine
1301    // ===================================================================
1302
1303    #[test]
1304    fn normstate_new_has_no_starter_empty_ccc_buf() {
1305        let state = NormState::new();
1306        assert!(state.current_starter.is_none());
1307        assert!(state.ccc_buf.is_empty());
1308    }
1309
1310    #[test]
1311    fn feed_entry_single_starter_sets_current_starter() {
1312        let mut state = NormState::new();
1313        let mut out = String::new();
1314        // Feed a starter (CCC=0)
1315        state.feed_entry('A', 0, &mut out, false);
1316        assert_eq!(state.current_starter, Some('A'));
1317        assert!(state.ccc_buf.is_empty());
1318        assert!(out.is_empty()); // No flush yet
1319    }
1320
1321    #[test]
1322    fn feed_entry_combining_mark_buffers_in_ccc_buf() {
1323        let mut state = NormState::new();
1324        let mut out = String::new();
1325        // Set up a starter first
1326        state.feed_entry('e', 0, &mut out, false);
1327        // Feed combining acute (CCC=230)
1328        state.feed_entry('\u{0301}', 230, &mut out, false);
1329        assert_eq!(state.current_starter, Some('e'));
1330        assert!(!state.ccc_buf.is_empty());
1331        assert_eq!(state.ccc_buf.len(), 1);
1332        assert_eq!(state.ccc_buf.as_slice()[0].ch, '\u{0301}');
1333        assert_eq!(state.ccc_buf.as_slice()[0].ccc, 230);
1334    }
1335
1336    #[test]
1337    fn feed_entry_two_starters_first_gets_flushed() {
1338        let mut state = NormState::new();
1339        let mut out = String::new();
1340        // Feed first starter
1341        state.feed_entry('A', 0, &mut out, false);
1342        assert!(out.is_empty());
1343        // Feed second starter -- first should be flushed to `out`
1344        state.feed_entry('B', 0, &mut out, false);
1345        assert_eq!(out, "A");
1346        assert_eq!(state.current_starter, Some('B'));
1347    }
1348
1349    #[test]
1350    fn feed_entry_starter_to_starter_composition_hangul_lv() {
1351        let mut state = NormState::new();
1352        let mut out = String::new();
1353        // Hangul L
1354        state.feed_entry('\u{1100}', 0, &mut out, true);
1355        // Hangul V -- should compose with L in compose mode
1356        state.feed_entry('\u{1161}', 0, &mut out, true);
1357        // The composed syllable should be the current starter
1358        assert_eq!(state.current_starter, Some('\u{AC00}'));
1359        // Nothing flushed yet
1360        assert!(out.is_empty());
1361    }
1362
1363    #[test]
1364    fn feed_entry_starter_to_starter_composition_e_acute() {
1365        let mut state = NormState::new();
1366        let mut out = String::new();
1367        // In compose mode, 'e' followed by combining acute (CCC=230)
1368        // is not starter-to-starter, but let's test the compose path
1369        // with a combining mark that composes.
1370        state.feed_entry('e', 0, &mut out, true);
1371        state.feed_entry('\u{0301}', 230, &mut out, true);
1372        // Now flush to get the composed result
1373        state.flush(&mut out, true);
1374        assert_eq!(out, "\u{00E9}"); // e-acute
1375    }
1376
1377    #[test]
1378    fn feed_entry_nfd_starters_and_combining_marks() {
1379        let mut state = NormState::new();
1380        let mut out = String::new();
1381        // Feed starter
1382        state.feed_entry_nfd('A', 0, &mut out);
1383        assert_eq!(state.current_starter, Some('A'));
1384        // Feed combining grave (CCC=230)
1385        state.feed_entry_nfd('\u{0300}', 230, &mut out);
1386        assert_eq!(state.ccc_buf.len(), 1);
1387        // Feed new starter -- flushes A + combining grave
1388        state.feed_entry_nfd('B', 0, &mut out);
1389        assert_eq!(out, "A\u{0300}");
1390        assert_eq!(state.current_starter, Some('B'));
1391    }
1392
1393    // ===================================================================
1394    // 3. NormState flush() and flush_nfd()
1395    // ===================================================================
1396
1397    #[test]
1398    fn flush_no_starter_no_marks_nothing_emitted() {
1399        let mut state = NormState::new();
1400        let mut out = String::new();
1401        state.flush(&mut out, false);
1402        assert!(out.is_empty());
1403        state.flush(&mut out, true);
1404        assert!(out.is_empty());
1405    }
1406
1407    #[test]
1408    fn flush_starter_only_emits_starter() {
1409        let mut state = NormState::new();
1410        let mut out = String::new();
1411        state.current_starter = Some('X');
1412        state.flush(&mut out, false);
1413        assert_eq!(out, "X");
1414    }
1415
1416    #[test]
1417    fn flush_starter_one_combining_mark_no_compose() {
1418        let mut state = NormState::new();
1419        let mut out = String::new();
1420        state.current_starter = Some('e');
1421        state.ccc_buf.push('\u{0301}', 230); // combining acute
1422        state.flush(&mut out, false);
1423        assert_eq!(out, "e\u{0301}");
1424    }
1425
1426    #[test]
1427    fn flush_starter_one_combining_mark_with_compose() {
1428        let mut state = NormState::new();
1429        let mut out = String::new();
1430        state.current_starter = Some('e');
1431        state.ccc_buf.push('\u{0301}', 230); // combining acute
1432        state.flush(&mut out, true);
1433        assert_eq!(out, "\u{00E9}"); // e-acute composed
1434    }
1435
1436    #[test]
1437    fn flush_starter_multiple_ccc_disordered_marks_emits_sorted() {
1438        let mut state = NormState::new();
1439        let mut out = String::new();
1440        state.current_starter = Some('a');
1441        // Push marks in wrong CCC order: 230, 220, 202
1442        state.ccc_buf.push('\u{0301}', 230); // combining acute, CCC=230
1443        state.ccc_buf.push('\u{0323}', 220); // combining dot below, CCC=220
1444        state.ccc_buf.push('\u{0327}', 202); // combining cedilla, CCC=202
1445        state.flush(&mut out, false);
1446        // Should emit starter + marks sorted by CCC: 202, 220, 230
1447        let chars: Vec<char> = out.chars().collect();
1448        assert_eq!(chars[0], 'a');
1449        assert_eq!(chars[1], '\u{0327}'); // CCC=202
1450        assert_eq!(chars[2], '\u{0323}'); // CCC=220
1451        assert_eq!(chars[3], '\u{0301}'); // CCC=230
1452    }
1453
1454    #[test]
1455    fn flush_orphan_combining_marks_no_starter_emits_sorted() {
1456        let mut state = NormState::new();
1457        let mut out = String::new();
1458        // No starter set, just orphan combining marks
1459        state.ccc_buf.push('\u{0301}', 230); // CCC=230
1460        state.ccc_buf.push('\u{0327}', 202); // CCC=202
1461        state.flush(&mut out, false);
1462        let chars: Vec<char> = out.chars().collect();
1463        assert_eq!(chars.len(), 2);
1464        assert_eq!(chars[0], '\u{0327}'); // CCC=202 first
1465        assert_eq!(chars[1], '\u{0301}'); // CCC=230 second
1466    }
1467
1468    #[test]
1469    fn flush_nfd_no_starter_no_marks_nothing_emitted() {
1470        let mut state = NormState::new();
1471        let mut out = String::new();
1472        state.flush_nfd(&mut out);
1473        assert!(out.is_empty());
1474    }
1475
1476    #[test]
1477    fn flush_nfd_starter_only_emits_starter() {
1478        let mut state = NormState::new();
1479        let mut out = String::new();
1480        state.current_starter = Some('Z');
1481        state.flush_nfd(&mut out);
1482        assert_eq!(out, "Z");
1483    }
1484
1485    #[test]
1486    fn flush_nfd_single_mark_fast_path_take_single_inline() {
1487        let mut state = NormState::new();
1488        let mut out = String::new();
1489        state.current_starter = Some('e');
1490        state.ccc_buf.push('\u{0301}', 230); // single combining mark
1491        // This should hit the take_single_inline fast path in flush_nfd
1492        state.flush_nfd(&mut out);
1493        assert_eq!(out, "e\u{0301}");
1494        // Buffer should be cleared
1495        assert!(state.ccc_buf.is_empty());
1496    }
1497
1498    #[test]
1499    fn flush_nfd_multiple_marks_sorted() {
1500        let mut state = NormState::new();
1501        let mut out = String::new();
1502        state.current_starter = Some('o');
1503        state.ccc_buf.push('\u{0301}', 230); // CCC=230
1504        state.ccc_buf.push('\u{0327}', 202); // CCC=202
1505        state.flush_nfd(&mut out);
1506        let chars: Vec<char> = out.chars().collect();
1507        assert_eq!(chars[0], 'o');
1508        assert_eq!(chars[1], '\u{0327}'); // CCC=202
1509        assert_eq!(chars[2], '\u{0301}'); // CCC=230
1510    }
1511
1512    #[test]
1513    fn flush_nfd_orphan_combining_marks_no_starter() {
1514        let mut state = NormState::new();
1515        let mut out = String::new();
1516        state.ccc_buf.push('\u{0301}', 230);
1517        state.ccc_buf.push('\u{0323}', 220);
1518        state.flush_nfd(&mut out);
1519        let chars: Vec<char> = out.chars().collect();
1520        assert_eq!(chars.len(), 2);
1521        assert_eq!(chars[0], '\u{0323}'); // CCC=220
1522        assert_eq!(chars[1], '\u{0301}'); // CCC=230
1523    }
1524
1525    // ===================================================================
1526    // 4. normalize_impl() Cow::Borrowed path
1527    // ===================================================================
1528
1529    #[test]
1530    fn normalize_impl_nfc_already_normalized_returns_borrowed() {
1531        // U+00C5 (A with ring) followed by U+0300 (combining grave).
1532        // This is already in NFC -- the quick check should return Maybe
1533        // (because U+0300 has NFC_QC=Maybe), but after normalization,
1534        // the output equals input, so Cow::Borrowed is returned.
1535        let input = "\u{00C5}\u{0300}";
1536        let result = normalize_impl(input, Form::Nfc);
1537        assert!(
1538            matches!(result, Cow::Borrowed(_)),
1539            "Expected Cow::Borrowed for already-NFC input with Maybe QC, got Cow::Owned({:?})",
1540            result
1541        );
1542        assert_eq!(&*result, input);
1543    }
1544
1545    #[test]
1546    fn normalize_impl_nfc_maybe_borrowed_simd_path() {
1547        // Exercise the SIMD normalize_impl Maybe->Borrowed code path (line 720-721).
1548        // Input must be >= 64 bytes and trigger QC=Maybe but produce identical output.
1549        // 60 bytes of ASCII padding + "\u{00C5}\u{0300}" (already NFC, QC=Maybe).
1550        let mut input = String::new();
1551        input.push_str(&"a".repeat(60));
1552        input.push_str("\u{00C5}\u{0300}"); // Å + combining grave, already NFC
1553        assert!(input.len() >= 64, "input must be >= 64 bytes for SIMD path");
1554        let result = normalize_impl(&input, Form::Nfc);
1555        assert!(
1556            matches!(result, Cow::Borrowed(_)),
1557            "Expected Cow::Borrowed for >=64 byte already-NFC input with Maybe QC, got Cow::Owned({:?})",
1558            result
1559        );
1560        assert_eq!(&*result, &*input);
1561    }
1562
1563    #[test]
1564    fn normalize_impl_ascii_returns_borrowed() {
1565        let input = "Hello, world!";
1566        let result = normalize_impl(input, Form::Nfc);
1567        assert!(matches!(result, Cow::Borrowed(_)));
1568        assert_eq!(&*result, input);
1569    }
1570
1571    #[test]
1572    fn normalize_impl_nfd_already_decomposed_returns_borrowed() {
1573        // "e" + combining acute is already NFD
1574        let input = "e\u{0301}";
1575        let result = normalize_impl(input, Form::Nfd);
1576        assert!(
1577            matches!(result, Cow::Borrowed(_)),
1578            "Expected Cow::Borrowed for already-NFD input"
1579        );
1580    }
1581
1582    #[test]
1583    fn normalize_impl_nfc_not_normalized_returns_owned() {
1584        // NFD form of e-acute: "e" + combining acute -- not NFC
1585        let input = "e\u{0301}";
1586        let result = normalize_impl(input, Form::Nfc);
1587        assert!(matches!(result, Cow::Owned(_)));
1588        assert_eq!(&*result, "\u{00E9}");
1589    }
1590
1591    // ===================================================================
1592    // 5. is_cjk_unified() boundary tests
1593    // ===================================================================
1594
1595    #[test]
1596    fn cjk_unified_extension_a_start() {
1597        assert!(is_cjk_unified(0x3400));
1598    }
1599
1600    #[test]
1601    fn cjk_unified_extension_a_end() {
1602        assert!(is_cjk_unified(0x4DBF));
1603    }
1604
1605    #[test]
1606    fn cjk_unified_main_start() {
1607        assert!(is_cjk_unified(0x4E00));
1608    }
1609
1610    #[test]
1611    fn cjk_unified_main_end() {
1612        assert!(is_cjk_unified(0x9FFF));
1613    }
1614
1615    #[test]
1616    fn cjk_unified_just_before_extension_a() {
1617        assert!(!is_cjk_unified(0x33FF));
1618    }
1619
1620    #[test]
1621    fn cjk_unified_gap_between_extension_a_and_main() {
1622        assert!(!is_cjk_unified(0x4DC0));
1623    }
1624
1625    #[test]
1626    fn cjk_unified_just_after_main() {
1627        assert!(!is_cjk_unified(0xA000));
1628    }
1629}
simd_normalizer/normalizer.rs

simd_normalizer/
normalizer.rs