Skip to main content

simd_normalizer/
quick_check.rs

1// src/quick_check.rs
2
3//! Quick-check for normalization forms (UAX#15 Section 9).
4//!
5//! Uses SIMD scanning to skip safe chunks in bulk for inputs >= 64 bytes.
6//! Form-specific SIMD bounds and code-point range fast paths avoid trie
7//! lookups for the vast majority of BMP characters.
8
9use crate::simd;
10use crate::tables;
11use crate::utf8;
12
/// Result of a quick-check test.
///
/// `Yes` and `No` are definitive answers. `Maybe` means the quick-check alone
/// could not decide; the `is_normalized_*` helpers in this module resolve it by
/// normalizing the input and comparing the result with the original.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IsNormalized {
    /// The string is definitely in the target normalization form.
    Yes,
    /// The string is definitely *not* in the target normalization form.
    No,
    /// The string *might* not be normalized; a full check is required.
    Maybe,
}
23
24/// Convert a QC trie value (0=Y, 1=M, 2=N) to IsNormalized.
25#[inline]
26fn qc_value_to_result(v: u8) -> IsNormalized {
27    match v {
28        0 => IsNormalized::Yes,
29        1 => IsNormalized::Maybe,
30        _ => IsNormalized::No,
31    }
32}
33
/// Returns `true` for BMP CJK Unified Ideographs (U+4E00..=U+9FFF) and
/// CJK Extension A (U+3400..=U+4DBF).
///
/// Callers treat these code points as CCC=0 and QC=Yes for every
/// normalization form, so they bypass the trie lookup.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
40
/// Range test for supplementary code points (callers pass `cp >= 0x10000`)
/// that are treated as CCC=0 and QC=Yes for all normalization forms.
///
/// The ranges are hard-coded; this function does not consult the tables:
/// - U+1F252..=U+1FBEF (Plane 1 emoji and symbols window) is accepted.
/// - Everything from U+20000 upwards is accepted, except the CJK Compatibility
///   Ideographs Supplement block (U+2F800..=U+2FA1F).
/// - Any other value is rejected, so the caller falls back to the trie.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        // Plane 1: core emoji and symbols window.
        0x1F252..=0x1FBEF => true,
        // Plane 2 exception: CJK Compatibility Ideographs Supplement.
        0x2F800..=0x2FA1F => false,
        // Remaining Plane 2+ values are safe; anything lower is not.
        _ => cp >= 0x20000,
    }
}
54
/// Returns `true` for the Hiragana/Katakana code points that callers treat as
/// CCC=0 and QC=Yes for NFC/NFKC.
///
/// Accepted: U+3041..=U+3098 (Hiragana base), U+309D and U+309E (Hiragana
/// iteration marks), and U+30A0..=U+30FE (Katakana).
///
/// Deliberately left out: the combining marks U+3099-309A (CCC>0), the
/// NFKC-decomposing U+309B-309C, U+309F (ゟ) and U+30FF (ヿ).
#[inline(always)]
fn is_kana(cp: u32) -> bool {
    matches!(cp, 0x3041..=0x3098 | 0x309D | 0x309E | 0x30A0..=0x30FE)
}
68
69/// Generic quick-check implementation.
70///
71/// For inputs >= 64 bytes, uses SIMD scanning to skip chunks in bulk.
72/// For shorter inputs, falls back to a scalar character-by-character loop.
73/// Returns as soon as a definitive No is found.
74///
75/// # Parameters
76/// - `qc_shift`: bit shift to extract this form's 2-bit QC from the fused CCC+QC trie.
77/// - `simd_bound`: SIMD scan threshold; bytes below this are skipped in bulk.
78///   For NFC this is 0xCC (all chars below U+0300 are safe), for other forms 0xC0.
79/// - `safe_below`: code point below which CCC=0 and QC=Yes is guaranteed.
80/// - `hangul_safe`: whether Hangul Syllables (U+AC00..U+D7A3) are QC=Yes for this form.
81/// - `kana_safe`: whether Hiragana/Katakana (U+3040..U+30FF) are QC=Yes for this form.
82#[inline]
83fn quick_check_impl(
84    input: &str,
85    qc_shift: u32,
86    simd_bound: u8,
87    safe_below: u32,
88    hangul_safe: bool,
89    kana_safe: bool,
90) -> IsNormalized {
91    let bytes = input.as_bytes();
92    let len = bytes.len();
93
94    if len < 64 {
95        return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe);
96    }
97
98    let ptr = bytes.as_ptr();
99
100    let mut last_ccc: u8 = 0;
101    let mut result = IsNormalized::Yes;
102    // Byte offset past the last character we've examined.
103    let mut processed_up_to: usize = 0;
104    let mut pos: usize = 0;
105
106    // SIMD chunk loop: skip chunks where all bytes < simd_bound in bulk.
107    while pos + 64 <= len {
108        // SAFETY: pos + 64 <= len, so ptr.add(pos) is valid for 64 bytes.
109        let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
110        let chunk_end = pos + 64;
111
112        if mask == 0 {
113            // All bytes < simd_bound — characters in this chunk are either ASCII
114            // or known-safe non-ASCII (CCC=0, QC=Yes). CCC resets to 0.
115            last_ccc = 0;
116            processed_up_to = chunk_end;
117            pos = chunk_end;
118            continue;
119        }
120
121        // Walk set bits — each is a lead byte of a character that needs inspection.
122        let chunk_start = pos;
123        let mut chunk_mask = mask;
124        while chunk_mask != 0 {
125            let bit_pos = chunk_mask.trailing_zeros() as usize;
126            chunk_mask &= chunk_mask.wrapping_sub(1); // clear lowest set bit
127
128            let byte_pos = chunk_start + bit_pos;
129
130            // Skip bytes already covered by a previous multi-byte decode.
131            if byte_pos < processed_up_to {
132                continue;
133            }
134
135            // Gap before this lead byte → safe characters → CCC resets to 0.
136            if byte_pos > processed_up_to {
137                last_ccc = 0;
138            }
139
140            // Decode the character at this position.
141            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
142            processed_up_to = byte_pos + width;
143
144            // Fast path: known-safe code point ranges (CCC=0 and QC=Yes).
145            let cp = ch as u32;
146            if cp < safe_below
147                || is_cjk_unified(cp)
148                || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
149                || (kana_safe && is_kana(cp))
150                || (cp >= 0x10000 && is_supp_safe(cp))
151            {
152                last_ccc = 0;
153                continue;
154            }
155
156            // Fused CCC + QC lookup (single trie access).
157            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
158            if ccc != 0 && last_ccc > ccc {
159                return IsNormalized::No;
160            }
161
162            // Check QC property.
163            match qc_value_to_result(qc) {
164                IsNormalized::No => return IsNormalized::No,
165                IsNormalized::Maybe => result = IsNormalized::Maybe,
166                IsNormalized::Yes => {},
167            }
168
169            last_ccc = ccc;
170        }
171
172        // Trailing safe bytes in this chunk after the last flagged char.
173        if processed_up_to < chunk_end {
174            last_ccc = 0;
175            processed_up_to = chunk_end;
176        }
177
178        pos = chunk_end;
179    }
180
181    // Scalar tail for remaining bytes after the last full 64-byte chunk.
182    let tail_start = processed_up_to.max(pos);
183    if tail_start > processed_up_to {
184        // Gap of safe characters between last processed char and tail start.
185        last_ccc = 0;
186    }
187    let mut tail_pos = tail_start;
188    while tail_pos < len {
189        let b = bytes[tail_pos];
190        if b < 0x80 {
191            // ASCII: CCC=0, QC=Yes for all forms.
192            last_ccc = 0;
193            tail_pos += 1;
194            continue;
195        }
196        // Skip continuation bytes from a character that crossed the chunk/tail
197        // boundary. Its lead byte was < simd_bound, so it is safe (CCC=0, QC=Yes).
198        if utf8::is_continuation_byte(b) {
199            tail_pos += 1;
200            continue;
201        }
202        // Lead byte of a non-ASCII character.
203        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
204
205        // Fast path: known-safe code point ranges.
206        let cp = ch as u32;
207        if cp < safe_below
208            || is_cjk_unified(cp)
209            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
210            || (cp >= 0x10000 && is_supp_safe(cp))
211        {
212            last_ccc = 0;
213            tail_pos += width;
214            continue;
215        }
216
217        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
218        if ccc != 0 && last_ccc > ccc {
219            return IsNormalized::No;
220        }
221        match qc_value_to_result(qc) {
222            IsNormalized::No => return IsNormalized::No,
223            IsNormalized::Maybe => result = IsNormalized::Maybe,
224            IsNormalized::Yes => {},
225        }
226        last_ccc = ccc;
227        tail_pos += width;
228    }
229
230    result
231}
232
233/// Scalar quick-check for short inputs (< 64 bytes).
234#[inline]
235fn quick_check_scalar(
236    input: &str,
237    qc_shift: u32,
238    safe_below: u32,
239    hangul_safe: bool,
240    kana_safe: bool,
241) -> IsNormalized {
242    let mut last_ccc: u8 = 0;
243    let mut result = IsNormalized::Yes;
244
245    for ch in input.chars() {
246        let cp = ch as u32;
247
248        // ASCII fast path
249        if cp <= 0x7F {
250            last_ccc = 0;
251            continue;
252        }
253
254        // Fast path: known-safe code point ranges (CCC=0 and QC=Yes).
255        if cp < safe_below
256            || is_cjk_unified(cp)
257            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
258            || (kana_safe && is_kana(cp))
259            || (cp >= 0x10000 && is_supp_safe(cp))
260        {
261            last_ccc = 0;
262            continue;
263        }
264
265        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
266
267        // CCC must be non-decreasing among non-zero values.
268        if ccc != 0 && last_ccc > ccc {
269            return IsNormalized::No;
270        }
271
272        match qc_value_to_result(qc) {
273            IsNormalized::No => return IsNormalized::No,
274            IsNormalized::Maybe => result = IsNormalized::Maybe,
275            IsNormalized::Yes => {},
276        }
277
278        last_ccc = ccc;
279    }
280
281    result
282}
283
284// ---------------------------------------------------------------------------
285// SIMD bound and safe-below thresholds by normalization form
286// ---------------------------------------------------------------------------
287//
288// NFC:  simd_bound=0xCC, safe_below=0x0300, hangul_safe=true, kana_safe=true
289//       All chars U+0000..U+02FF have CCC=0 and NFC_QC=Yes.
290//       The first CCC != 0 is U+0300 (lead byte 0xCC).
291//       CJK Unified, Hangul Syllables, and Hiragana/Katakana are NFC-safe.
292//
293// NFD:  simd_bound=0xC3, safe_below=0x00C0, hangul_safe=false, kana_safe=false
294//       U+00C0 is first NFD_QC=No (lead byte 0xC3).
295//       Hangul Syllables and some kana have NFD_QC=No (they decompose).
296//
297// NFKC: simd_bound=0xC0, safe_below=0x00A0, hangul_safe=true, kana_safe=true
298//       U+00A0 is first NFKC_QC=No (NBSP → SPACE).
299//       Kana are NFKC-safe (only halfwidth/enclosed forms decompose, in other blocks).
300//
301// NFKD: simd_bound=0xC0, safe_below=0x00A0, hangul_safe=false, kana_safe=false
302//       Same as NFKC threshold, but Hangul and some kana decompose.
303
304/// Quick-check whether `input` is in NFC.
305#[cfg(not(feature = "quick_check_oracle"))]
306pub(crate) fn quick_check_nfc(input: &str) -> IsNormalized {
307    quick_check_impl(input, tables::CCC_QC_NFC_SHIFT, 0xCC, 0x0300, true, true)
308}
309
310/// Quick-check whether `input` is in NFC.
311#[cfg(feature = "quick_check_oracle")]
312pub fn quick_check_nfc(input: &str) -> IsNormalized {
313    quick_check_impl(input, tables::CCC_QC_NFC_SHIFT, 0xCC, 0x0300, true, true)
314}
315
316/// Quick-check whether `input` is in NFD.
317#[cfg(not(feature = "quick_check_oracle"))]
318pub(crate) fn quick_check_nfd(input: &str) -> IsNormalized {
319    quick_check_impl(input, tables::CCC_QC_NFD_SHIFT, 0xC3, 0x00C0, false, false)
320}
321
322/// Quick-check whether `input` is in NFD.
323#[cfg(feature = "quick_check_oracle")]
324pub fn quick_check_nfd(input: &str) -> IsNormalized {
325    quick_check_impl(input, tables::CCC_QC_NFD_SHIFT, 0xC3, 0x00C0, false, false)
326}
327
328/// Quick-check whether `input` is in NFKC.
329#[cfg(not(feature = "quick_check_oracle"))]
330pub(crate) fn quick_check_nfkc(input: &str) -> IsNormalized {
331    quick_check_impl(input, tables::CCC_QC_NFKC_SHIFT, 0xC0, 0x00A0, true, true)
332}
333
334/// Quick-check whether `input` is in NFKC.
335#[cfg(feature = "quick_check_oracle")]
336pub fn quick_check_nfkc(input: &str) -> IsNormalized {
337    quick_check_impl(input, tables::CCC_QC_NFKC_SHIFT, 0xC0, 0x00A0, true, true)
338}
339
340/// Quick-check whether `input` is in NFKD.
341#[cfg(not(feature = "quick_check_oracle"))]
342pub(crate) fn quick_check_nfkd(input: &str) -> IsNormalized {
343    quick_check_impl(input, tables::CCC_QC_NFKD_SHIFT, 0xC0, 0x00A0, false, false)
344}
345
346/// Quick-check whether `input` is in NFKD.
347#[cfg(feature = "quick_check_oracle")]
348pub fn quick_check_nfkd(input: &str) -> IsNormalized {
349    quick_check_impl(input, tables::CCC_QC_NFKD_SHIFT, 0xC0, 0x00A0, false, false)
350}
351
352// ---------------------------------------------------------------------------
353// Oracle (slow-path) implementation for differential testing.
354// ---------------------------------------------------------------------------
355//
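// The oracle is meant to route every flagged byte through the Layer-2
// decode + range + trie path, i.e. it calls `simd::scan_chunk` (no
// safe_lead_mask) and omits the short-circuit. It exists so that
// tests/quick_check_fastpath_equivalence.rs can assert
// `quick_check_X(s) == quick_check_X_oracle(s)` for every form, 8192
// cases per form, on arbitrary Unicode input.
//
// NOTE(review): as written in this file, `quick_check_impl` also calls
// `simd::scan_chunk` and has no separate safe-lead short-circuit, so the
// oracle and the fast path currently run the same algorithm. Confirm
// whether one of them is out of date; otherwise the differential test
// cannot detect a fast-path regression.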
362
/// Oracle (slow-path) quick-check used for differential testing.
///
/// Takes the same parameters as `quick_check_impl` and follows the same shape:
/// inputs shorter than 64 bytes go to [`quick_check_scalar`]; longer inputs are
/// scanned in 64-byte chunks with `simd::scan_chunk`, and each flagged lead
/// byte is decoded and run through the known-safe range test and then the
/// fused CCC+QC trie lookup. Bytes after the last full chunk are handled by a
/// scalar tail.
///
/// Returns `No` on the first QC=No character or CCC ordering violation,
/// otherwise `Maybe` if any character was QC=Maybe, else `Yes`.
///
/// The chunk loop and the tail share one range predicate, so kana is
/// classified the same way in both.
#[cfg(feature = "quick_check_oracle")]
#[inline]
fn quick_check_impl_oracle(
    input: &str,
    qc_shift: u32,
    simd_bound: u8,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
) -> IsNormalized {
    let bytes = input.as_bytes();
    let len = bytes.len();

    if len < 64 {
        return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe);
    }

    // Known-safe code point ranges (CCC=0 and QC=Yes), shared by both loops.
    let is_known_safe = |cp: u32| -> bool {
        cp < safe_below
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (kana_safe && is_kana(cp))
            || (cp >= 0x10000 && is_supp_safe(cp))
    };

    let ptr = bytes.as_ptr();
    let mut last_ccc: u8 = 0;
    let mut result = IsNormalized::Yes;
    // Byte offset past the last character examined so far.
    let mut processed_up_to: usize = 0;
    let mut pos: usize = 0;

    while pos + 64 <= len {
        // SAFETY: pos + 64 <= len, so ptr.add(pos) is valid for 64 bytes.
        let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
        let chunk_end = pos + 64;

        if mask == 0 {
            // Nothing flagged: the whole chunk is safe, so CCC resets to 0.
            last_ccc = 0;
            processed_up_to = chunk_end;
            pos = chunk_end;
            continue;
        }

        let chunk_start = pos;
        let mut chunk_mask = mask;
        while chunk_mask != 0 {
            let bit_pos = chunk_mask.trailing_zeros() as usize;
            chunk_mask &= chunk_mask.wrapping_sub(1); // clear lowest set bit

            let byte_pos = chunk_start + bit_pos;
            // Already covered by a previous multi-byte decode.
            if byte_pos < processed_up_to {
                continue;
            }
            // Unflagged (safe) bytes sit between the last character and this one.
            if byte_pos > processed_up_to {
                last_ccc = 0;
            }

            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
            processed_up_to = byte_pos + width;

            if is_known_safe(ch as u32) {
                last_ccc = 0;
                continue;
            }

            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
            // CCC must be non-decreasing among non-zero values.
            if ccc != 0 && last_ccc > ccc {
                return IsNormalized::No;
            }
            match qc_value_to_result(qc) {
                IsNormalized::No => return IsNormalized::No,
                IsNormalized::Maybe => result = IsNormalized::Maybe,
                IsNormalized::Yes => {},
            }
            last_ccc = ccc;
        }

        // Trailing safe bytes in this chunk after the last flagged char.
        if processed_up_to < chunk_end {
            last_ccc = 0;
            processed_up_to = chunk_end;
        }
        pos = chunk_end;
    }

    // Scalar tail: same per-character logic as the chunk loop above, kept
    // inline so the oracle stays a single self-contained function.
    //
    // Every chunk iteration leaves `processed_up_to >= chunk_end`, and the loop
    // ran at least once (len >= 64), so the tail starts at `processed_up_to`.
    debug_assert!(processed_up_to >= pos);
    let mut tail_pos = processed_up_to;
    while tail_pos < len {
        let b = bytes[tail_pos];
        if b < 0x80 {
            // ASCII: CCC=0, QC=Yes for all forms.
            last_ccc = 0;
            tail_pos += 1;
            continue;
        }
        // Continuation bytes of a safe character that crossed the chunk/tail
        // boundary; last_ccc was already reset when its chunk was closed.
        if utf8::is_continuation_byte(b) {
            tail_pos += 1;
            continue;
        }
        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
        if is_known_safe(ch as u32) {
            last_ccc = 0;
            tail_pos += width;
            continue;
        }
        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        if ccc != 0 && last_ccc > ccc {
            return IsNormalized::No;
        }
        match qc_value_to_result(qc) {
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => result = IsNormalized::Maybe,
            IsNormalized::Yes => {},
        }
        last_ccc = ccc;
        tail_pos += width;
    }

    result
}
489
/// Oracle NFC quick-check. Differential-testing only.
///
/// Passes the same thresholds and flags as `quick_check_nfc`.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xCC;
    const SAFE_BELOW: u32 = 0x0300;
    quick_check_impl_oracle(input, tables::CCC_QC_NFC_SHIFT, SIMD_BOUND, SAFE_BELOW, true, true)
}
495
/// Oracle NFD quick-check. Differential-testing only.
///
/// Passes the same thresholds and flags as `quick_check_nfd`.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC3;
    const SAFE_BELOW: u32 = 0x00C0;
    quick_check_impl_oracle(input, tables::CCC_QC_NFD_SHIFT, SIMD_BOUND, SAFE_BELOW, false, false)
}
501
/// Oracle NFKC quick-check. Differential-testing only.
///
/// Passes the same thresholds and flags as `quick_check_nfkc`.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    quick_check_impl_oracle(input, tables::CCC_QC_NFKC_SHIFT, SIMD_BOUND, SAFE_BELOW, true, true)
}
507
/// Oracle NFKD quick-check. Differential-testing only.
///
/// Passes the same thresholds and flags as `quick_check_nfkd`.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    quick_check_impl_oracle(input, tables::CCC_QC_NFKD_SHIFT, SIMD_BOUND, SAFE_BELOW, false, false)
}
513
514// ---------------------------------------------------------------------------
515// Definitive is_normalized checks (resolve Maybe via full normalization)
516// ---------------------------------------------------------------------------
517//
518// These delegate to the main normalizer for the Maybe case, ensuring the
519// quick-check resolution uses the same code path as actual normalization.
520
521/// Definitive NFC check.
522pub(crate) fn is_normalized_nfc(input: &str) -> bool {
523    match quick_check_nfc(input) {
524        IsNormalized::Yes => true,
525        IsNormalized::No => false,
526        IsNormalized::Maybe => &*crate::nfc().normalize(input) == input,
527    }
528}
529
530/// Definitive NFD check.
531pub(crate) fn is_normalized_nfd(input: &str) -> bool {
532    match quick_check_nfd(input) {
533        IsNormalized::Yes => true,
534        IsNormalized::No => false,
535        IsNormalized::Maybe => &*crate::nfd().normalize(input) == input,
536    }
537}
538
539/// Definitive NFKC check.
540pub(crate) fn is_normalized_nfkc(input: &str) -> bool {
541    match quick_check_nfkc(input) {
542        IsNormalized::Yes => true,
543        IsNormalized::No => false,
544        IsNormalized::Maybe => &*crate::nfkc().normalize(input) == input,
545    }
546}
547
548/// Definitive NFKD check.
549pub(crate) fn is_normalized_nfkd(input: &str) -> bool {
550    match quick_check_nfkd(input) {
551        IsNormalized::Yes => true,
552        IsNormalized::No => false,
553        IsNormalized::Maybe => &*crate::nfkd().normalize(input) == input,
554    }
555}
556
/// Unit tests for the quick-check entry points and the definitive
/// `is_normalized_*` helpers, covering both the short scalar path and the
/// >= 64-byte chunked path.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;
    use alloc::string::String;

    // ---- ASCII fast path ----

    #[test]
    fn ascii_is_nfc() {
        assert_eq!(quick_check_nfc("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfd() {
        assert_eq!(quick_check_nfd("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkc() {
        assert_eq!(quick_check_nfkc("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkd() {
        assert_eq!(quick_check_nfkd("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn empty_string_is_normalized() {
        assert_eq!(quick_check_nfc(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(""), IsNormalized::Yes);
    }

    // ---- NFC checks ----

    #[test]
    fn precomposed_is_nfc_yes() {
        assert_eq!(quick_check_nfc("\u{00E9}"), IsNormalized::Yes);
    }

    #[test]
    fn decomposed_is_not_nfc() {
        let nfd = "e\u{0301}";
        let result = quick_check_nfc(nfd);
        assert!(
            result == IsNormalized::No || result == IsNormalized::Maybe,
            "NFD form must not be Yes for NFC, got {:?}",
            result,
        );
    }

    // ---- NFD checks ----

    #[test]
    fn precomposed_is_not_nfd() {
        assert_eq!(quick_check_nfd("\u{00E9}"), IsNormalized::No);
    }

    // ---- CCC ordering ----

    #[test]
    fn wrong_ccc_order_is_no() {
        let bad_order = "a\u{0301}\u{0327}"; // acute(230) then cedilla(202)
        assert_eq!(quick_check_nfc(bad_order), IsNormalized::No);
        assert_eq!(quick_check_nfd(bad_order), IsNormalized::No);
    }

    #[test]
    fn correct_ccc_order_not_rejected() {
        // Use Hebrew accents which are NFC_QC=Yes but have non-zero CCC.
        // U+0591 HEBREW ACCENT ETNAHTA (CCC=220), U+05A1 HEBREW ACCENT PAZER (CCC=230)
        let good_order = "a\u{0591}\u{05A1}";
        let result = quick_check_nfc(good_order);
        assert_ne!(result, IsNormalized::No);
    }

    // ---- Range fast path tests ----

    #[test]
    fn latin1_supplement_is_nfc() {
        // U+00C0..U+00FF are all NFC_QC=Yes
        let latin1 = "\u{00C0}\u{00E9}\u{00F6}\u{00FC}\u{00FF}";
        assert_eq!(quick_check_nfc(latin1), IsNormalized::Yes);
    }

    #[test]
    fn latin_extended_is_nfc() {
        // U+0100..U+02FF are all NFC_QC=Yes
        let extended = "\u{0100}\u{017E}\u{0250}\u{02FF}";
        assert_eq!(quick_check_nfc(extended), IsNormalized::Yes);
    }

    #[test]
    fn cjk_is_nfc() {
        let cjk = "\u{4E00}\u{9FFF}\u{3400}\u{4DBF}";
        assert_eq!(quick_check_nfc(cjk), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_nfc() {
        let hangul = "\u{AC00}\u{D7A3}";
        assert_eq!(quick_check_nfc(hangul), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_not_nfd() {
        let hangul = "\u{AC00}";
        assert_eq!(quick_check_nfd(hangul), IsNormalized::No);
    }

    #[test]
    fn latin1_is_not_nfd() {
        // U+00C0 decomposes in NFD
        assert_eq!(quick_check_nfd("\u{00C0}"), IsNormalized::No);
    }

    #[test]
    fn nbsp_is_not_nfkc() {
        // U+00A0 (NBSP) → U+0020 (SPACE) in NFKC
        assert_eq!(quick_check_nfkc("\u{00A0}"), IsNormalized::No);
    }

    #[test]
    fn kana_is_nfc_and_nfkc_in_chunk_and_tail() {
        // 30 x U+3042 HIRAGANA LETTER A (3 bytes each) = 90 bytes: one full
        // 64-byte chunk, one character straddling the chunk boundary
        // (bytes 63..66), and a kana-only scalar tail.
        let s: String = "\u{3042}".repeat(30);
        assert_eq!(s.len(), 90);
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);

        // Kana that lives only in the scalar tail, after an all-ASCII chunk.
        let s = format!("{}{}", "x".repeat(64), "\u{3042}\u{30AB}");
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
    }

    // ---- is_normalized definitive checks ----

    #[test]
    fn is_normalized_nfc_ascii() {
        assert!(is_normalized_nfc("Hello"));
    }

    #[test]
    fn is_normalized_nfc_precomposed() {
        assert!(is_normalized_nfc("\u{00E9}"));
    }

    #[test]
    fn is_normalized_nfd_decomposed() {
        assert!(is_normalized_nfd("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfc_rejects_nfd() {
        assert!(!is_normalized_nfc("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfd_rejects_nfc() {
        assert!(!is_normalized_nfd("\u{00E9}"));
    }

    #[test]
    fn safe_lead_interleaved_with_combining_marks_across_chunk() {
        // 128 bytes spanning two SIMD chunks.
        // Pattern: CJK ideograph (3 bytes, lead 0xE4..=0xE9, safe-lead) +
        //          'a' (1 byte, ASCII) +
        //          U+0591 HEBREW ACCENT ETNAHTA (CCC=220, NFC_QC=Yes, lead 0xD6 -> decode path).
        // The U+0591 must be observed after a safe-lead reset of last_ccc so that
        // the *next* non-zero CCC mark is accepted as non-decreasing.
        //
        // 16 repetitions of (CJK=3 + 'a'=1 + U+0591=2 + 'b'=1 + 'b'=1) = 16 * 8 = 128 bytes.
        let unit = "\u{4E2D}a\u{0591}bb";
        let s: String = unit.repeat(16);
        assert_eq!(s.len(), 128);
        // All code points are NFC_QC=Yes with monotonic (or zero) CCC, so NFC=Yes.
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        // NFD: U+0591 is NFD_QC=Yes, CJK Unified is NFD_QC=Yes, ASCII is safe.
        assert_eq!(quick_check_nfd(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(&s), IsNormalized::Yes);
    }

    #[test]
    fn safe_lead_then_out_of_order_combining_is_no() {
        // Regression: a CJK ideograph (CCC=0, range fast path) followed by an
        // ASCII letter must leave last_ccc at 0, so the mis-ordered pair that
        // follows is rejected on its own ordering.
        // U+0301 ACUTE (CCC=230), U+0327 CEDILLA (CCC=202).
        let unit = "\u{4E2D}a\u{0301}\u{0327}"; // bad order after safe-lead + ASCII
        let padding = "x".repeat(64); // force >= 64-byte path

        // Unit after the padding: the only full chunk is pure ASCII, so the
        // mis-ordered marks are seen by the scalar tail.
        let s = format!("{}{}", padding, unit);
        assert!(s.len() >= 64);
        assert_eq!(quick_check_nfc(&s), IsNormalized::No);

        // Unit before the padding: the mis-ordered marks sit inside the first
        // 64-byte chunk and are seen by the chunk loop.
        let s = format!("{}{}", unit, padding);
        assert!(s.len() >= 64);
        assert_eq!(quick_check_nfc(&s), IsNormalized::No);
    }

    #[cfg(feature = "quick_check_oracle")]
    #[test]
    fn oracle_matches_fastpath_on_fixed_input() {
        let s = "\u{4E2D}a\u{0591}bb".repeat(16);
        assert_eq!(quick_check_nfc(&s), super::quick_check_nfc_oracle(&s));
        assert_eq!(quick_check_nfd(&s), super::quick_check_nfd_oracle(&s));
        assert_eq!(quick_check_nfkc(&s), super::quick_check_nfkc_oracle(&s));
        assert_eq!(quick_check_nfkd(&s), super::quick_check_nfkd_oracle(&s));
    }
}
753}