simd_normalizer/
quick_check.rs

1// src/quick_check.rs
2
3//! Quick-check for normalization forms (UAX#15 Section 9).
4//!
5//! Uses SIMD scanning to skip safe chunks in bulk for inputs >= 64 bytes.
6//! Form-specific SIMD bounds and code-point range fast paths avoid trie
7//! lookups for the vast majority of BMP characters.
8
9use crate::simd;
10use crate::tables;
11use crate::utf8;
12
/// Outcome of a normalization quick-check.
///
/// `Yes` and `No` are definitive answers; `Maybe` means the quick-check could
/// not decide and the caller has to run a full check.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The input is certainly in the target normalization form.
    Yes,
    /// The input is certainly *not* in the target normalization form.
    No,
    /// The input *may* be unnormalized; only a full check can tell.
    Maybe,
}
23
24/// Convert a QC trie value (0=Y, 1=M, 2=N) to IsNormalized.
25#[inline]
26fn qc_value_to_result(v: u8) -> IsNormalized {
27    match v {
28        0 => IsNormalized::Yes,
29        1 => IsNormalized::Maybe,
30        _ => IsNormalized::No,
31    }
32}
33
/// Returns `true` for BMP CJK Unified Ideographs, which are treated as
/// CCC=0 and QC=Yes under every normalization form.
///
/// Covered blocks:
/// - CJK Unified Ideographs Extension A (U+3400..=U+4DBF)
/// - CJK Unified Ideographs (U+4E00..=U+9FFF), the most common case
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x4E00..=0x9FFF | 0x3400..=0x4DBF)
}
40
/// Decide whether a supplementary code point (`cp >= 0x10000`) can skip the
/// trie lookup for every normalization form (CCC=0 and QC=Yes).
///
/// The answer is `false` for anything outside the ranges listed here, which
/// may carry decompositions or a non-zero CCC:
/// - Plane 1: only the core emoji and symbols run U+1F252..=U+1FBEF is
///   accepted (no decompositions, CCC=0 for all forms).
/// - Plane 2 and above: everything is accepted except the CJK Compatibility
///   Ideographs Supplement, U+2F800..=U+2FA1F.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        // Plane 1 emoji/symbols run.
        0x1F252..=0x1FBEF => true,
        // CJK Compatibility Ideographs Supplement: never safe.
        0x2F800..=0x2FA1F => false,
        // Remaining code points are safe only from Plane 2 upwards.
        _ => cp >= 0x20000,
    }
}
54
/// Returns `true` for Hiragana/Katakana code points that are CCC=0 and
/// QC=Yes under NFC/NFKC.
///
/// Deliberately left out:
/// - U+3099..=U+309A, combining marks with CCC > 0
/// - U+309B..=U+309C, which decompose under NFKC
/// - U+309F (ゟ) and U+30FF (ヿ)
#[inline(always)]
fn is_kana(cp: u32) -> bool {
    matches!(
        cp,
        // Hiragana base letters
        0x3041..=0x3098
            // Hiragana iteration marks
            | 0x309D..=0x309E
            // Katakana
            | 0x30A0..=0x30FE
    )
}
68
69/// Generic quick-check implementation.
70///
71/// For inputs >= 64 bytes, uses SIMD scanning to skip chunks in bulk.
72/// For shorter inputs, falls back to a scalar character-by-character loop.
73/// Returns as soon as a definitive No is found.
74///
75/// # Parameters
76/// - `qc_shift`: bit shift to extract this form's 2-bit QC from the fused CCC+QC trie.
77/// - `simd_bound`: SIMD scan threshold; bytes below this are skipped in bulk.
78///   For NFC this is 0xCC (all chars below U+0300 are safe), for other forms 0xC0.
79/// - `safe_below`: code point below which CCC=0 and QC=Yes is guaranteed.
80/// - `hangul_safe`: whether Hangul Syllables (U+AC00..U+D7A3) are QC=Yes for this form.
81/// - `kana_safe`: whether Hiragana/Katakana (U+3040..U+30FF) are QC=Yes for this form.
82/// - `latin1_upper_safe`: whether U+00C0..U+00FF (precomposed Latin Supplement upper)
83///   is uniformly CCC=0 and QC=Yes. True for NFC/NFKC (precomposed forms are kept
84///   precomposed under both compose-normalizing forms). False for NFD/NFKD where
85///   they all decompose. NFC already has `simd_bound=0xCC` which skips this range
86///   in bulk; the flag is consulted on the bit-walk for forms whose `simd_bound`
87///   does flag bytes in the U+00C0..U+00FF lead-byte range.
88#[inline]
89fn quick_check_impl(
90    input: &str,
91    qc_shift: u32,
92    simd_bound: u8,
93    safe_below: u32,
94    hangul_safe: bool,
95    kana_safe: bool,
96    latin1_upper_safe: bool,
97) -> IsNormalized {
98    let bytes = input.as_bytes();
99    let len = bytes.len();
100
101    if len < 64 {
102        return quick_check_scalar(
103            input,
104            qc_shift,
105            safe_below,
106            hangul_safe,
107            kana_safe,
108            latin1_upper_safe,
109        );
110    }
111
112    let ptr = bytes.as_ptr();
113
114    let mut last_ccc: u8 = 0;
115    let mut result = IsNormalized::Yes;
116    // Byte offset past the last character we've examined.
117    let mut processed_up_to: usize = 0;
118    let mut pos: usize = 0;
119
120    // SIMD chunk loop: skip chunks where all bytes < simd_bound in bulk.
121    while pos + 64 <= len {
122        // SAFETY: pos + 64 <= len, so ptr.add(pos) is valid for 64 bytes.
123        let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
124        let chunk_end = pos + 64;
125
126        if mask == 0 {
127            // All bytes < simd_bound — characters in this chunk are either ASCII
128            // or known-safe non-ASCII (CCC=0, QC=Yes). CCC resets to 0.
129            last_ccc = 0;
130            processed_up_to = chunk_end;
131            pos = chunk_end;
132            continue;
133        }
134
135        // Walk set bits — each is a lead byte of a character that needs inspection.
136        let chunk_start = pos;
137        let mut chunk_mask = mask;
138        while chunk_mask != 0 {
139            let bit_pos = chunk_mask.trailing_zeros() as usize;
140            chunk_mask &= chunk_mask.wrapping_sub(1); // clear lowest set bit
141
142            let byte_pos = chunk_start + bit_pos;
143
144            // Skip bytes already covered by a previous multi-byte decode.
145            if byte_pos < processed_up_to {
146                continue;
147            }
148
149            // Gap before this lead byte → safe characters → CCC resets to 0.
150            if byte_pos > processed_up_to {
151                last_ccc = 0;
152            }
153
154            // Decode the character at this position.
155            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
156            processed_up_to = byte_pos + width;
157
158            // Fast path: known-safe code point ranges (CCC=0 and QC=Yes).
159            let cp = ch as u32;
160            if cp < safe_below
161                || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
162                || is_cjk_unified(cp)
163                || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
164                || (kana_safe && is_kana(cp))
165                || (cp >= 0x10000 && is_supp_safe(cp))
166            {
167                last_ccc = 0;
168                continue;
169            }
170
171            // Fused CCC + QC lookup (single trie access).
172            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
173            if ccc != 0 && last_ccc > ccc {
174                return IsNormalized::No;
175            }
176
177            // Check QC property.
178            match qc_value_to_result(qc) {
179                IsNormalized::No => return IsNormalized::No,
180                IsNormalized::Maybe => result = IsNormalized::Maybe,
181                IsNormalized::Yes => {},
182            }
183
184            last_ccc = ccc;
185        }
186
187        // Trailing safe bytes in this chunk after the last flagged char.
188        if processed_up_to < chunk_end {
189            last_ccc = 0;
190            processed_up_to = chunk_end;
191        }
192
193        pos = chunk_end;
194    }
195
196    // Scalar tail for remaining bytes after the last full 64-byte chunk.
197    let tail_start = processed_up_to.max(pos);
198    if tail_start > processed_up_to {
199        // Gap of safe characters between last processed char and tail start.
200        last_ccc = 0;
201    }
202    let mut tail_pos = tail_start;
203    while tail_pos < len {
204        let b = bytes[tail_pos];
205        if b < 0x80 {
206            // ASCII: CCC=0, QC=Yes for all forms.
207            last_ccc = 0;
208            tail_pos += 1;
209            continue;
210        }
211        // Skip continuation bytes from a character that crossed the chunk/tail
212        // boundary. Its lead byte was < simd_bound, so it is safe (CCC=0, QC=Yes).
213        if utf8::is_continuation_byte(b) {
214            tail_pos += 1;
215            continue;
216        }
217        // Lead byte of a non-ASCII character.
218        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
219
220        // Fast path: known-safe code point ranges.
221        let cp = ch as u32;
222        if cp < safe_below
223            || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
224            || is_cjk_unified(cp)
225            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
226            || (cp >= 0x10000 && is_supp_safe(cp))
227        {
228            last_ccc = 0;
229            tail_pos += width;
230            continue;
231        }
232
233        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
234        if ccc != 0 && last_ccc > ccc {
235            return IsNormalized::No;
236        }
237        match qc_value_to_result(qc) {
238            IsNormalized::No => return IsNormalized::No,
239            IsNormalized::Maybe => result = IsNormalized::Maybe,
240            IsNormalized::Yes => {},
241        }
242        last_ccc = ccc;
243        tail_pos += width;
244    }
245
246    result
247}
248
249/// Scalar quick-check for short inputs (< 64 bytes).
250#[inline]
251fn quick_check_scalar(
252    input: &str,
253    qc_shift: u32,
254    safe_below: u32,
255    hangul_safe: bool,
256    kana_safe: bool,
257    latin1_upper_safe: bool,
258) -> IsNormalized {
259    let mut last_ccc: u8 = 0;
260    let mut result = IsNormalized::Yes;
261
262    for ch in input.chars() {
263        let cp = ch as u32;
264
265        // ASCII fast path
266        if cp <= 0x7F {
267            last_ccc = 0;
268            continue;
269        }
270
271        // Fast path: known-safe code point ranges (CCC=0 and QC=Yes).
272        if cp < safe_below
273            || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
274            || is_cjk_unified(cp)
275            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
276            || (kana_safe && is_kana(cp))
277            || (cp >= 0x10000 && is_supp_safe(cp))
278        {
279            last_ccc = 0;
280            continue;
281        }
282
283        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
284
285        // CCC must be non-decreasing among non-zero values.
286        if ccc != 0 && last_ccc > ccc {
287            return IsNormalized::No;
288        }
289
290        match qc_value_to_result(qc) {
291            IsNormalized::No => return IsNormalized::No,
292            IsNormalized::Maybe => result = IsNormalized::Maybe,
293            IsNormalized::Yes => {},
294        }
295
296        last_ccc = ccc;
297    }
298
299    result
300}
301
302// ---------------------------------------------------------------------------
303// SIMD bound and safe-below thresholds by normalization form
304// ---------------------------------------------------------------------------
305//
// NFC:  simd_bound=0xCC, safe_below=0x0300, hangul_safe=true, kana_safe=true,
//       latin1_upper_safe=true
//       All chars U+0000..U+02FF have CCC=0 and NFC_QC=Yes.
//       The first CCC != 0 is U+0300 (lead byte 0xCC).
//       CJK Unified, Hangul Syllables, and Hiragana/Katakana are NFC-safe.
//
// NFD:  simd_bound=0xC3, safe_below=0x00C0, hangul_safe=false, kana_safe=false,
//       latin1_upper_safe=false
//       U+00C0 is first NFD_QC=No (lead byte 0xC3).
//       Hangul Syllables and some kana have NFD_QC=No (they decompose).
//
// NFKC: simd_bound=0xC0, safe_below=0x00A0, hangul_safe=true, kana_safe=true,
//       latin1_upper_safe=true
//       U+00A0 is first NFKC_QC=No (NBSP → SPACE).
//       Kana are NFKC-safe (only halfwidth/enclosed forms decompose, in other blocks).
//
// NFKD: simd_bound=0xC0, safe_below=0x00A0, hangul_safe=false, kana_safe=false,
//       latin1_upper_safe=false
//       Same as NFKC threshold, but Hangul and some kana decompose.
321
322/// Quick-check whether `input` is in NFC.
323#[cfg(not(feature = "quick_check_oracle"))]
324pub(crate) fn quick_check_nfc(input: &str) -> IsNormalized {
325    quick_check_impl(
326        input,
327        tables::CCC_QC_NFC_SHIFT,
328        0xCC,
329        0x0300,
330        true,
331        true,
332        true,
333    )
334}
335
336/// Quick-check whether `input` is in NFC.
337#[cfg(feature = "quick_check_oracle")]
338pub fn quick_check_nfc(input: &str) -> IsNormalized {
339    quick_check_impl(
340        input,
341        tables::CCC_QC_NFC_SHIFT,
342        0xCC,
343        0x0300,
344        true,
345        true,
346        true,
347    )
348}
349
350/// Quick-check whether `input` is in NFD.
351#[cfg(not(feature = "quick_check_oracle"))]
352pub(crate) fn quick_check_nfd(input: &str) -> IsNormalized {
353    quick_check_impl(
354        input,
355        tables::CCC_QC_NFD_SHIFT,
356        0xC3,
357        0x00C0,
358        false,
359        false,
360        false,
361    )
362}
363
364/// Quick-check whether `input` is in NFD.
365#[cfg(feature = "quick_check_oracle")]
366pub fn quick_check_nfd(input: &str) -> IsNormalized {
367    quick_check_impl(
368        input,
369        tables::CCC_QC_NFD_SHIFT,
370        0xC3,
371        0x00C0,
372        false,
373        false,
374        false,
375    )
376}
377
378/// Quick-check whether `input` is in NFKC.
379#[cfg(not(feature = "quick_check_oracle"))]
380pub(crate) fn quick_check_nfkc(input: &str) -> IsNormalized {
381    quick_check_impl(
382        input,
383        tables::CCC_QC_NFKC_SHIFT,
384        0xC0,
385        0x00A0,
386        true,
387        true,
388        true,
389    )
390}
391
392/// Quick-check whether `input` is in NFKC.
393#[cfg(feature = "quick_check_oracle")]
394pub fn quick_check_nfkc(input: &str) -> IsNormalized {
395    quick_check_impl(
396        input,
397        tables::CCC_QC_NFKC_SHIFT,
398        0xC0,
399        0x00A0,
400        true,
401        true,
402        true,
403    )
404}
405
406/// Quick-check whether `input` is in NFKD.
407#[cfg(not(feature = "quick_check_oracle"))]
408pub(crate) fn quick_check_nfkd(input: &str) -> IsNormalized {
409    quick_check_impl(
410        input,
411        tables::CCC_QC_NFKD_SHIFT,
412        0xC0,
413        0x00A0,
414        false,
415        false,
416        false,
417    )
418}
419
420/// Quick-check whether `input` is in NFKD.
421#[cfg(feature = "quick_check_oracle")]
422pub fn quick_check_nfkd(input: &str) -> IsNormalized {
423    quick_check_impl(
424        input,
425        tables::CCC_QC_NFKD_SHIFT,
426        0xC0,
427        0x00A0,
428        false,
429        false,
430        false,
431    )
432}
433
434// ---------------------------------------------------------------------------
435// Oracle (slow-path) implementation for differential testing.
436// ---------------------------------------------------------------------------
437//
438// The oracle deliberately routes every flagged byte through the Layer-2
439// decode + range + trie path, i.e. it calls `simd::scan_chunk` (no
440// safe_lead_mask) and omits the short-circuit. It exists so that
441// tests/quick_check_fastpath_equivalence.rs can assert
442// `quick_check_X(s) == quick_check_X_oracle(s)` for every form, 8192
443// cases per form, on arbitrary Unicode input.
444
/// Reference ("oracle") quick-check used for differential testing.
///
/// Has the same overall shape as `quick_check_impl` (64-byte chunk loop
/// followed by a scalar tail) but takes no `latin1_upper_safe` flag: the
/// U+00C0..U+00FF range short-circuit is never applied here, and the short
/// input path calls `quick_check_scalar` with that flag set to `false`.
///
/// # Parameters
/// - `input`: the string to test.
/// - `qc_shift`: shift passed to `tables::lookup_ccc_qc` to select the form's QC bits.
/// - `simd_bound`: threshold passed to `simd::scan_chunk`.
/// - `safe_below`: code points below this value skip the trie lookup.
/// - `hangul_safe`: whether U+AC00..=U+D7A3 skip the trie lookup.
/// - `kana_safe`: whether `is_kana` code points skip the trie lookup
///   (consulted in the chunk loop and the short-input path).
///
/// # Returns
/// `No` on the first QC=No code point or out-of-order non-zero CCC, `Maybe`
/// if any inspected code point was QC=Maybe, `Yes` otherwise.
#[cfg(feature = "quick_check_oracle")]
#[inline]
fn quick_check_impl_oracle(
    input: &str,
    qc_shift: u32,
    simd_bound: u8,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
) -> IsNormalized {
    let bytes = input.as_bytes();
    let len = bytes.len();

    // Short inputs: scalar path with the Latin-1 upper fast path disabled.
    if len < 64 {
        return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe, false);
    }

    let ptr = bytes.as_ptr();
    // CCC of the previously inspected character (0 after any safe character).
    let mut last_ccc: u8 = 0;
    let mut result = IsNormalized::Yes;
    // Byte offset just past the last character that has been examined.
    let mut processed_up_to: usize = 0;
    let mut pos: usize = 0;

    // Chunk loop: one 64-byte window per iteration.
    while pos + 64 <= len {
        // SAFETY: pos + 64 <= len, so ptr.add(pos) is valid for 64 bytes.
        let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
        let chunk_end = pos + 64;

        // No flagged bytes: the whole chunk is skipped and the CCC state resets.
        if mask == 0 {
            last_ccc = 0;
            processed_up_to = chunk_end;
            pos = chunk_end;
            continue;
        }

        // Visit each set bit of the mask from lowest to highest.
        let chunk_start = pos;
        let mut chunk_mask = mask;
        while chunk_mask != 0 {
            let bit_pos = chunk_mask.trailing_zeros() as usize;
            // Clear the lowest set bit.
            chunk_mask &= chunk_mask.wrapping_sub(1);

            let byte_pos = chunk_start + bit_pos;
            // Already consumed as part of an earlier multi-byte decode.
            if byte_pos < processed_up_to {
                continue;
            }
            // Unflagged bytes lie between the previous character and this
            // one, so the CCC run is broken.
            if byte_pos > processed_up_to {
                last_ccc = 0;
            }

            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
            processed_up_to = byte_pos + width;

            // Range fast path (no Latin-1 upper check in the oracle).
            let cp = ch as u32;
            if cp < safe_below
                || is_cjk_unified(cp)
                || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
                || (kana_safe && is_kana(cp))
                || (cp >= 0x10000 && is_supp_safe(cp))
            {
                last_ccc = 0;
                continue;
            }

            // Fused CCC + QC lookup.
            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
            // A non-zero CCC smaller than its predecessor is a definitive No.
            if ccc != 0 && last_ccc > ccc {
                return IsNormalized::No;
            }
            match qc_value_to_result(qc) {
                IsNormalized::No => return IsNormalized::No,
                IsNormalized::Maybe => result = IsNormalized::Maybe,
                IsNormalized::Yes => {},
            }
            last_ccc = ccc;
        }

        // Unflagged bytes remain after the last inspected character in this
        // chunk: reset the CCC state and mark the chunk as fully processed.
        if processed_up_to < chunk_end {
            last_ccc = 0;
            processed_up_to = chunk_end;
        }
        pos = chunk_end;
    }

    // Scalar tail for the bytes after the last full 64-byte chunk. Kept inline
    // so the oracle stays a single self-contained function. The range fast
    // path here covers `safe_below`, CJK Unified, Hangul (when `hangul_safe`)
    // and safe supplementary code points; everything else goes to the trie.
    let tail_start = processed_up_to.max(pos);
    if tail_start > processed_up_to {
        last_ccc = 0;
    }
    let mut tail_pos = tail_start;
    while tail_pos < len {
        let b = bytes[tail_pos];
        // ASCII byte: resets the CCC state.
        if b < 0x80 {
            last_ccc = 0;
            tail_pos += 1;
            continue;
        }
        // Continuation byte of a character whose lead byte was before the
        // tail start: step over it.
        if utf8::is_continuation_byte(b) {
            tail_pos += 1;
            continue;
        }
        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
        let cp = ch as u32;
        if cp < safe_below
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (cp >= 0x10000 && is_supp_safe(cp))
        {
            last_ccc = 0;
            tail_pos += width;
            continue;
        }
        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        if ccc != 0 && last_ccc > ccc {
            return IsNormalized::No;
        }
        match qc_value_to_result(qc) {
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => result = IsNormalized::Maybe,
            IsNormalized::Yes => {},
        }
        last_ccc = ccc;
        tail_pos += width;
    }

    result
}
571
/// NFC quick-check via the oracle path. Intended for differential testing only.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xCC;
    const SAFE_BELOW: u32 = 0x0300;
    const HANGUL_SAFE: bool = true;
    const KANA_SAFE: bool = true;

    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
577
/// NFD quick-check via the oracle path. Intended for differential testing only.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC3;
    const SAFE_BELOW: u32 = 0x00C0;
    const HANGUL_SAFE: bool = false;
    const KANA_SAFE: bool = false;

    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
583
/// NFKC quick-check via the oracle path. Intended for differential testing only.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    const HANGUL_SAFE: bool = true;
    const KANA_SAFE: bool = true;

    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
589
/// NFKD quick-check via the oracle path. Intended for differential testing only.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    const HANGUL_SAFE: bool = false;
    const KANA_SAFE: bool = false;

    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
595
596// ---------------------------------------------------------------------------
597// Definitive is_normalized checks (resolve Maybe via full normalization)
598// ---------------------------------------------------------------------------
599//
600// These delegate to the main normalizer for the Maybe case, ensuring the
601// quick-check resolution uses the same code path as actual normalization.
602
603/// Definitive NFC check.
604pub(crate) fn is_normalized_nfc(input: &str) -> bool {
605    match quick_check_nfc(input) {
606        IsNormalized::Yes => true,
607        IsNormalized::No => false,
608        IsNormalized::Maybe => &*crate::nfc().normalize(input) == input,
609    }
610}
611
612/// Definitive NFD check.
613pub(crate) fn is_normalized_nfd(input: &str) -> bool {
614    match quick_check_nfd(input) {
615        IsNormalized::Yes => true,
616        IsNormalized::No => false,
617        IsNormalized::Maybe => &*crate::nfd().normalize(input) == input,
618    }
619}
620
621/// Definitive NFKC check.
622pub(crate) fn is_normalized_nfkc(input: &str) -> bool {
623    match quick_check_nfkc(input) {
624        IsNormalized::Yes => true,
625        IsNormalized::No => false,
626        IsNormalized::Maybe => &*crate::nfkc().normalize(input) == input,
627    }
628}
629
630/// Definitive NFKD check.
631pub(crate) fn is_normalized_nfkd(input: &str) -> bool {
632    match quick_check_nfkd(input) {
633        IsNormalized::Yes => true,
634        IsNormalized::No => false,
635        IsNormalized::Maybe => &*crate::nfkd().normalize(input) == input,
636    }
637}
638
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;
    use alloc::string::String;

    /// Pure-ASCII sample shared by the per-form ASCII tests.
    const ASCII_SAMPLE: &str = "Hello, world!";

    // ---- ASCII fast path ----

    #[test]
    fn ascii_is_nfc() {
        let verdict = quick_check_nfc(ASCII_SAMPLE);
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfd() {
        let verdict = quick_check_nfd(ASCII_SAMPLE);
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkc() {
        let verdict = quick_check_nfkc(ASCII_SAMPLE);
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkd() {
        let verdict = quick_check_nfkd(ASCII_SAMPLE);
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn empty_string_is_normalized() {
        // The empty string is normalized under every form.
        let checks: [fn(&str) -> IsNormalized; 4] = [
            quick_check_nfc,
            quick_check_nfd,
            quick_check_nfkc,
            quick_check_nfkd,
        ];
        for check in checks.iter() {
            assert_eq!((*check)(""), IsNormalized::Yes);
        }
    }

    // ---- NFC checks ----

    #[test]
    fn precomposed_is_nfc_yes() {
        // U+00E9 (precomposed e-acute) is already NFC.
        assert_eq!(quick_check_nfc("\u{00E9}"), IsNormalized::Yes);
    }

    #[test]
    fn decomposed_is_not_nfc() {
        // 'e' followed by U+0301 COMBINING ACUTE ACCENT.
        let verdict = quick_check_nfc("e\u{0301}");
        assert!(
            matches!(verdict, IsNormalized::No | IsNormalized::Maybe),
            "NFD form must not be Yes for NFC, got {:?}",
            verdict,
        );
    }

    // ---- NFD checks ----

    #[test]
    fn precomposed_is_not_nfd() {
        let verdict = quick_check_nfd("\u{00E9}");
        assert_eq!(verdict, IsNormalized::No);
    }

    // ---- CCC ordering ----

    #[test]
    fn wrong_ccc_order_is_no() {
        // U+0301 acute (CCC=230) placed before U+0327 cedilla (CCC=202).
        let misordered = "a\u{0301}\u{0327}";
        let nfc = quick_check_nfc(misordered);
        let nfd = quick_check_nfd(misordered);
        assert_eq!(nfc, IsNormalized::No);
        assert_eq!(nfd, IsNormalized::No);
    }

    #[test]
    fn correct_ccc_order_not_rejected() {
        // Hebrew accents are NFC_QC=Yes but carry a non-zero CCC:
        // U+0591 HEBREW ACCENT ETNAHTA (CCC=220), then
        // U+05A1 HEBREW ACCENT PAZER (CCC=230).
        assert_ne!(quick_check_nfc("a\u{0591}\u{05A1}"), IsNormalized::No);
    }

    // ---- Range fast path tests ----

    #[test]
    fn latin1_supplement_is_nfc() {
        // Every code point in U+00C0..U+00FF is NFC_QC=Yes.
        let verdict = quick_check_nfc("\u{00C0}\u{00E9}\u{00F6}\u{00FC}\u{00FF}");
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn latin_extended_is_nfc() {
        // Every code point in U+0100..U+02FF is NFC_QC=Yes.
        let verdict = quick_check_nfc("\u{0100}\u{017E}\u{0250}\u{02FF}");
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn cjk_is_nfc() {
        // First/last code points of CJK Unified and of Extension A.
        let verdict = quick_check_nfc("\u{4E00}\u{9FFF}\u{3400}\u{4DBF}");
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_nfc() {
        // First and last Hangul syllables.
        let verdict = quick_check_nfc("\u{AC00}\u{D7A3}");
        assert_eq!(verdict, IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_not_nfd() {
        let verdict = quick_check_nfd("\u{AC00}");
        assert_eq!(verdict, IsNormalized::No);
    }

    #[test]
    fn latin1_is_not_nfd() {
        // U+00C0 decomposes under NFD.
        let verdict = quick_check_nfd("\u{00C0}");
        assert_eq!(verdict, IsNormalized::No);
    }

    #[test]
    fn nbsp_is_not_nfkc() {
        // U+00A0 (NBSP) maps to U+0020 (SPACE) under NFKC.
        let verdict = quick_check_nfkc("\u{00A0}");
        assert_eq!(verdict, IsNormalized::No);
    }

    // ---- is_normalized definitive checks ----

    #[test]
    fn is_normalized_nfc_ascii() {
        let ok = is_normalized_nfc("Hello");
        assert!(ok);
    }

    #[test]
    fn is_normalized_nfc_precomposed() {
        let ok = is_normalized_nfc("\u{00E9}");
        assert!(ok);
    }

    #[test]
    fn is_normalized_nfd_decomposed() {
        let ok = is_normalized_nfd("e\u{0301}");
        assert!(ok);
    }

    #[test]
    fn is_normalized_nfc_rejects_nfd() {
        let ok = is_normalized_nfc("e\u{0301}");
        assert!(!ok);
    }

    #[test]
    fn is_normalized_nfd_rejects_nfc() {
        let ok = is_normalized_nfd("\u{00E9}");
        assert!(!ok);
    }

    #[test]
    fn safe_lead_interleaved_with_combining_marks_across_chunk() {
        // Builds exactly 128 bytes, i.e. two full SIMD chunks, from 16 copies
        // of an 8-byte unit:
        //   U+4E2D CJK ideograph (3 bytes, range fast path)
        //   'a'                  (1 byte, ASCII)
        //   U+0591 ETNAHTA       (2 bytes, CCC=220, NFC_QC=Yes, trie path)
        //   'b' 'b'              (2 bytes, ASCII)
        // Each U+0591 follows characters that reset last_ccc, so no CCC
        // ordering violation may be reported.
        let s: String = "\u{4E2D}a\u{0591}bb".repeat(16);
        assert_eq!(s.len(), 128);

        // Every code point is QC=Yes with zero or non-decreasing CCC, so all
        // four forms must answer Yes.
        let checks: [fn(&str) -> IsNormalized; 4] = [
            quick_check_nfc,
            quick_check_nfd,
            quick_check_nfkc,
            quick_check_nfkd,
        ];
        for check in checks.iter() {
            assert_eq!((*check)(s.as_str()), IsNormalized::Yes);
        }
    }

    #[test]
    fn safe_lead_then_out_of_order_combining_is_no() {
        // Regression guard for the CCC state around the range fast path.
        // After 64 bytes of ASCII padding (forcing the >= 64-byte path) comes
        // a CJK ideograph, an ASCII 'a', and then a mis-ordered combining
        // pair: U+0301 ACUTE (CCC=230) followed by U+0327 CEDILLA (CCC=202).
        // The mis-ordered pair must still be reported as No.
        let unit = "\u{4E2D}a\u{0301}\u{0327}";
        let padding = "x".repeat(64);
        let s = format!("{}{}", padding, unit);
        assert!(s.len() >= 64);

        let verdict = quick_check_nfc(&s);
        assert_eq!(verdict, IsNormalized::No);
    }

    #[cfg(feature = "quick_check_oracle")]
    #[test]
    fn oracle_matches_fastpath_on_fixed_input() {
        // Same 128-byte input as the interleaving test above; fast path and
        // oracle must agree for every form.
        let s = "\u{4E2D}a\u{0591}bb".repeat(16);
        assert_eq!(quick_check_nfc(&s), super::quick_check_nfc_oracle(&s));
        assert_eq!(quick_check_nfd(&s), super::quick_check_nfd_oracle(&s));
        assert_eq!(quick_check_nfkc(&s), super::quick_check_nfkc_oracle(&s));
        assert_eq!(quick_check_nfkd(&s), super::quick_check_nfkd_oracle(&s));
    }
}
simd_normalizer/quick_check.rs

simd_normalizer/
quick_check.rs