1use crate::simd;
10use crate::tables;
11use crate::utf8;
12
/// Three-valued outcome of a normalization quick check.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The quick check found nothing that could violate the normalization form.
    Yes,
    /// The quick check found a definite violation.
    No,
    /// The quick check was inconclusive; the `is_normalized_*` helpers in this
    /// file settle it by normalizing the input and comparing.
    Maybe,
}
23
24#[inline]
26fn qc_value_to_result(v: u8) -> IsNormalized {
27 match v {
28 0 => IsNormalized::Yes,
29 1 => IsNormalized::Maybe,
30 _ => IsNormalized::No,
31 }
32}
33
/// Returns `true` when `cp` lies in U+3400..=U+4DBF or U+4E00..=U+9FFF.
///
/// Callers in this file treat such code points as needing no table lookup.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
40
/// Range test for code points that callers skip without a table lookup.
///
/// Returns `true` for:
/// * every `cp >= 0x20000` except U+2F800..=U+2FA1F, and
/// * U+1F252..=U+1FBEF.
///
/// Everything else yields `false`. Call sites in this file only consult this
/// function after checking `cp >= 0x10000`.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        // Carved out of the otherwise-safe upper region; must come first.
        0x2F800..=0x2FA1F => false,
        0x20000..=u32::MAX => true,
        0x1F252..=0x1FBEF => true,
        _ => false,
    }
}
54
/// Returns `true` for the kana code points that callers may skip when their
/// `kana_safe` flag is set.
///
/// Accepted: U+3041..=U+3098, U+309D, U+309E, and U+30A0..=U+30FE.
/// Rejected within the surrounding area: U+3099..=U+309C, U+309F, and U+30FF.
#[inline(always)]
fn is_kana(cp: u32) -> bool {
    matches!(cp, 0x3041..=0x3098 | 0x309D | 0x309E | 0x30A0..=0x30FE)
}
68
69#[inline]
89fn quick_check_impl(
90 input: &str,
91 qc_shift: u32,
92 simd_bound: u8,
93 safe_below: u32,
94 hangul_safe: bool,
95 kana_safe: bool,
96 latin1_upper_safe: bool,
97) -> IsNormalized {
98 let bytes = input.as_bytes();
99 let len = bytes.len();
100
101 if len < 64 {
102 return quick_check_scalar(
103 input,
104 qc_shift,
105 safe_below,
106 hangul_safe,
107 kana_safe,
108 latin1_upper_safe,
109 );
110 }
111
112 let ptr = bytes.as_ptr();
113
114 let mut last_ccc: u8 = 0;
115 let mut result = IsNormalized::Yes;
116 let mut processed_up_to: usize = 0;
118 let mut pos: usize = 0;
119
120 while pos + 64 <= len {
122 let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
124 let chunk_end = pos + 64;
125
126 if mask == 0 {
127 last_ccc = 0;
130 processed_up_to = chunk_end;
131 pos = chunk_end;
132 continue;
133 }
134
135 let chunk_start = pos;
137 let mut chunk_mask = mask;
138 while chunk_mask != 0 {
139 let bit_pos = chunk_mask.trailing_zeros() as usize;
140 chunk_mask &= chunk_mask.wrapping_sub(1); let byte_pos = chunk_start + bit_pos;
143
144 if byte_pos < processed_up_to {
146 continue;
147 }
148
149 if byte_pos > processed_up_to {
151 last_ccc = 0;
152 }
153
154 let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
156 processed_up_to = byte_pos + width;
157
158 let cp = ch as u32;
160 if cp < safe_below
161 || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
162 || is_cjk_unified(cp)
163 || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
164 || (kana_safe && is_kana(cp))
165 || (cp >= 0x10000 && is_supp_safe(cp))
166 {
167 last_ccc = 0;
168 continue;
169 }
170
171 let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
173 if ccc != 0 && last_ccc > ccc {
174 return IsNormalized::No;
175 }
176
177 match qc_value_to_result(qc) {
179 IsNormalized::No => return IsNormalized::No,
180 IsNormalized::Maybe => result = IsNormalized::Maybe,
181 IsNormalized::Yes => {},
182 }
183
184 last_ccc = ccc;
185 }
186
187 if processed_up_to < chunk_end {
189 last_ccc = 0;
190 processed_up_to = chunk_end;
191 }
192
193 pos = chunk_end;
194 }
195
196 let tail_start = processed_up_to.max(pos);
198 if tail_start > processed_up_to {
199 last_ccc = 0;
201 }
202 let mut tail_pos = tail_start;
203 while tail_pos < len {
204 let b = bytes[tail_pos];
205 if b < 0x80 {
206 last_ccc = 0;
208 tail_pos += 1;
209 continue;
210 }
211 if utf8::is_continuation_byte(b) {
214 tail_pos += 1;
215 continue;
216 }
217 let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
219
220 let cp = ch as u32;
222 if cp < safe_below
223 || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
224 || is_cjk_unified(cp)
225 || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
226 || (cp >= 0x10000 && is_supp_safe(cp))
227 {
228 last_ccc = 0;
229 tail_pos += width;
230 continue;
231 }
232
233 let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
234 if ccc != 0 && last_ccc > ccc {
235 return IsNormalized::No;
236 }
237 match qc_value_to_result(qc) {
238 IsNormalized::No => return IsNormalized::No,
239 IsNormalized::Maybe => result = IsNormalized::Maybe,
240 IsNormalized::Yes => {},
241 }
242 last_ccc = ccc;
243 tail_pos += width;
244 }
245
246 result
247}
248
249#[inline]
251fn quick_check_scalar(
252 input: &str,
253 qc_shift: u32,
254 safe_below: u32,
255 hangul_safe: bool,
256 kana_safe: bool,
257 latin1_upper_safe: bool,
258) -> IsNormalized {
259 let mut last_ccc: u8 = 0;
260 let mut result = IsNormalized::Yes;
261
262 for ch in input.chars() {
263 let cp = ch as u32;
264
265 if cp <= 0x7F {
267 last_ccc = 0;
268 continue;
269 }
270
271 if cp < safe_below
273 || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
274 || is_cjk_unified(cp)
275 || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
276 || (kana_safe && is_kana(cp))
277 || (cp >= 0x10000 && is_supp_safe(cp))
278 {
279 last_ccc = 0;
280 continue;
281 }
282
283 let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
284
285 if ccc != 0 && last_ccc > ccc {
287 return IsNormalized::No;
288 }
289
290 match qc_value_to_result(qc) {
291 IsNormalized::No => return IsNormalized::No,
292 IsNormalized::Maybe => result = IsNormalized::Maybe,
293 IsNormalized::Yes => {},
294 }
295
296 last_ccc = ccc;
297 }
298
299 result
300}
301
302#[cfg(not(feature = "quick_check_oracle"))]
324pub(crate) fn quick_check_nfc(input: &str) -> IsNormalized {
325 quick_check_impl(
326 input,
327 tables::CCC_QC_NFC_SHIFT,
328 0xCC,
329 0x0300,
330 true,
331 true,
332 true,
333 )
334}
335
/// NFC quick check (public build, used when the `quick_check_oracle` feature
/// is enabled).
///
/// Delegates to `quick_check_impl` with the NFC table shift, SIMD bound `0xCC`,
/// safe-below threshold U+0300, and all three range shortcuts enabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xCC;
    let safe_below: u32 = 0x0300;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (true, true, true);
    quick_check_impl(
        input,
        tables::CCC_QC_NFC_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
349
350#[cfg(not(feature = "quick_check_oracle"))]
352pub(crate) fn quick_check_nfd(input: &str) -> IsNormalized {
353 quick_check_impl(
354 input,
355 tables::CCC_QC_NFD_SHIFT,
356 0xC3,
357 0x00C0,
358 false,
359 false,
360 false,
361 )
362}
363
/// NFD quick check (public build, used when the `quick_check_oracle` feature
/// is enabled).
///
/// Delegates to `quick_check_impl` with the NFD table shift, SIMD bound `0xC3`,
/// safe-below threshold U+00C0, and all three range shortcuts disabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xC3;
    let safe_below: u32 = 0x00C0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (false, false, false);
    quick_check_impl(
        input,
        tables::CCC_QC_NFD_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
377
378#[cfg(not(feature = "quick_check_oracle"))]
380pub(crate) fn quick_check_nfkc(input: &str) -> IsNormalized {
381 quick_check_impl(
382 input,
383 tables::CCC_QC_NFKC_SHIFT,
384 0xC0,
385 0x00A0,
386 true,
387 true,
388 true,
389 )
390}
391
/// NFKC quick check (public build, used when the `quick_check_oracle` feature
/// is enabled).
///
/// Delegates to `quick_check_impl` with the NFKC table shift, SIMD bound `0xC0`,
/// safe-below threshold U+00A0, and all three range shortcuts enabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xC0;
    let safe_below: u32 = 0x00A0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (true, true, true);
    quick_check_impl(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
405
406#[cfg(not(feature = "quick_check_oracle"))]
408pub(crate) fn quick_check_nfkd(input: &str) -> IsNormalized {
409 quick_check_impl(
410 input,
411 tables::CCC_QC_NFKD_SHIFT,
412 0xC0,
413 0x00A0,
414 false,
415 false,
416 false,
417 )
418}
419
/// NFKD quick check (public build, used when the `quick_check_oracle` feature
/// is enabled).
///
/// Delegates to `quick_check_impl` with the NFKD table shift, SIMD bound `0xC0`,
/// safe-below threshold U+00A0, and all three range shortcuts disabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xC0;
    let safe_below: u32 = 0x00A0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (false, false, false);
    quick_check_impl(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
433
/// Reference quick-check driver, compiled only with the `quick_check_oracle`
/// feature.
///
/// Same overall shape as `quick_check_impl` (scalar path below 64 bytes,
/// 64-byte masked chunks, byte-wise tail) but without a Latin-1 upper
/// shortcut: the scalar path is called with that flag fixed to `false`, and
/// neither loop here tests for it.
///
/// Returns `No` on the first out-of-order combining class or `No` table value,
/// `Maybe` if any looked-up character mapped to `Maybe`, and `Yes` otherwise.
#[cfg(feature = "quick_check_oracle")]
#[inline]
fn quick_check_impl_oracle(
    input: &str,
    qc_shift: u32,
    simd_bound: u8,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
) -> IsNormalized {
    let bytes = input.as_bytes();
    let len = bytes.len();

    if len < 64 {
        return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe, false);
    }

    let base = bytes.as_ptr();
    // Combining class of the immediately preceding character (0 = none/starter).
    let mut prev_ccc: u8 = 0;
    let mut verdict = IsNormalized::Yes;
    // First byte index not yet accounted for.
    let mut consumed: usize = 0;
    let mut chunk_start: usize = 0;

    while chunk_start + 64 <= len {
        let chunk_end = chunk_start + 64;
        // SAFETY: `chunk_start + 64 <= len`, so the 64 bytes at
        // `base.add(chunk_start)` are inside `bytes`. NOTE(review): assumes
        // `scan_chunk` reads at most 64 bytes; confirm against its contract.
        let mask = unsafe { simd::scan_chunk(base.add(chunk_start), simd_bound) };

        if mask == 0 {
            // Nothing flagged in this chunk.
            prev_ccc = 0;
            consumed = chunk_end;
            chunk_start = chunk_end;
            continue;
        }

        let mut pending = mask;
        while pending != 0 {
            let offset = pending.trailing_zeros() as usize;
            // Drop the lowest set bit.
            pending &= pending.wrapping_sub(1);

            let at = chunk_start + offset;
            if at < consumed {
                // Inside a character that was already decoded.
                continue;
            }
            if at > consumed {
                // Unflagged bytes separate this character from the previous one.
                prev_ccc = 0;
            }

            let (ch, width) = utf8::decode_char_at(bytes, at);
            consumed = at + width;

            let cp = ch as u32;
            let skip_lookup = cp < safe_below
                || is_cjk_unified(cp)
                || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
                || (kana_safe && is_kana(cp))
                || (cp >= 0x10000 && is_supp_safe(cp));
            if skip_lookup {
                prev_ccc = 0;
                continue;
            }

            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
            if ccc != 0 && prev_ccc > ccc {
                return IsNormalized::No;
            }
            match qc_value_to_result(qc) {
                IsNormalized::No => return IsNormalized::No,
                IsNormalized::Maybe => verdict = IsNormalized::Maybe,
                IsNormalized::Yes => {}
            }
            prev_ccc = ccc;
        }

        if consumed < chunk_end {
            // Unflagged bytes follow the last decoded character in this chunk.
            prev_ccc = 0;
            consumed = chunk_end;
        }
        chunk_start = chunk_end;
    }

    // Resume after any character that ran past the final chunk boundary.
    let tail_start = consumed.max(chunk_start);
    if tail_start > consumed {
        prev_ccc = 0;
    }

    let mut i = tail_start;
    while let Some(&b) = bytes.get(i) {
        if b < 0x80 {
            prev_ccc = 0;
            i += 1;
            continue;
        }
        if utf8::is_continuation_byte(b) {
            i += 1;
            continue;
        }

        let (ch, width) = utf8::decode_char_at(bytes, i);
        let cp = ch as u32;
        // Unlike the chunk loop, the tail has no kana shortcut; kana falls
        // through to the table lookup below.
        let skip_lookup = cp < safe_below
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (cp >= 0x10000 && is_supp_safe(cp));
        if skip_lookup {
            prev_ccc = 0;
            i += width;
            continue;
        }

        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        if ccc != 0 && prev_ccc > ccc {
            return IsNormalized::No;
        }
        match qc_value_to_result(qc) {
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => verdict = IsNormalized::Maybe,
            IsNormalized::Yes => {}
        }
        prev_ccc = ccc;
        i += width;
    }

    verdict
}
571
/// NFC quick check through the oracle driver (`quick_check_oracle` feature only).
///
/// Uses the NFC table shift, SIMD bound `0xCC`, safe-below threshold U+0300,
/// with the Hangul and kana shortcuts enabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc_oracle(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xCC;
    let safe_below: u32 = 0x0300;
    let (hangul_safe, kana_safe) = (true, true);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFC_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
    )
}
577
/// NFD quick check through the oracle driver (`quick_check_oracle` feature only).
///
/// Uses the NFD table shift, SIMD bound `0xC3`, safe-below threshold U+00C0,
/// with the Hangul and kana shortcuts disabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd_oracle(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xC3;
    let safe_below: u32 = 0x00C0;
    let (hangul_safe, kana_safe) = (false, false);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFD_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
    )
}
583
/// NFKC quick check through the oracle driver (`quick_check_oracle` feature only).
///
/// Uses the NFKC table shift, SIMD bound `0xC0`, safe-below threshold U+00A0,
/// with the Hangul and kana shortcuts enabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc_oracle(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xC0;
    let safe_below: u32 = 0x00A0;
    let (hangul_safe, kana_safe) = (true, true);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
    )
}
589
/// NFKD quick check through the oracle driver (`quick_check_oracle` feature only).
///
/// Uses the NFKD table shift, SIMD bound `0xC0`, safe-below threshold U+00A0,
/// with the Hangul and kana shortcuts disabled.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd_oracle(input: &str) -> IsNormalized {
    let simd_bound: u8 = 0xC0;
    let safe_below: u32 = 0x00A0;
    let (hangul_safe, kana_safe) = (false, false);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        simd_bound,
        safe_below,
        hangul_safe,
        kana_safe,
    )
}
595
596pub(crate) fn is_normalized_nfc(input: &str) -> bool {
605 match quick_check_nfc(input) {
606 IsNormalized::Yes => true,
607 IsNormalized::No => false,
608 IsNormalized::Maybe => &*crate::nfc().normalize(input) == input,
609 }
610}
611
612pub(crate) fn is_normalized_nfd(input: &str) -> bool {
614 match quick_check_nfd(input) {
615 IsNormalized::Yes => true,
616 IsNormalized::No => false,
617 IsNormalized::Maybe => &*crate::nfd().normalize(input) == input,
618 }
619}
620
621pub(crate) fn is_normalized_nfkc(input: &str) -> bool {
623 match quick_check_nfkc(input) {
624 IsNormalized::Yes => true,
625 IsNormalized::No => false,
626 IsNormalized::Maybe => &*crate::nfkc().normalize(input) == input,
627 }
628}
629
630pub(crate) fn is_normalized_nfkd(input: &str) -> bool {
632 match quick_check_nfkd(input) {
633 IsNormalized::Yes => true,
634 IsNormalized::No => false,
635 IsNormalized::Maybe => &*crate::nfkd().normalize(input) == input,
636 }
637}
638
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;
    use alloc::string::String;

    /// Pure-ASCII sample shared by the per-form ASCII tests.
    const ASCII_SAMPLE: &str = "Hello, world!";

    // ---- ASCII and empty input ------------------------------------------

    #[test]
    fn ascii_is_nfc() {
        assert_eq!(quick_check_nfc(ASCII_SAMPLE), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfd() {
        assert_eq!(quick_check_nfd(ASCII_SAMPLE), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkc() {
        assert_eq!(quick_check_nfkc(ASCII_SAMPLE), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkd() {
        assert_eq!(quick_check_nfkd(ASCII_SAMPLE), IsNormalized::Yes);
    }

    #[test]
    fn empty_string_is_normalized() {
        let empty = "";
        assert_eq!(quick_check_nfc(empty), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(empty), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(empty), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(empty), IsNormalized::Yes);
    }

    // ---- Composed vs. decomposed forms ----------------------------------

    #[test]
    fn precomposed_is_nfc_yes() {
        // U+00E9: precomposed e-acute.
        let e_acute = "\u{00E9}";
        assert_eq!(quick_check_nfc(e_acute), IsNormalized::Yes);
    }

    #[test]
    fn decomposed_is_not_nfc() {
        // 'e' followed by U+0301 (combining acute).
        let result = quick_check_nfc("e\u{0301}");
        assert!(
            matches!(result, IsNormalized::No | IsNormalized::Maybe),
            "NFD form must not be Yes for NFC, got {:?}",
            result,
        );
    }

    #[test]
    fn precomposed_is_not_nfd() {
        let e_acute = "\u{00E9}";
        assert_eq!(quick_check_nfd(e_acute), IsNormalized::No);
    }

    // ---- Combining-class ordering ---------------------------------------

    #[test]
    fn wrong_ccc_order_is_no() {
        // U+0301 (ccc 230) precedes U+0327 (ccc 202): out of canonical order.
        let bad_order = "a\u{0301}\u{0327}";
        assert_eq!(quick_check_nfc(bad_order), IsNormalized::No);
        assert_eq!(quick_check_nfd(bad_order), IsNormalized::No);
    }

    #[test]
    fn correct_ccc_order_not_rejected() {
        // U+0591 (ccc 220) precedes U+05A1 (ccc 230): ascending order.
        let good_order = "a\u{0591}\u{05A1}";
        let verdict = quick_check_nfc(good_order);
        assert_ne!(verdict, IsNormalized::No);
    }

    // ---- Range shortcuts -------------------------------------------------

    #[test]
    fn latin1_supplement_is_nfc() {
        let latin1 = "\u{00C0}\u{00E9}\u{00F6}\u{00FC}\u{00FF}";
        assert_eq!(quick_check_nfc(latin1), IsNormalized::Yes);
    }

    #[test]
    fn latin_extended_is_nfc() {
        let extended = "\u{0100}\u{017E}\u{0250}\u{02FF}";
        assert_eq!(quick_check_nfc(extended), IsNormalized::Yes);
    }

    #[test]
    fn cjk_is_nfc() {
        // First and last code points of both ranges accepted by is_cjk_unified.
        let cjk = "\u{4E00}\u{9FFF}\u{3400}\u{4DBF}";
        assert_eq!(quick_check_nfc(cjk), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_nfc() {
        // First and last code points of the Hangul shortcut range.
        let hangul = "\u{AC00}\u{D7A3}";
        assert_eq!(quick_check_nfc(hangul), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_not_nfd() {
        let hangul = "\u{AC00}";
        assert_eq!(quick_check_nfd(hangul), IsNormalized::No);
    }

    #[test]
    fn latin1_is_not_nfd() {
        let a_grave = "\u{00C0}";
        assert_eq!(quick_check_nfd(a_grave), IsNormalized::No);
    }

    #[test]
    fn nbsp_is_not_nfkc() {
        let nbsp = "\u{00A0}";
        assert_eq!(quick_check_nfkc(nbsp), IsNormalized::No);
    }

    // ---- Boolean helpers -------------------------------------------------

    #[test]
    fn is_normalized_nfc_ascii() {
        assert!(is_normalized_nfc("Hello"));
    }

    #[test]
    fn is_normalized_nfc_precomposed() {
        assert!(is_normalized_nfc("\u{00E9}"));
    }

    #[test]
    fn is_normalized_nfd_decomposed() {
        assert!(is_normalized_nfd("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfc_rejects_nfd() {
        assert!(!is_normalized_nfc("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfd_rejects_nfc() {
        assert!(!is_normalized_nfd("\u{00E9}"));
    }

    // ---- Chunked path (inputs of 64 bytes or more) -----------------------

    #[test]
    fn safe_lead_interleaved_with_combining_marks_across_chunk() {
        // Each unit is 8 bytes; 16 repetitions fill exactly two 64-byte chunks.
        let unit = "\u{4E2D}a\u{0591}bb";
        let s: String = unit.repeat(16);
        assert_eq!(s.len(), 128);
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(&s), IsNormalized::Yes);
    }

    #[test]
    fn safe_lead_then_out_of_order_combining_is_no() {
        // 64 ASCII bytes fill the first chunk, so the misordered marks land
        // in the bytes after it.
        let unit = "\u{4E2D}a\u{0301}\u{0327}";
        let padding = "x".repeat(64);
        let s = format!("{}{}", padding, unit);
        assert!(s.len() >= 64);
        assert_eq!(quick_check_nfc(&s), IsNormalized::No);
    }

    #[cfg(feature = "quick_check_oracle")]
    #[test]
    fn oracle_matches_fastpath_on_fixed_input() {
        let s = "\u{4E2D}a\u{0591}bb".repeat(16);
        assert_eq!(quick_check_nfc(&s), super::quick_check_nfc_oracle(&s));
        assert_eq!(quick_check_nfd(&s), super::quick_check_nfd_oracle(&s));
        assert_eq!(quick_check_nfkc(&s), super::quick_check_nfkc_oracle(&s));
        assert_eq!(quick_check_nfkd(&s), super::quick_check_nfkd_oracle(&s));
    }
}