1use crate::simd;
10use crate::tables;
11use crate::utf8;
12
/// Outcome of a normalization quick check.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The input is definitely in the requested normalization form.
    Yes,
    /// The input is definitely not in the requested normalization form.
    No,
    /// The quick check was inconclusive; callers in this module resolve it by
    /// normalizing the input and comparing the result with the original.
    Maybe,
}
23
24#[inline]
26fn qc_value_to_result(v: u8) -> IsNormalized {
27 match v {
28 0 => IsNormalized::Yes,
29 1 => IsNormalized::Maybe,
30 _ => IsNormalized::No,
31 }
32}
33
/// Returns `true` when `cp` lies in `U+3400..=U+4DBF` or `U+4E00..=U+9FFF`
/// (the CJK Unified Ideographs Extension A and CJK Unified Ideographs blocks).
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
40
/// Range test for supplementary-plane code points that callers treat as safe
/// to skip without a table lookup.
///
/// Returns `true` for `U+1F252..=U+1FBEF`, and for every value at or above
/// `U+20000` except `U+2F800..=U+2FA1F` (the CJK Compatibility Ideographs
/// Supplement block). Everything else yields `false`.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        0x1F252..=0x1FBEF => true,
        0x2F800..=0x2FA1F => false,
        // Outside the two special ranges only plane 2 and above qualify.
        _ => cp >= 0x20000,
    }
}
54
/// Returns `true` for the kana code points that callers skip when `kana_safe`
/// is set: `U+3041..=U+3098`, `U+309D`, `U+309E`, and `U+30A0..=U+30FE`.
///
/// `U+3099..=U+309C`, `U+309F` and `U+30FF` are deliberately outside the set.
#[inline(always)]
fn is_kana(cp: u32) -> bool {
    matches!(cp, 0x3041..=0x3098 | 0x309D | 0x309E | 0x30A0..=0x30FE)
}
68
69#[inline]
83fn quick_check_impl(
84 input: &str,
85 qc_shift: u32,
86 simd_bound: u8,
87 safe_below: u32,
88 hangul_safe: bool,
89 kana_safe: bool,
90) -> IsNormalized {
91 let bytes = input.as_bytes();
92 let len = bytes.len();
93
94 if len < 64 {
95 return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe);
96 }
97
98 let ptr = bytes.as_ptr();
99
100 let mut last_ccc: u8 = 0;
101 let mut result = IsNormalized::Yes;
102 let mut processed_up_to: usize = 0;
104 let mut pos: usize = 0;
105
106 while pos + 64 <= len {
108 let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
110 let chunk_end = pos + 64;
111
112 if mask == 0 {
113 last_ccc = 0;
116 processed_up_to = chunk_end;
117 pos = chunk_end;
118 continue;
119 }
120
121 let chunk_start = pos;
123 let mut chunk_mask = mask;
124 while chunk_mask != 0 {
125 let bit_pos = chunk_mask.trailing_zeros() as usize;
126 chunk_mask &= chunk_mask.wrapping_sub(1); let byte_pos = chunk_start + bit_pos;
129
130 if byte_pos < processed_up_to {
132 continue;
133 }
134
135 if byte_pos > processed_up_to {
137 last_ccc = 0;
138 }
139
140 let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
142 processed_up_to = byte_pos + width;
143
144 let cp = ch as u32;
146 if cp < safe_below
147 || is_cjk_unified(cp)
148 || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
149 || (kana_safe && is_kana(cp))
150 || (cp >= 0x10000 && is_supp_safe(cp))
151 {
152 last_ccc = 0;
153 continue;
154 }
155
156 let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
158 if ccc != 0 && last_ccc > ccc {
159 return IsNormalized::No;
160 }
161
162 match qc_value_to_result(qc) {
164 IsNormalized::No => return IsNormalized::No,
165 IsNormalized::Maybe => result = IsNormalized::Maybe,
166 IsNormalized::Yes => {},
167 }
168
169 last_ccc = ccc;
170 }
171
172 if processed_up_to < chunk_end {
174 last_ccc = 0;
175 processed_up_to = chunk_end;
176 }
177
178 pos = chunk_end;
179 }
180
181 let tail_start = processed_up_to.max(pos);
183 if tail_start > processed_up_to {
184 last_ccc = 0;
186 }
187 let mut tail_pos = tail_start;
188 while tail_pos < len {
189 let b = bytes[tail_pos];
190 if b < 0x80 {
191 last_ccc = 0;
193 tail_pos += 1;
194 continue;
195 }
196 if utf8::is_continuation_byte(b) {
199 tail_pos += 1;
200 continue;
201 }
202 let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
204
205 let cp = ch as u32;
207 if cp < safe_below
208 || is_cjk_unified(cp)
209 || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
210 || (cp >= 0x10000 && is_supp_safe(cp))
211 {
212 last_ccc = 0;
213 tail_pos += width;
214 continue;
215 }
216
217 let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
218 if ccc != 0 && last_ccc > ccc {
219 return IsNormalized::No;
220 }
221 match qc_value_to_result(qc) {
222 IsNormalized::No => return IsNormalized::No,
223 IsNormalized::Maybe => result = IsNormalized::Maybe,
224 IsNormalized::Yes => {},
225 }
226 last_ccc = ccc;
227 tail_pos += width;
228 }
229
230 result
231}
232
233#[inline]
235fn quick_check_scalar(
236 input: &str,
237 qc_shift: u32,
238 safe_below: u32,
239 hangul_safe: bool,
240 kana_safe: bool,
241) -> IsNormalized {
242 let mut last_ccc: u8 = 0;
243 let mut result = IsNormalized::Yes;
244
245 for ch in input.chars() {
246 let cp = ch as u32;
247
248 if cp <= 0x7F {
250 last_ccc = 0;
251 continue;
252 }
253
254 if cp < safe_below
256 || is_cjk_unified(cp)
257 || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
258 || (kana_safe && is_kana(cp))
259 || (cp >= 0x10000 && is_supp_safe(cp))
260 {
261 last_ccc = 0;
262 continue;
263 }
264
265 let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
266
267 if ccc != 0 && last_ccc > ccc {
269 return IsNormalized::No;
270 }
271
272 match qc_value_to_result(qc) {
273 IsNormalized::No => return IsNormalized::No,
274 IsNormalized::Maybe => result = IsNormalized::Maybe,
275 IsNormalized::Yes => {},
276 }
277
278 last_ccc = ccc;
279 }
280
281 result
282}
283
284#[cfg(not(feature = "quick_check_oracle"))]
306pub(crate) fn quick_check_nfc(input: &str) -> IsNormalized {
307 quick_check_impl(input, tables::CCC_QC_NFC_SHIFT, 0xCC, 0x0300, true, true)
308}
309
310#[cfg(feature = "quick_check_oracle")]
312pub fn quick_check_nfc(input: &str) -> IsNormalized {
313 quick_check_impl(input, tables::CCC_QC_NFC_SHIFT, 0xCC, 0x0300, true, true)
314}
315
316#[cfg(not(feature = "quick_check_oracle"))]
318pub(crate) fn quick_check_nfd(input: &str) -> IsNormalized {
319 quick_check_impl(input, tables::CCC_QC_NFD_SHIFT, 0xC3, 0x00C0, false, false)
320}
321
322#[cfg(feature = "quick_check_oracle")]
324pub fn quick_check_nfd(input: &str) -> IsNormalized {
325 quick_check_impl(input, tables::CCC_QC_NFD_SHIFT, 0xC3, 0x00C0, false, false)
326}
327
328#[cfg(not(feature = "quick_check_oracle"))]
330pub(crate) fn quick_check_nfkc(input: &str) -> IsNormalized {
331 quick_check_impl(input, tables::CCC_QC_NFKC_SHIFT, 0xC0, 0x00A0, true, true)
332}
333
334#[cfg(feature = "quick_check_oracle")]
336pub fn quick_check_nfkc(input: &str) -> IsNormalized {
337 quick_check_impl(input, tables::CCC_QC_NFKC_SHIFT, 0xC0, 0x00A0, true, true)
338}
339
340#[cfg(not(feature = "quick_check_oracle"))]
342pub(crate) fn quick_check_nfkd(input: &str) -> IsNormalized {
343 quick_check_impl(input, tables::CCC_QC_NFKD_SHIFT, 0xC0, 0x00A0, false, false)
344}
345
346#[cfg(feature = "quick_check_oracle")]
348pub fn quick_check_nfkd(input: &str) -> IsNormalized {
349 quick_check_impl(input, tables::CCC_QC_NFKD_SHIFT, 0xC0, 0x00A0, false, false)
350}
351
/// Quick-check driver behind the `*_oracle` entry points, compiled only with
/// the `quick_check_oracle` feature.
///
/// Follows the same chunked algorithm as `quick_check_impl`: inputs shorter
/// than 64 bytes go to [`quick_check_scalar`]; longer inputs are scanned in
/// 64-byte chunks via `simd::scan_chunk`, only flagged byte positions are
/// decoded, and the bytes after the last full chunk are handled by a scalar
/// tail loop.
///
/// * `qc_shift` - forwarded to `tables::lookup_ccc_qc` to select the form.
/// * `simd_bound` - byte threshold forwarded to `simd::scan_chunk`.
///   NOTE(review): presumably every byte `>= simd_bound` is flagged in the
///   mask - confirm against the `simd` module.
/// * `safe_below` - code points below this value skip the table lookup.
/// * `hangul_safe` - when set, `U+AC00..=U+D7A3` skip the table lookup.
/// * `kana_safe` - when set, code points accepted by `is_kana` skip the
///   table lookup.
///
/// Returns `No` as soon as a character has quick-check value `No` or a
/// non-zero combining class lower than that of the character immediately
/// before it; otherwise `Maybe` if any inspected character was `Maybe`, else
/// `Yes`.
#[cfg(feature = "quick_check_oracle")]
#[inline]
fn quick_check_impl_oracle(
    input: &str,
    qc_shift: u32,
    simd_bound: u8,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
) -> IsNormalized {
    let bytes = input.as_bytes();
    let len = bytes.len();

    // Not even one full chunk: the scalar path handles it directly.
    if len < 64 {
        return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe);
    }

    // One predicate for both loops, so the chunk loop and the tail loop agree
    // on which code points skip the table lookup.
    let skips_lookup = |cp: u32| -> bool {
        cp < safe_below
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (kana_safe && is_kana(cp))
            || (cp >= 0x10000 && is_supp_safe(cp))
    };

    let ptr = bytes.as_ptr();
    let mut last_ccc: u8 = 0;
    let mut result = IsNormalized::Yes;
    // First byte offset that has not yet been accounted for.
    let mut processed_up_to: usize = 0;
    let mut pos: usize = 0;

    while pos + 64 <= len {
        // SAFETY: the loop condition guarantees `pos + 64 <= len`, so the 64
        // bytes starting at `ptr.add(pos)` lie inside `bytes`.
        // NOTE(review): assumes `scan_chunk` reads exactly 64 bytes - confirm.
        let mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
        let chunk_end = pos + 64;

        if mask == 0 {
            // Nothing flagged: skip the chunk and drop pending CCC state.
            last_ccc = 0;
            processed_up_to = chunk_end;
            pos = chunk_end;
            continue;
        }

        let chunk_start = pos;
        let mut chunk_mask = mask;
        while chunk_mask != 0 {
            let bit_pos = chunk_mask.trailing_zeros() as usize;
            // Clear the lowest set bit.
            chunk_mask &= chunk_mask.wrapping_sub(1);

            let byte_pos = chunk_start + bit_pos;
            // The flagged byte belongs to a character that was already decoded.
            if byte_pos < processed_up_to {
                continue;
            }
            // Unflagged bytes separate this character from the previous
            // inspected one, so the combining sequence is broken.
            if byte_pos > processed_up_to {
                last_ccc = 0;
            }

            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
            processed_up_to = byte_pos + width;

            if skips_lookup(ch as u32) {
                last_ccc = 0;
                continue;
            }

            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
            if ccc != 0 && last_ccc > ccc {
                return IsNormalized::No;
            }
            match qc_value_to_result(qc) {
                IsNormalized::No => return IsNormalized::No,
                IsNormalized::Maybe => result = IsNormalized::Maybe,
                IsNormalized::Yes => {}
            }
            last_ccc = ccc;
        }

        // Unflagged bytes trail the last inspected character of this chunk.
        if processed_up_to < chunk_end {
            last_ccc = 0;
            processed_up_to = chunk_end;
        }
        pos = chunk_end;
    }

    // Every chunk iteration leaves `processed_up_to >= pos`, and the loop ran
    // at least once because `len >= 64`, so the tail starts at
    // `processed_up_to` with `last_ccc` carried over unchanged.
    let mut tail_pos = processed_up_to;
    while tail_pos < len {
        let b = bytes[tail_pos];
        if b < 0x80 {
            last_ccc = 0;
            tail_pos += 1;
            continue;
        }
        // A skipped character may straddle the last chunk boundary; step over
        // its remaining continuation bytes.
        if utf8::is_continuation_byte(b) {
            tail_pos += 1;
            continue;
        }
        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
        // Same predicate as the chunk loop, kana ranges included.
        if skips_lookup(ch as u32) {
            last_ccc = 0;
            tail_pos += width;
            continue;
        }
        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        if ccc != 0 && last_ccc > ccc {
            return IsNormalized::No;
        }
        match qc_value_to_result(qc) {
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => result = IsNormalized::Maybe,
            IsNormalized::Yes => {}
        }
        last_ccc = ccc;
        tail_pos += width;
    }

    result
}
489
/// NFC quick check routed through `quick_check_impl_oracle`; available only
/// with the `quick_check_oracle` feature.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc_oracle(input: &str) -> IsNormalized {
    // Same parameters as `quick_check_nfc`.
    const SIMD_BOUND: u8 = 0xCC;
    const SAFE_BELOW: u32 = 0x0300;
    const HANGUL_SAFE: bool = true;
    const KANA_SAFE: bool = true;
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
495
/// NFD quick check routed through `quick_check_impl_oracle`; available only
/// with the `quick_check_oracle` feature.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd_oracle(input: &str) -> IsNormalized {
    // Same parameters as `quick_check_nfd`.
    const SIMD_BOUND: u8 = 0xC3;
    const SAFE_BELOW: u32 = 0x00C0;
    const HANGUL_SAFE: bool = false;
    const KANA_SAFE: bool = false;
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
501
/// NFKC quick check routed through `quick_check_impl_oracle`; available only
/// with the `quick_check_oracle` feature.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc_oracle(input: &str) -> IsNormalized {
    // Same parameters as `quick_check_nfkc`.
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    const HANGUL_SAFE: bool = true;
    const KANA_SAFE: bool = true;
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
507
/// NFKD quick check routed through `quick_check_impl_oracle`; available only
/// with the `quick_check_oracle` feature.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd_oracle(input: &str) -> IsNormalized {
    // Same parameters as `quick_check_nfkd`.
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    const HANGUL_SAFE: bool = false;
    const KANA_SAFE: bool = false;
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        HANGUL_SAFE,
        KANA_SAFE,
    )
}
513
514pub(crate) fn is_normalized_nfc(input: &str) -> bool {
523 match quick_check_nfc(input) {
524 IsNormalized::Yes => true,
525 IsNormalized::No => false,
526 IsNormalized::Maybe => &*crate::nfc().normalize(input) == input,
527 }
528}
529
530pub(crate) fn is_normalized_nfd(input: &str) -> bool {
532 match quick_check_nfd(input) {
533 IsNormalized::Yes => true,
534 IsNormalized::No => false,
535 IsNormalized::Maybe => &*crate::nfd().normalize(input) == input,
536 }
537}
538
539pub(crate) fn is_normalized_nfkc(input: &str) -> bool {
541 match quick_check_nfkc(input) {
542 IsNormalized::Yes => true,
543 IsNormalized::No => false,
544 IsNormalized::Maybe => &*crate::nfkc().normalize(input) == input,
545 }
546}
547
548pub(crate) fn is_normalized_nfkd(input: &str) -> bool {
550 match quick_check_nfkd(input) {
551 IsNormalized::Yes => true,
552 IsNormalized::No => false,
553 IsNormalized::Maybe => &*crate::nfkd().normalize(input) == input,
554 }
555}
556
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;
    use alloc::string::String;

    // ---- ASCII and empty input -------------------------------------------

    #[test]
    fn ascii_is_nfc() {
        assert_eq!(quick_check_nfc("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfd() {
        assert_eq!(quick_check_nfd("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkc() {
        assert_eq!(quick_check_nfkc("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkd() {
        assert_eq!(quick_check_nfkd("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn empty_string_is_normalized() {
        assert_eq!(quick_check_nfc(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(""), IsNormalized::Yes);
    }

    // ---- Composed vs. decomposed forms -----------------------------------

    #[test]
    fn precomposed_is_nfc_yes() {
        assert_eq!(quick_check_nfc("\u{00E9}"), IsNormalized::Yes);
    }

    #[test]
    fn decomposed_is_not_nfc() {
        let nfd = "e\u{0301}";
        let result = quick_check_nfc(nfd);
        assert!(
            result == IsNormalized::No || result == IsNormalized::Maybe,
            "NFD form must not be Yes for NFC, got {:?}",
            result,
        );
    }

    #[test]
    fn precomposed_is_not_nfd() {
        assert_eq!(quick_check_nfd("\u{00E9}"), IsNormalized::No);
    }

    // ---- Canonical ordering ----------------------------------------------

    #[test]
    fn wrong_ccc_order_is_no() {
        // U+0301 has combining class 230 and U+0327 has 202: out of order.
        let bad_order = "a\u{0301}\u{0327}";
        assert_eq!(quick_check_nfc(bad_order), IsNormalized::No);
        assert_eq!(quick_check_nfd(bad_order), IsNormalized::No);
    }

    #[test]
    fn correct_ccc_order_not_rejected() {
        // U+0591 has combining class 220 and U+05A1 has 230: in order.
        let good_order = "a\u{0591}\u{05A1}";
        let result = quick_check_nfc(good_order);
        assert_ne!(result, IsNormalized::No);
    }

    // ---- Fast-path ranges ------------------------------------------------

    #[test]
    fn latin1_supplement_is_nfc() {
        let latin1 = "\u{00C0}\u{00E9}\u{00F6}\u{00FC}\u{00FF}";
        assert_eq!(quick_check_nfc(latin1), IsNormalized::Yes);
    }

    #[test]
    fn latin_extended_is_nfc() {
        let extended = "\u{0100}\u{017E}\u{0250}\u{02FF}";
        assert_eq!(quick_check_nfc(extended), IsNormalized::Yes);
    }

    #[test]
    fn cjk_is_nfc() {
        let cjk = "\u{4E00}\u{9FFF}\u{3400}\u{4DBF}";
        assert_eq!(quick_check_nfc(cjk), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_nfc() {
        let hangul = "\u{AC00}\u{D7A3}";
        assert_eq!(quick_check_nfc(hangul), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_not_nfd() {
        let hangul = "\u{AC00}";
        assert_eq!(quick_check_nfd(hangul), IsNormalized::No);
    }

    #[test]
    fn latin1_is_not_nfd() {
        assert_eq!(quick_check_nfd("\u{00C0}"), IsNormalized::No);
    }

    #[test]
    fn nbsp_is_not_nfkc() {
        assert_eq!(quick_check_nfkc("\u{00A0}"), IsNormalized::No);
    }

    // ---- is_normalized_* wrappers ----------------------------------------

    #[test]
    fn is_normalized_nfc_ascii() {
        assert!(is_normalized_nfc("Hello"));
    }

    #[test]
    fn is_normalized_nfc_precomposed() {
        assert!(is_normalized_nfc("\u{00E9}"));
    }

    #[test]
    fn is_normalized_nfd_decomposed() {
        assert!(is_normalized_nfd("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfc_rejects_nfd() {
        assert!(!is_normalized_nfc("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfd_rejects_nfc() {
        assert!(!is_normalized_nfd("\u{00E9}"));
    }

    // ---- Inputs long enough to reach the chunked path --------------------

    #[test]
    fn safe_lead_interleaved_with_combining_marks_across_chunk() {
        // 8 bytes per unit, 16 repetitions: exactly two 64-byte chunks.
        let unit = "\u{4E2D}a\u{0591}bb";
        let s: String = unit.repeat(16);
        assert_eq!(s.len(), 128);
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(&s), IsNormalized::Yes);
    }

    #[test]
    fn safe_lead_then_out_of_order_combining_is_no() {
        let unit = "\u{4E2D}a\u{0301}\u{0327}";
        let padding = "x".repeat(64);
        let s = format!("{}{}", padding, unit);
        assert!(s.len() >= 64);
        assert_eq!(quick_check_nfc(&s), IsNormalized::No);
    }

    #[test]
    fn kana_is_nfc_in_chunk_and_tail() {
        // 6 bytes per unit, 20 repetitions = 120 bytes: one 64-byte chunk plus
        // a 56-byte tail, with a character straddling the chunk boundary.
        let s: String = "\u{3042}\u{30A2}".repeat(20);
        assert_eq!(s.len(), 120);
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
    }

    #[test]
    fn ccc_order_is_tracked_across_chunk_tail_boundary() {
        // U+0301 (class 230) occupies the last two bytes of the first chunk
        // and U+0327 (class 202) the first two bytes of the tail.
        let s = format!("{}\u{0301}\u{0327}", "x".repeat(62));
        assert_eq!(s.len(), 66);
        assert_eq!(quick_check_nfc(&s), IsNormalized::No);
        assert_eq!(quick_check_nfd(&s), IsNormalized::No);
    }

    #[cfg(feature = "quick_check_oracle")]
    #[test]
    fn oracle_matches_fastpath_on_fixed_input() {
        let s = "\u{4E2D}a\u{0591}bb".repeat(16);
        assert_eq!(quick_check_nfc(&s), super::quick_check_nfc_oracle(&s));
        assert_eq!(quick_check_nfd(&s), super::quick_check_nfd_oracle(&s));
        assert_eq!(quick_check_nfkc(&s), super::quick_check_nfkc_oracle(&s));
        assert_eq!(quick_check_nfkd(&s), super::quick_check_nfkd_oracle(&s));
    }
}