1use alloc::borrow::Cow;
8use alloc::string::String;
9
10use crate::ccc::CccBuffer;
11use crate::compose;
12use crate::decompose::{self, DecompForm};
13use crate::hangul;
14use crate::quick_check;
15use crate::simd;
16use crate::simd::prefetch;
17use crate::tables;
18use crate::utf8;
19
20#[derive(Clone, Copy, Debug, PartialEq, Eq)]
26pub enum Form {
27 Nfc,
29 Nfd,
31 Nfkc,
33 Nfkd,
35}
36
37impl Form {
38 #[inline]
47 fn passthrough_bound(self) -> u8 {
48 match self {
49 Form::Nfc | Form::Nfkc => 0xC0,
50 Form::Nfd | Form::Nfkd => 0xC0,
51 }
52 }
53
54 #[inline]
56 fn composes(self) -> bool {
57 matches!(self, Form::Nfc | Form::Nfkc)
58 }
59
60 #[inline]
62 fn decomp_form(self) -> DecompForm {
63 match self {
64 Form::Nfc | Form::Nfd => DecompForm::Canonical,
65 Form::Nfkc | Form::Nfkd => DecompForm::Compatible,
66 }
67 }
68
69 #[inline]
71 fn estimated_capacity(self, input_len: usize) -> usize {
72 match self {
73 Form::Nfc | Form::Nfkc => input_len,
74 Form::Nfd | Form::Nfkd => input_len + input_len / 2,
75 }
76 }
77
78 #[inline]
80 fn quick_check(self, input: &str) -> quick_check::IsNormalized {
81 match self {
82 Form::Nfc => quick_check::quick_check_nfc(input),
83 Form::Nfd => quick_check::quick_check_nfd(input),
84 Form::Nfkc => quick_check::quick_check_nfkc(input),
85 Form::Nfkd => quick_check::quick_check_nfkd(input),
86 }
87 }
88}
89
90struct NormState {
95 current_starter: Option<char>,
97 ccc_buf: CccBuffer,
99}
100
101impl NormState {
102 #[inline]
103 fn new() -> Self {
104 NormState {
105 current_starter: None,
106 ccc_buf: CccBuffer::new(),
107 }
108 }
109
110 #[inline]
114 fn flush(&mut self, out: &mut String, composes: bool) {
115 let starter = match self.current_starter.take() {
116 Some(s) => s,
117 None => {
118 if !self.ccc_buf.is_empty() {
120 self.ccc_buf.sort_in_place();
121 for entry in self.ccc_buf.as_slice() {
122 out.push(entry.ch);
123 }
124 self.ccc_buf.clear();
125 }
126 return;
127 },
128 };
129
130 if self.ccc_buf.is_empty() {
131 out.push(starter);
133 return;
134 }
135
136 self.ccc_buf.sort_in_place();
138
139 if composes {
140 compose::compose_combining_sequence_into(starter, self.ccc_buf.as_slice(), out);
141 } else {
142 out.push(starter);
144 for entry in self.ccc_buf.as_slice() {
145 out.push(entry.ch);
146 }
147 }
148 self.ccc_buf.clear();
149 }
150
151 #[inline]
157 fn feed_entry(&mut self, ch: char, ccc: u8, out: &mut String, composes: bool) {
158 if ccc == 0 {
159 if composes && self.ccc_buf.is_empty() {
161 if let Some(prev) = self.current_starter
163 && let Some(composed) = compose::compose(prev, ch)
164 {
165 self.current_starter = Some(composed);
166 return;
167 }
168 }
169 self.flush(out, composes);
171 self.current_starter = Some(ch);
172 } else {
173 self.ccc_buf.push(ch, ccc);
175 }
176 }
177
178 #[inline]
180 fn flush_nfd(&mut self, out: &mut String) {
181 let starter = match self.current_starter.take() {
182 Some(s) => s,
183 None => {
184 if !self.ccc_buf.is_empty() {
185 self.ccc_buf.sort_in_place();
186 for entry in self.ccc_buf.as_slice() {
187 out.push(entry.ch);
188 }
189 self.ccc_buf.clear();
190 }
191 return;
192 },
193 };
194
195 if let Some(entry) = self.ccc_buf.take_single_inline() {
198 out.push(starter);
199 out.push(entry.ch);
200 return;
201 }
202
203 if self.ccc_buf.is_empty() {
204 out.push(starter);
205 return;
206 }
207
208 self.ccc_buf.sort_in_place();
210 out.push(starter);
211 for entry in self.ccc_buf.as_slice() {
212 out.push(entry.ch);
213 }
214 self.ccc_buf.clear();
215 }
216
217 #[inline]
219 fn feed_entry_nfd(&mut self, ch: char, ccc: u8, out: &mut String) {
220 if ccc == 0 {
221 self.flush_nfd(out);
222 self.current_starter = Some(ch);
223 } else {
224 self.ccc_buf.push(ch, ccc);
225 }
226 }
227}
228
/// Returns `true` when `cp` lies in CJK Unified Ideographs Extension A
/// (U+3400..=U+4DBF) or the main CJK Unified Ideographs block
/// (U+4E00..=U+9FFF).
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
239
/// Sentinel entry for code points that have no decomposition (base byte 0).
const LATIN1_SELF_MAPPING: (u8, u16, u8) = (0, 0, 0);

/// Canonical decompositions for U+00C0..=U+00FF, indexed by `cp - 0xC0`
/// (equivalently, the low six bits of the UTF-8 continuation byte after 0xC3).
///
/// Each entry is `(ASCII base letter, combining mark code point, mark ccc)`;
/// [`LATIN1_SELF_MAPPING`] marks code points that map to themselves.
#[rustfmt::skip]
static LATIN1_NFD_TABLE: [(u8, u16, u8); 0x40] = [
    // U+00C0..=U+00C7: À Á Â Ã Ä Å Æ Ç
    (b'A', 0x0300, 230), (b'A', 0x0301, 230), (b'A', 0x0302, 230), (b'A', 0x0303, 230),
    (b'A', 0x0308, 230), (b'A', 0x030A, 230), LATIN1_SELF_MAPPING, (b'C', 0x0327, 202),
    // U+00C8..=U+00CF: È É Ê Ë Ì Í Î Ï
    (b'E', 0x0300, 230), (b'E', 0x0301, 230), (b'E', 0x0302, 230), (b'E', 0x0308, 230),
    (b'I', 0x0300, 230), (b'I', 0x0301, 230), (b'I', 0x0302, 230), (b'I', 0x0308, 230),
    // U+00D0..=U+00D7: Ð Ñ Ò Ó Ô Õ Ö ×
    LATIN1_SELF_MAPPING, (b'N', 0x0303, 230), (b'O', 0x0300, 230), (b'O', 0x0301, 230),
    (b'O', 0x0302, 230), (b'O', 0x0303, 230), (b'O', 0x0308, 230), LATIN1_SELF_MAPPING,
    // U+00D8..=U+00DF: Ø Ù Ú Û Ü Ý Þ ß
    LATIN1_SELF_MAPPING, (b'U', 0x0300, 230), (b'U', 0x0301, 230), (b'U', 0x0302, 230),
    (b'U', 0x0308, 230), (b'Y', 0x0301, 230), LATIN1_SELF_MAPPING, LATIN1_SELF_MAPPING,
    // U+00E0..=U+00E7: à á â ã ä å æ ç
    (b'a', 0x0300, 230), (b'a', 0x0301, 230), (b'a', 0x0302, 230), (b'a', 0x0303, 230),
    (b'a', 0x0308, 230), (b'a', 0x030A, 230), LATIN1_SELF_MAPPING, (b'c', 0x0327, 202),
    // U+00E8..=U+00EF: è é ê ë ì í î ï
    (b'e', 0x0300, 230), (b'e', 0x0301, 230), (b'e', 0x0302, 230), (b'e', 0x0308, 230),
    (b'i', 0x0300, 230), (b'i', 0x0301, 230), (b'i', 0x0302, 230), (b'i', 0x0308, 230),
    // U+00F0..=U+00F7: ð ñ ò ó ô õ ö ÷
    LATIN1_SELF_MAPPING, (b'n', 0x0303, 230), (b'o', 0x0300, 230), (b'o', 0x0301, 230),
    (b'o', 0x0302, 230), (b'o', 0x0303, 230), (b'o', 0x0308, 230), LATIN1_SELF_MAPPING,
    // U+00F8..=U+00FF: ø ù ú û ü ý þ ÿ
    LATIN1_SELF_MAPPING, (b'u', 0x0300, 230), (b'u', 0x0301, 230), (b'u', 0x0302, 230),
    (b'u', 0x0308, 230), (b'y', 0x0301, 230), LATIN1_SELF_MAPPING, (b'y', 0x0308, 230),
];
299
300#[inline(always)]
317unsafe fn latin1_supplement_nfd(bytes: *const u8, byte_pos: usize) -> Option<(u8, char, u8)> {
318 let b1 = unsafe { *bytes.add(byte_pos + 1) };
320 let idx = (b1 & 0x3F) as usize; let entry = LATIN1_NFD_TABLE[idx];
322 if entry.0 == 0 {
323 return None;
324 }
325 let mark = unsafe { char::from_u32_unchecked(entry.1 as u32) };
328 Some((entry.0, mark, entry.2))
329}
330
331#[inline]
336fn process_char(
337 ch: char,
338 state: &mut NormState,
339 out: &mut String,
340 form: Form,
341 decomp_buf: &mut CccBuffer,
342) {
343 let cp = ch as u32;
344
345 if cp >= 0x3400 && is_cjk_unified(cp) {
348 state.flush(out, form.composes());
349 state.current_starter = Some(ch);
350 return;
351 }
352
353 if hangul::is_hangul_syllable(ch) {
355 let (l, v, t) = hangul::decompose_hangul(ch);
356 state.feed_entry(l, 0, out, form.composes());
357 state.feed_entry(v, 0, out, form.composes());
358 if let Some(t_char) = t {
359 state.feed_entry(t_char, 0, out, form.composes());
360 }
361 return;
362 }
363
364 let trie_value = tables::raw_decomp_trie_value(ch, form.decomp_form());
366
367 if !tables::has_decomposition(trie_value) {
371 let ccc = tables::ccc_from_trie_value(trie_value);
372 state.feed_entry(ch, ccc, out, form.composes());
373 return;
374 }
375
376 decomp_buf.clear();
378 decompose::decompose_from_trie_value(ch, trie_value, decomp_buf, form.decomp_form());
379 for entry in decomp_buf.as_slice() {
380 state.feed_entry(entry.ch, entry.ccc, out, form.composes());
381 }
382}
383
/// Kind of decomposition attached to a decoded code point.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DecompKind {
    /// The code point has no decomposition under the active form.
    None,
    /// A decomposition was found while using the canonical mapping.
    Canonical,
    /// A decomposition was found while using the compatibility mapping.
    Compat,
}

/// Everything the fast path needs to know about one code point, gathered in
/// a single decode + table lookup.
struct DecodedCodepoint {
    /// The decoded scalar value.
    cp: u32,
    /// Length of the UTF-8 encoding in bytes (1..=4).
    cp_len: u8,
    /// Canonical combining class extracted from the trie value.
    ccc: u8,
    /// Whether (and under which mapping) the code point decomposes.
    decomp_kind: DecompKind,
    /// Packed expansion entries; empty when there is no decomposition or
    /// when the decomposition is carried in `tv` instead.
    decomp: &'static [u32],
    /// Raw trie value the other fields were derived from.
    tv: u32,
}
432
433#[inline(always)]
450unsafe fn decode_at(bytes: *const u8, idx: usize, len: usize, form: Form) -> DecodedCodepoint {
451 debug_assert!(idx < len);
452 let b0 = unsafe { *bytes.add(idx) };
454 let cp_len = utf8::utf8_char_width(b0);
455 debug_assert!(cp_len > 0, "decode_at called on continuation/invalid byte");
456 debug_assert!(idx + cp_len <= len, "UTF-8 sequence runs past end of input");
457
458 let cp = match cp_len {
463 1 => b0 as u32,
464 2 => {
465 let b1 = unsafe { *bytes.add(idx + 1) } as u32;
468 ((b0 as u32 & 0x1F) << 6) | (b1 & 0x3F)
469 },
470 3 => {
471 let b1 = unsafe { *bytes.add(idx + 1) } as u32;
473 let b2 = unsafe { *bytes.add(idx + 2) } as u32;
474 ((b0 as u32 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F)
475 },
476 4 => {
477 let b1 = unsafe { *bytes.add(idx + 1) } as u32;
479 let b2 = unsafe { *bytes.add(idx + 2) } as u32;
480 let b3 = unsafe { *bytes.add(idx + 3) } as u32;
481 ((b0 as u32 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
482 },
483 _ => unsafe { core::hint::unreachable_unchecked() },
486 };
487
488 let decomp_form = form.decomp_form();
492 let tv = if cp >= 0x10000 {
493 unsafe { tables::raw_decomp_trie_value_supplementary(cp, decomp_form) }
495 } else {
496 let ch = unsafe { char::from_u32_unchecked(cp) };
498 tables::raw_decomp_trie_value(ch, decomp_form)
499 };
500 let ccc = tables::ccc_from_trie_value(tv);
501
502 let (decomp_kind, decomp) = if !tables::has_decomposition(tv) {
503 (DecompKind::None, &[][..])
504 } else {
505 let kind = match decomp_form {
506 DecompForm::Canonical => DecompKind::Canonical,
507 DecompForm::Compatible => DecompKind::Compat,
508 };
509 let slice = tables::expansion_data_from_trie_value(tv, decomp_form).unwrap_or(&[]);
511 (kind, slice)
512 };
513
514 DecodedCodepoint {
515 cp,
516 cp_len: cp_len as u8,
517 ccc,
518 decomp_kind,
519 decomp,
520 tv,
521 }
522}
523
524#[inline(always)]
531fn feed_expansion(decomp: &'static [u32], state: &mut NormState, out: &mut String, composes: bool) {
532 if !composes && decomp.len() == 2 {
533 let e0 = decomp[0];
534 let ccc0 = (e0 >> tables::EXPANSION_CCC_SHIFT) as u8;
535 if ccc0 == 0 {
536 state.flush_nfd(out);
538 let cp0 = e0 & tables::EXPANSION_CP_MASK;
539 debug_assert!(cp0 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp0));
540 state.current_starter = Some(unsafe { char::from_u32_unchecked(cp0) });
542 let e1 = decomp[1];
543 let cp1 = e1 & tables::EXPANSION_CP_MASK;
544 let ccc1 = (e1 >> tables::EXPANSION_CCC_SHIFT) as u8;
545 debug_assert!(cp1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp1));
546 let ch1 = unsafe { char::from_u32_unchecked(cp1) };
548 if ccc1 != 0 {
549 state.ccc_buf.push(ch1, ccc1);
550 } else {
551 state.feed_entry_nfd(ch1, 0, out);
552 }
553 return;
554 }
555 }
556 for &entry in decomp {
557 let cp = entry & tables::EXPANSION_CP_MASK;
558 let ccc = (entry >> tables::EXPANSION_CCC_SHIFT) as u8;
559 debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
560 let exp_ch = unsafe { char::from_u32_unchecked(cp) };
562 if composes {
563 state.feed_entry(exp_ch, ccc, out, true);
564 } else {
565 state.feed_entry_nfd(exp_ch, ccc, out);
566 }
567 }
568}
569
570#[cold]
575#[inline(never)]
576fn feed_singleton(tv: u32, state: &mut NormState, out: &mut String, composes: bool) {
577 let info = tv & 0xFFFF;
578 debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
579 let decomposed = unsafe { char::from_u32_unchecked(info) };
581 let ccc = if info <= 0x7F {
582 0
583 } else {
584 tables::lookup_ccc(decomposed)
585 };
586 if composes {
587 state.feed_entry(decomposed, ccc, out, true);
588 } else {
589 state.feed_entry_nfd(decomposed, ccc, out);
590 }
591}
592
593#[inline]
600fn feed_combining_mark(ch: char, ccc: u8, state: &mut NormState, out: &mut String, composes: bool) {
601 if composes {
602 state.feed_entry(ch, ccc, out, true);
603 } else {
604 state.feed_entry_nfd(ch, ccc, out);
605 }
606}
607
608#[inline(always)]
613fn process_codepoint(dc: &DecodedCodepoint, state: &mut NormState, out: &mut String, form: Form) {
614 let composes = form.composes();
615 match dc.decomp_kind {
616 DecompKind::None => {
617 let ch = unsafe { char::from_u32_unchecked(dc.cp) };
620 if dc.ccc == 0 {
621 if composes {
622 state.feed_entry(ch, 0, out, true);
623 } else {
624 state.feed_entry_nfd(ch, 0, out);
625 }
626 } else {
627 feed_combining_mark(ch, dc.ccc, state, out, composes);
628 }
629 },
630 DecompKind::Canonical | DecompKind::Compat => {
631 if !dc.decomp.is_empty() {
632 feed_expansion(dc.decomp, state, out, composes);
633 } else {
634 feed_singleton(dc.tv, state, out, composes);
635 }
636 },
637 }
638}
639
640#[inline(always)]
649fn flush_compose_passthrough(pass: &str, next_tv: u32, state: &mut NormState, out: &mut String) {
650 if tables::needs_starter_shadow(next_tv) {
651 let n = pass.len();
652 if n > 1 {
653 out.push_str(&pass[..n - 1]);
654 }
655 let last_ch = pass.as_bytes()[n - 1] as char;
656 state.feed_entry(last_ch, 0, out, true);
657 } else {
658 out.push_str(pass);
659 }
660}
661
662fn normalize_scalar<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
668 if input.is_empty() {
669 return Cow::Borrowed(input);
670 }
671
672 if form.quick_check(input) == quick_check::IsNormalized::Yes {
674 return Cow::Borrowed(input);
675 }
676
677 let mut out = String::with_capacity(input.len());
678 let mut state = NormState::new();
679 let mut decomp_buf = CccBuffer::new();
680
681 for ch in input.chars() {
682 process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
683 }
684
685 state.flush(&mut out, form.composes());
687
688 if out == input {
689 Cow::Borrowed(input)
690 } else {
691 Cow::Owned(out)
692 }
693}
694
695#[inline]
714fn normalize_impl<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
715 let bytes = input.as_bytes();
716 let len = bytes.len();
717
718 if len < 64 {
720 return normalize_scalar(input, form);
721 }
722
723 let qc = form.quick_check(input);
725 if qc == quick_check::IsNormalized::Yes {
726 return Cow::Borrowed(input);
727 }
728
729 let bound = form.passthrough_bound();
731 let composes = form.composes();
732 let mut out = String::with_capacity(form.estimated_capacity(len));
733 let mut last_written: usize = 0;
734 let mut state = NormState::new();
735
736 let mut pos: usize = 0;
737 let ptr = bytes.as_ptr();
738
739 macro_rules! prefetch_write_head {
742 ($out:expr) => {
743 unsafe {
744 let write_head = $out.len();
745 let distance = prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE;
746 if write_head + distance <= $out.capacity() {
747 prefetch::prefetch_write($out.as_ptr().wrapping_add(write_head + distance));
748 }
749 }
750 };
751 }
752
753 macro_rules! process_chunk {
760 ($chunk_start:expr, $mask:expr) => {{
761 let chunk_start: usize = $chunk_start;
762 let mask: u64 = $mask;
763 if mask != 0 {
764 let mut chunk_mask = mask;
765 while chunk_mask != 0 {
766 let bit_pos = chunk_mask.trailing_zeros() as usize;
767 chunk_mask &= chunk_mask.wrapping_sub(1);
768
769 let byte_pos = chunk_start + bit_pos;
770
771 if byte_pos < last_written {
772 continue;
773 }
774
775 if utf8::is_continuation_byte(bytes[byte_pos]) {
776 continue;
777 }
778
779 if !composes && bytes[byte_pos] == 0xC3 {
781 if let Some((starter, mark, mark_ccc)) =
782 unsafe { latin1_supplement_nfd(ptr, byte_pos) }
783 {
784 if byte_pos > last_written {
785 state.flush_nfd(&mut out);
786 out.push_str(&input[last_written..byte_pos]);
787 }
788 last_written = byte_pos + 2;
789 state.flush_nfd(&mut out);
790 out.push(starter as char);
791 state.ccc_buf.push(mark, mark_ccc);
792 continue;
793 }
794 }
795
796 let dc = unsafe { decode_at(ptr, byte_pos, len, form) };
798 let width = dc.cp_len as usize;
799
800 if !composes {
801 if (hangul::S_BASE..hangul::S_BASE + hangul::S_COUNT).contains(&dc.cp) {
803 if byte_pos > last_written {
804 state.flush_nfd(&mut out);
805 out.push_str(&input[last_written..byte_pos]);
806 }
807 last_written = byte_pos + width;
808 state.flush_nfd(&mut out);
809 let ch = unsafe { char::from_u32_unchecked(dc.cp) };
810 hangul::push_decomposed_hangul(ch, &mut out);
815 continue;
816 }
817 if dc.decomp_kind == DecompKind::None && dc.ccc == 0 {
819 continue;
820 }
821 if byte_pos > last_written {
822 state.flush_nfd(&mut out);
823 out.push_str(&input[last_written..byte_pos]);
824 }
825 last_written = byte_pos + width;
826 process_codepoint(&dc, &mut state, &mut out, form);
827 continue;
828 }
829
830 if byte_pos > last_written {
832 state.flush(&mut out, composes);
833 let pass = &input[last_written..byte_pos];
834 flush_compose_passthrough(pass, dc.tv, &mut state, &mut out);
835 }
836 last_written = byte_pos + width;
837 process_codepoint(&dc, &mut state, &mut out, form);
838 }
839 }
840 }};
841 }
842
843 while pos + 128 <= len {
851 let chunk_a_start = pos;
852 let chunk_b_start = pos + 64;
853
854 let (mask_a, mask_b) = unsafe {
855 let prefetch_l1 =
856 ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
857 let prefetch_l2 =
858 ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
859 simd::scan_pair_and_prefetch(
860 ptr.add(chunk_a_start),
861 ptr.add(chunk_b_start),
862 prefetch_l1,
863 prefetch_l2,
864 bound,
865 )
866 };
867
868 prefetch_write_head!(out);
869 process_chunk!(chunk_a_start, mask_a);
870 process_chunk!(chunk_b_start, mask_b);
871
872 pos += 128;
873 }
874
875 while pos + 64 <= len {
877 let chunk_start = pos;
878
879 let mask = unsafe {
880 let prefetch_l1 =
881 ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
882 let prefetch_l2 =
883 ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
884 simd::scan_and_prefetch(ptr.add(pos), prefetch_l1, prefetch_l2, bound)
885 };
886
887 prefetch_write_head!(out);
888 process_chunk!(chunk_start, mask);
889
890 pos += 64;
891 }
892
893 if pos < len {
895 let tail_has_work = bytes[pos..].iter().any(|&b| b >= bound);
897
898 if tail_has_work {
899 let mut tail_pos = pos;
903 while tail_pos < len {
904 if tail_pos < last_written {
905 tail_pos += 1;
906 continue;
907 }
908
909 if utf8::is_continuation_byte(bytes[tail_pos]) {
910 tail_pos += 1;
911 continue;
912 }
913
914 if !composes && bytes[tail_pos] == 0xC3 {
916 if let Some((starter, mark, mark_ccc)) =
918 unsafe { latin1_supplement_nfd(ptr, tail_pos) }
919 {
920 if tail_pos > last_written {
921 state.flush_nfd(&mut out);
922 out.push_str(&input[last_written..tail_pos]);
923 }
924 last_written = tail_pos + 2;
925 state.flush_nfd(&mut out);
926 out.push(starter as char);
927 state.ccc_buf.push(mark, mark_ccc);
928 tail_pos += 2;
929 continue;
930 }
931 }
932
933 let dc = unsafe { decode_at(ptr, tail_pos, len, form) };
936 let width = dc.cp_len as usize;
937
938 if !composes {
941 if (hangul::S_BASE..hangul::S_BASE + hangul::S_COUNT).contains(&dc.cp) {
942 if tail_pos > last_written {
943 state.flush_nfd(&mut out);
944 out.push_str(&input[last_written..tail_pos]);
945 }
946 last_written = tail_pos + width;
947 state.flush_nfd(&mut out);
948 let ch = unsafe { char::from_u32_unchecked(dc.cp) };
950 hangul::push_decomposed_hangul(ch, &mut out);
951 tail_pos += width;
952 continue;
953 }
954 if dc.decomp_kind == DecompKind::None && dc.ccc == 0 {
955 tail_pos += width;
956 continue;
957 }
958 if tail_pos > last_written {
959 state.flush_nfd(&mut out);
960 out.push_str(&input[last_written..tail_pos]);
961 }
962 last_written = tail_pos + width;
963 process_codepoint(&dc, &mut state, &mut out, form);
964 tail_pos += width;
965 continue;
966 }
967
968 if tail_pos > last_written {
970 state.flush(&mut out, composes);
971 let pass = &input[last_written..tail_pos];
972 flush_compose_passthrough(pass, dc.tv, &mut state, &mut out);
974 }
975
976 last_written = tail_pos + width;
977 process_codepoint(&dc, &mut state, &mut out, form);
978
979 tail_pos += width;
980 }
981 }
982 }
983
984 if composes {
986 state.flush(&mut out, true);
987 } else {
988 state.flush_nfd(&mut out);
989 }
990
991 if last_written < len {
993 out.push_str(&input[last_written..len]);
994 }
995
996 if qc == quick_check::IsNormalized::Maybe && out == input {
999 Cow::Borrowed(input)
1000 } else {
1001 Cow::Owned(out)
1002 }
1003}
1004
/// Stateless normalizer producing NFC (canonical decomposition followed by
/// composition).
///
/// Zero-sized; `Default` yields the same value as `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfcNormalizer;

/// Stateless normalizer producing NFD (canonical decomposition).
///
/// Zero-sized; `Default` yields the same value as `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfdNormalizer;

/// Stateless normalizer producing NFKC (compatibility decomposition followed
/// by composition).
///
/// Zero-sized; `Default` yields the same value as `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfkcNormalizer;

/// Stateless normalizer producing NFKD (compatibility decomposition).
///
/// Zero-sized; `Default` yields the same value as `new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NfkdNormalizer;
1044
1045impl NfcNormalizer {
1046 pub fn new() -> Self {
1048 NfcNormalizer
1049 }
1050
1051 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1053 quick_check::quick_check_nfc(input)
1054 }
1055
1056 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1060 normalize_impl(input, Form::Nfc)
1061 }
1062
1063 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1067 let result = normalize_impl(input, Form::Nfc);
1068 let already_normalized = matches!(&result, Cow::Borrowed(_));
1069 out.push_str(&result);
1070 already_normalized
1071 }
1072
1073 pub fn is_normalized(&self, input: &str) -> bool {
1075 quick_check::is_normalized_nfc(input)
1076 }
1077}
1078
1079impl NfdNormalizer {
1080 pub fn new() -> Self {
1082 NfdNormalizer
1083 }
1084
1085 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1087 quick_check::quick_check_nfd(input)
1088 }
1089
1090 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1094 normalize_impl(input, Form::Nfd)
1095 }
1096
1097 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1101 let result = normalize_impl(input, Form::Nfd);
1102 let already_normalized = matches!(&result, Cow::Borrowed(_));
1103 out.push_str(&result);
1104 already_normalized
1105 }
1106
1107 pub fn is_normalized(&self, input: &str) -> bool {
1109 quick_check::is_normalized_nfd(input)
1110 }
1111}
1112
1113impl NfkcNormalizer {
1114 pub fn new() -> Self {
1116 NfkcNormalizer
1117 }
1118
1119 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1121 quick_check::quick_check_nfkc(input)
1122 }
1123
1124 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1128 normalize_impl(input, Form::Nfkc)
1129 }
1130
1131 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1135 let result = normalize_impl(input, Form::Nfkc);
1136 let already_normalized = matches!(&result, Cow::Borrowed(_));
1137 out.push_str(&result);
1138 already_normalized
1139 }
1140
1141 pub fn is_normalized(&self, input: &str) -> bool {
1143 quick_check::is_normalized_nfkc(input)
1144 }
1145}
1146
1147impl NfkdNormalizer {
1148 pub fn new() -> Self {
1150 NfkdNormalizer
1151 }
1152
1153 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
1155 quick_check::quick_check_nfkd(input)
1156 }
1157
1158 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
1162 normalize_impl(input, Form::Nfkd)
1163 }
1164
1165 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
1169 let result = normalize_impl(input, Form::Nfkd);
1170 let already_normalized = matches!(&result, Cow::Borrowed(_));
1171 out.push_str(&result);
1172 already_normalized
1173 }
1174
1175 pub fn is_normalized(&self, input: &str) -> bool {
1177 quick_check::is_normalized_nfkd(input)
1178 }
1179}
1180
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::borrow::Cow;
    use alloc::string::String;

    /// All four forms, for tests that check a property across every form.
    const ALL_FORMS: [Form; 4] = [Form::Nfc, Form::Nfd, Form::Nfkc, Form::Nfkd];

    /// An empty state paired with an empty output buffer.
    fn fresh() -> (NormState, String) {
        (NormState::new(), String::new())
    }

    /// A state preloaded with `starter` and `marks` (pushed in the given
    /// order), paired with an empty output buffer.
    fn primed(starter: Option<char>, marks: &[(char, u8)]) -> (NormState, String) {
        let (mut state, out) = fresh();
        state.current_starter = starter;
        for &(ch, ccc) in marks {
            state.ccc_buf.push(ch, ccc);
        }
        (state, out)
    }

    // ---- Latin-1 fast-path table -------------------------------------------

    #[test]
    fn latin1_table_matches_runtime_lookup_nfd() {
        for (cp, entry) in (0xC0u32..=0xFF).zip(LATIN1_NFD_TABLE.iter()) {
            let ch = char::from_u32(cp).unwrap();
            let source = String::from(ch);
            let general: Cow<'_, str> = normalize_impl(&source, Form::Nfd);

            // What the table claims the decomposition is.
            let fast: String = match *entry {
                (0, ..) => String::from(ch),
                (base, mark, _) => [base as char, char::from_u32(mark as u32).unwrap()]
                    .iter()
                    .collect(),
            };

            assert_eq!(
                &*general, fast,
                "NFD mismatch for U+{:04X}: trie={:?} table={:?}",
                cp, &*general, fast
            );
        }
    }

    #[test]
    fn latin1_table_matches_runtime_lookup_nfkd() {
        for cp in 0xC0u32..=0xFF {
            let source = String::from(char::from_u32(cp).unwrap());
            let nfd: Cow<'_, str> = normalize_impl(&source, Form::Nfd);
            let nfkd: Cow<'_, str> = normalize_impl(&source, Form::Nfkd);
            assert_eq!(
                &*nfd, &*nfkd,
                "NFD/NFKD diverge for U+{:04X}: nfd={:?} nfkd={:?}",
                cp, &*nfd, &*nfkd
            );
        }
    }

    // ---- Form helpers ------------------------------------------------------

    #[test]
    fn passthrough_bound_all_forms_return_0xc0() {
        for form in ALL_FORMS {
            assert_eq!(form.passthrough_bound(), 0xC0);
        }
    }

    #[test]
    fn composes_nfc_nfkc_true_nfd_nfkd_false() {
        for form in [Form::Nfc, Form::Nfkc] {
            assert!(form.composes());
        }
        for form in [Form::Nfd, Form::Nfkd] {
            assert!(!form.composes());
        }
    }

    #[test]
    fn decomp_form_canonical_vs_compatible() {
        for form in [Form::Nfc, Form::Nfd] {
            assert_eq!(form.decomp_form(), DecompForm::Canonical);
        }
        for form in [Form::Nfkc, Form::Nfkd] {
            assert_eq!(form.decomp_form(), DecompForm::Compatible);
        }
    }

    #[test]
    fn estimated_capacity_nfc_nfkc_same_nfd_nfkd_larger() {
        let input_len = 100;
        for form in [Form::Nfc, Form::Nfkc] {
            assert_eq!(form.estimated_capacity(input_len), 100);
        }
        for form in [Form::Nfd, Form::Nfkd] {
            assert_eq!(form.estimated_capacity(input_len), 150);
        }
    }

    #[test]
    fn estimated_capacity_zero_length() {
        assert_eq!(Form::Nfc.estimated_capacity(0), 0);
        assert_eq!(Form::Nfd.estimated_capacity(0), 0);
    }

    #[test]
    fn quick_check_ascii_is_yes_for_all_forms() {
        let ascii = "Hello, World!";
        for form in ALL_FORMS {
            assert_eq!(form.quick_check(ascii), quick_check::IsNormalized::Yes);
        }
    }

    // ---- NormState: feeding ------------------------------------------------

    #[test]
    fn normstate_new_has_no_starter_empty_ccc_buf() {
        let (state, _) = fresh();
        assert!(state.current_starter.is_none());
        assert!(state.ccc_buf.is_empty());
    }

    #[test]
    fn feed_entry_single_starter_sets_current_starter() {
        let (mut state, mut out) = fresh();
        state.feed_entry('A', 0, &mut out, false);
        assert_eq!(state.current_starter, Some('A'));
        assert!(state.ccc_buf.is_empty());
        // The starter stays pending until something flushes it.
        assert!(out.is_empty());
    }

    #[test]
    fn feed_entry_combining_mark_buffers_in_ccc_buf() {
        let (mut state, mut out) = fresh();
        state.feed_entry('e', 0, &mut out, false);
        state.feed_entry('\u{0301}', 230, &mut out, false);

        assert_eq!(state.current_starter, Some('e'));
        assert!(!state.ccc_buf.is_empty());
        assert_eq!(state.ccc_buf.len(), 1);

        let buffered = &state.ccc_buf.as_slice()[0];
        assert_eq!(buffered.ch, '\u{0301}');
        assert_eq!(buffered.ccc, 230);
    }

    #[test]
    fn feed_entry_two_starters_first_gets_flushed() {
        let (mut state, mut out) = fresh();
        state.feed_entry('A', 0, &mut out, false);
        assert!(out.is_empty());
        state.feed_entry('B', 0, &mut out, false);
        assert_eq!(out, "A");
        assert_eq!(state.current_starter, Some('B'));
    }

    #[test]
    fn feed_entry_starter_to_starter_composition_hangul_lv() {
        let (mut state, mut out) = fresh();
        // Leading consonant + vowel compose into a single LV syllable.
        state.feed_entry('\u{1100}', 0, &mut out, true);
        state.feed_entry('\u{1161}', 0, &mut out, true);
        assert_eq!(state.current_starter, Some('\u{AC00}'));
        assert!(out.is_empty());
    }

    #[test]
    fn feed_entry_starter_to_starter_composition_e_acute() {
        let (mut state, mut out) = fresh();
        state.feed_entry('e', 0, &mut out, true);
        state.feed_entry('\u{0301}', 230, &mut out, true);
        state.flush(&mut out, true);
        assert_eq!(out, "\u{00E9}");
    }

    #[test]
    fn feed_entry_nfd_starters_and_combining_marks() {
        let (mut state, mut out) = fresh();

        state.feed_entry_nfd('A', 0, &mut out);
        assert_eq!(state.current_starter, Some('A'));

        state.feed_entry_nfd('\u{0300}', 230, &mut out);
        assert_eq!(state.ccc_buf.len(), 1);

        // The next starter pushes out the previous run.
        state.feed_entry_nfd('B', 0, &mut out);
        assert_eq!(out, "A\u{0300}");
        assert_eq!(state.current_starter, Some('B'));
    }

    // ---- NormState: flush --------------------------------------------------

    #[test]
    fn flush_no_starter_no_marks_nothing_emitted() {
        let (mut state, mut out) = fresh();
        for composes in [false, true] {
            state.flush(&mut out, composes);
            assert!(out.is_empty());
        }
    }

    #[test]
    fn flush_starter_only_emits_starter() {
        let (mut state, mut out) = primed(Some('X'), &[]);
        state.flush(&mut out, false);
        assert_eq!(out, "X");
    }

    #[test]
    fn flush_starter_one_combining_mark_no_compose() {
        let (mut state, mut out) = primed(Some('e'), &[('\u{0301}', 230)]);
        state.flush(&mut out, false);
        assert_eq!(out, "e\u{0301}");
    }

    #[test]
    fn flush_starter_one_combining_mark_with_compose() {
        let (mut state, mut out) = primed(Some('e'), &[('\u{0301}', 230)]);
        state.flush(&mut out, true);
        assert_eq!(out, "\u{00E9}");
    }

    #[test]
    fn flush_starter_multiple_ccc_disordered_marks_emits_sorted() {
        let (mut state, mut out) = primed(
            Some('a'),
            &[('\u{0301}', 230), ('\u{0323}', 220), ('\u{0327}', 202)],
        );
        state.flush(&mut out, false);
        // Expected order: starter, then marks by ascending ccc (202, 220, 230).
        assert!(
            out.starts_with("a\u{0327}\u{0323}\u{0301}"),
            "unexpected output {:?}",
            out
        );
    }

    #[test]
    fn flush_orphan_combining_marks_no_starter_emits_sorted() {
        let (mut state, mut out) = primed(None, &[('\u{0301}', 230), ('\u{0327}', 202)]);
        state.flush(&mut out, false);
        assert_eq!(out, "\u{0327}\u{0301}");
    }

    // ---- NormState: flush_nfd ----------------------------------------------

    #[test]
    fn flush_nfd_no_starter_no_marks_nothing_emitted() {
        let (mut state, mut out) = fresh();
        state.flush_nfd(&mut out);
        assert!(out.is_empty());
    }

    #[test]
    fn flush_nfd_starter_only_emits_starter() {
        let (mut state, mut out) = primed(Some('Z'), &[]);
        state.flush_nfd(&mut out);
        assert_eq!(out, "Z");
    }

    #[test]
    fn flush_nfd_single_mark_fast_path_take_single_inline() {
        let (mut state, mut out) = primed(Some('e'), &[('\u{0301}', 230)]);
        state.flush_nfd(&mut out);
        assert_eq!(out, "e\u{0301}");
        // The single-mark path must leave the buffer empty.
        assert!(state.ccc_buf.is_empty());
    }

    #[test]
    fn flush_nfd_multiple_marks_sorted() {
        let (mut state, mut out) = primed(Some('o'), &[('\u{0301}', 230), ('\u{0327}', 202)]);
        state.flush_nfd(&mut out);
        assert!(
            out.starts_with("o\u{0327}\u{0301}"),
            "unexpected output {:?}",
            out
        );
    }

    #[test]
    fn flush_nfd_orphan_combining_marks_no_starter() {
        let (mut state, mut out) = primed(None, &[('\u{0301}', 230), ('\u{0323}', 220)]);
        state.flush_nfd(&mut out);
        assert_eq!(out, "\u{0323}\u{0301}");
    }

    // ---- normalize_impl: borrowed vs. owned --------------------------------

    #[test]
    fn normalize_impl_nfc_already_normalized_returns_borrowed() {
        // A-ring followed by a combining grave: stays as-is under NFC, but
        // the test expects the quick check alone not to settle it.
        let input = "\u{00C5}\u{0300}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfc_maybe_borrowed_simd_path() {
        // 60 ASCII bytes + two 2-byte code points = 64 bytes.
        let mut input = "a".repeat(60);
        input.push_str("\u{00C5}\u{0300}");
        assert!(input.len() >= 64, "input must be >= 64 bytes for SIMD path");

        let result = normalize_impl(&input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for >=64 byte already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, &*input);
    }

    #[test]
    fn normalize_impl_ascii_returns_borrowed() {
        let input = "Hello, world!";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfd_already_decomposed_returns_borrowed() {
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfd);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFD input"
        );
    }

    #[test]
    fn normalize_impl_nfc_not_normalized_returns_owned() {
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(&*result, "\u{00E9}");
    }

    // ---- is_cjk_unified boundaries -----------------------------------------

    #[test]
    fn cjk_unified_extension_a_start() {
        assert!(is_cjk_unified(0x3400));
    }

    #[test]
    fn cjk_unified_extension_a_end() {
        assert!(is_cjk_unified(0x4DBF));
    }

    #[test]
    fn cjk_unified_main_start() {
        assert!(is_cjk_unified(0x4E00));
    }

    #[test]
    fn cjk_unified_main_end() {
        assert!(is_cjk_unified(0x9FFF));
    }

    #[test]
    fn cjk_unified_just_before_extension_a() {
        assert!(!is_cjk_unified(0x33FF));
    }

    #[test]
    fn cjk_unified_gap_between_extension_a_and_main() {
        assert!(!is_cjk_unified(0x4DC0));
    }

    #[test]
    fn cjk_unified_just_after_main() {
        assert!(!is_cjk_unified(0xA000));
    }
}