1use alloc::borrow::Cow;
8use alloc::string::String;
9
10use crate::ccc::CccBuffer;
11use crate::compose;
12use crate::decompose::{self, DecompForm};
13use crate::hangul;
14use crate::quick_check;
15use crate::simd;
16use crate::simd::prefetch;
17use crate::tables;
18use crate::utf8;
19
/// The four Unicode normalization forms this module can produce.
///
/// `Hash` is derived alongside the comparison traits so a form can be used
/// directly as a key in hashed collections (for example per-form caches).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Form {
    /// Canonical decomposition followed by canonical composition.
    Nfc,
    /// Canonical decomposition.
    Nfd,
    /// Compatibility decomposition followed by canonical composition.
    Nfkc,
    /// Compatibility decomposition.
    Nfkd,
}
36
37impl Form {
38 #[inline]
47 fn passthrough_bound(self) -> u8 {
48 match self {
49 Form::Nfc | Form::Nfkc => 0xC0,
50 Form::Nfd | Form::Nfkd => 0xC0,
51 }
52 }
53
54 #[inline]
56 fn composes(self) -> bool {
57 matches!(self, Form::Nfc | Form::Nfkc)
58 }
59
60 #[inline]
62 fn decomp_form(self) -> DecompForm {
63 match self {
64 Form::Nfc | Form::Nfd => DecompForm::Canonical,
65 Form::Nfkc | Form::Nfkd => DecompForm::Compatible,
66 }
67 }
68
69 #[inline]
71 fn estimated_capacity(self, input_len: usize) -> usize {
72 match self {
73 Form::Nfc | Form::Nfkc => input_len,
74 Form::Nfd | Form::Nfkd => input_len + input_len / 2,
75 }
76 }
77
78 #[inline]
80 fn quick_check(self, input: &str) -> quick_check::IsNormalized {
81 match self {
82 Form::Nfc => quick_check::quick_check_nfc(input),
83 Form::Nfd => quick_check::quick_check_nfd(input),
84 Form::Nfkc => quick_check::quick_check_nfkc(input),
85 Form::Nfkd => quick_check::quick_check_nfkd(input),
86 }
87 }
88}
89
90struct NormState {
95 current_starter: Option<char>,
97 ccc_buf: CccBuffer,
99}
100
101impl NormState {
102 #[inline]
103 fn new() -> Self {
104 NormState {
105 current_starter: None,
106 ccc_buf: CccBuffer::new(),
107 }
108 }
109
110 #[inline]
114 fn flush(&mut self, out: &mut String, composes: bool) {
115 let starter = match self.current_starter.take() {
116 Some(s) => s,
117 None => {
118 if !self.ccc_buf.is_empty() {
120 self.ccc_buf.sort_in_place();
121 for entry in self.ccc_buf.as_slice() {
122 out.push(entry.ch);
123 }
124 self.ccc_buf.clear();
125 }
126 return;
127 },
128 };
129
130 if self.ccc_buf.is_empty() {
131 out.push(starter);
133 return;
134 }
135
136 self.ccc_buf.sort_in_place();
138
139 if composes {
140 compose::compose_combining_sequence_into(starter, self.ccc_buf.as_slice(), out);
141 } else {
142 out.push(starter);
144 for entry in self.ccc_buf.as_slice() {
145 out.push(entry.ch);
146 }
147 }
148 self.ccc_buf.clear();
149 }
150
151 #[inline]
157 fn feed_entry(&mut self, ch: char, ccc: u8, out: &mut String, composes: bool) {
158 if ccc == 0 {
159 if composes && self.ccc_buf.is_empty() {
161 if let Some(prev) = self.current_starter
163 && let Some(composed) = compose::compose(prev, ch)
164 {
165 self.current_starter = Some(composed);
166 return;
167 }
168 }
169 self.flush(out, composes);
171 self.current_starter = Some(ch);
172 } else {
173 self.ccc_buf.push(ch, ccc);
175 }
176 }
177
178 #[inline]
180 fn flush_nfd(&mut self, out: &mut String) {
181 let starter = match self.current_starter.take() {
182 Some(s) => s,
183 None => {
184 if !self.ccc_buf.is_empty() {
185 self.ccc_buf.sort_in_place();
186 for entry in self.ccc_buf.as_slice() {
187 out.push(entry.ch);
188 }
189 self.ccc_buf.clear();
190 }
191 return;
192 },
193 };
194
195 if let Some(entry) = self.ccc_buf.take_single_inline() {
198 out.push(starter);
199 out.push(entry.ch);
200 return;
201 }
202
203 if self.ccc_buf.is_empty() {
204 out.push(starter);
205 return;
206 }
207
208 self.ccc_buf.sort_in_place();
210 out.push(starter);
211 for entry in self.ccc_buf.as_slice() {
212 out.push(entry.ch);
213 }
214 self.ccc_buf.clear();
215 }
216
217 #[inline]
219 fn feed_entry_nfd(&mut self, ch: char, ccc: u8, out: &mut String) {
220 if ccc == 0 {
221 self.flush_nfd(out);
222 self.current_starter = Some(ch);
223 } else {
224 self.ccc_buf.push(ch, ccc);
225 }
226 }
227}
228
/// Returns `true` when `cp` lies in U+3400..=U+4DBF or U+4E00..=U+9FFF.
///
/// Callers treat such code points as plain starters and skip the table lookup.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
239
/// Returns `true` for code points the decomposition path may skip without a
/// table lookup.
///
/// Accepted ranges: U+1F252..=U+1FBEF, and everything from U+20000 upward
/// except U+2F800..=U+2FA1F. Anything below U+1F252 is rejected.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        // Carved out of the otherwise-accepted upper range.
        0x2F800..=0x2FA1F => false,
        0x20000..=u32::MAX => true,
        0x1F252..=0x1FBEF => true,
        _ => false,
    }
}
253
254#[inline]
259fn process_char(
260 ch: char,
261 state: &mut NormState,
262 out: &mut String,
263 form: Form,
264 decomp_buf: &mut CccBuffer,
265) {
266 let cp = ch as u32;
267
268 if cp >= 0x3400 && is_cjk_unified(cp) {
271 state.flush(out, form.composes());
272 state.current_starter = Some(ch);
273 return;
274 }
275
276 if hangul::is_hangul_syllable(ch) {
278 let (l, v, t) = hangul::decompose_hangul(ch);
279 state.feed_entry(l, 0, out, form.composes());
280 state.feed_entry(v, 0, out, form.composes());
281 if let Some(t_char) = t {
282 state.feed_entry(t_char, 0, out, form.composes());
283 }
284 return;
285 }
286
287 let trie_value = tables::raw_decomp_trie_value(ch, form.decomp_form());
289
290 if !tables::has_decomposition(trie_value) {
294 let ccc = tables::ccc_from_trie_value(trie_value);
295 state.feed_entry(ch, ccc, out, form.composes());
296 return;
297 }
298
299 decomp_buf.clear();
301 decompose::decompose_from_trie_value(ch, trie_value, decomp_buf, form.decomp_form());
302 for entry in decomp_buf.as_slice() {
303 state.feed_entry(entry.ch, entry.ccc, out, form.composes());
304 }
305}
306
307#[allow(dead_code)]
313#[inline(always)]
314fn process_from_trie(
315 ch: char,
316 tv: u32,
317 state: &mut NormState,
318 out: &mut String,
319 form: Form,
320 decomp_buf: &mut CccBuffer,
321) {
322 if !tables::has_decomposition(tv) {
323 let ccc = tables::ccc_from_trie_value(tv);
324 state.feed_entry(ch, ccc, out, form.composes());
325 } else {
326 decomp_buf.clear();
327 decompose::decompose_from_trie_value(ch, tv, decomp_buf, form.decomp_form());
328 for entry in decomp_buf.as_slice() {
329 state.feed_entry(entry.ch, entry.ccc, out, form.composes());
330 }
331 }
332}
333
334#[inline(always)]
338fn process_from_trie_nfd(
339 ch: char,
340 tv: u32,
341 state: &mut NormState,
342 out: &mut String,
343 decomp_form: DecompForm,
344) {
345 if !tables::has_decomposition(tv) {
346 let ccc = tables::ccc_from_trie_value(tv);
348 state.feed_entry_nfd(ch, ccc, out);
349 return;
350 }
351
352 if let Some(data) = tables::expansion_data_from_trie_value(tv, decomp_form) {
354 if data.len() == 2 {
358 let e0 = data[0];
359 let ccc0 = (e0 >> tables::EXPANSION_CCC_SHIFT) as u8;
360 if ccc0 == 0 {
361 state.flush_nfd(out);
363 let cp0 = e0 & tables::EXPANSION_CP_MASK;
364 debug_assert!(cp0 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp0));
365 state.current_starter = Some(unsafe { char::from_u32_unchecked(cp0) });
366 let e1 = data[1];
368 let cp1 = e1 & tables::EXPANSION_CP_MASK;
369 let ccc1 = (e1 >> tables::EXPANSION_CCC_SHIFT) as u8;
370 debug_assert!(cp1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp1));
371 let ch1 = unsafe { char::from_u32_unchecked(cp1) };
372 if ccc1 != 0 {
373 state.ccc_buf.push(ch1, ccc1);
374 } else {
375 state.feed_entry_nfd(ch1, 0, out);
377 }
378 return;
379 }
380 }
381 for &entry in data {
383 let cp = entry & tables::EXPANSION_CP_MASK;
384 let ccc = (entry >> tables::EXPANSION_CCC_SHIFT) as u8;
385 debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
386 let exp_ch = unsafe { char::from_u32_unchecked(cp) };
387 state.feed_entry_nfd(exp_ch, ccc, out);
388 }
389 return;
390 }
391
392 let info = tv & 0xFFFF;
394 debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
395 let decomposed = unsafe { char::from_u32_unchecked(info) };
396 let ccc = if info <= 0x7F {
397 0
398 } else {
399 tables::lookup_ccc(decomposed)
400 };
401 state.feed_entry_nfd(decomposed, ccc, out);
402}
403
404#[inline(always)]
410fn flush_compose_passthrough(
411 pass: &str,
412 ch: char,
413 form: Form,
414 state: &mut NormState,
415 out: &mut String,
416) {
417 let cp = ch as u32;
418 let next_tv = if cp >= 0x10000 {
421 unsafe { tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form()) }
422 } else {
423 tables::raw_decomp_trie_value(ch, form.decomp_form())
424 };
425 if tables::needs_starter_shadow(next_tv) {
426 let n = pass.len();
427 if n > 1 {
428 out.push_str(&pass[..n - 1]);
429 }
430 let last_ch = pass.as_bytes()[n - 1] as char;
431 state.feed_entry(last_ch, 0, out, true);
432 } else {
433 out.push_str(pass);
434 }
435}
436
437fn normalize_scalar<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
443 if input.is_empty() {
444 return Cow::Borrowed(input);
445 }
446
447 if form.quick_check(input) == quick_check::IsNormalized::Yes {
449 return Cow::Borrowed(input);
450 }
451
452 let mut out = String::with_capacity(input.len());
453 let mut state = NormState::new();
454 let mut decomp_buf = CccBuffer::new();
455
456 for ch in input.chars() {
457 process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
458 }
459
460 state.flush(&mut out, form.composes());
462
463 if out == input {
464 Cow::Borrowed(input)
465 } else {
466 Cow::Owned(out)
467 }
468}
469
470fn normalize_impl<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
479 let bytes = input.as_bytes();
480 let len = bytes.len();
481
482 if len < 64 {
484 return normalize_scalar(input, form);
485 }
486
487 let qc = form.quick_check(input);
489 if qc == quick_check::IsNormalized::Yes {
490 return Cow::Borrowed(input);
491 }
492
493 let bound = form.passthrough_bound();
495 let composes = form.composes();
496 let mut out = String::with_capacity(form.estimated_capacity(len));
497 let mut last_written: usize = 0;
498 let mut state = NormState::new();
499 let mut decomp_buf = CccBuffer::new();
500
501 let mut pos: usize = 0;
502 let ptr = bytes.as_ptr();
503
504 while pos + 64 <= len {
506 let chunk_start = pos;
507
508 let mask = unsafe {
512 let prefetch_l1 =
513 ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
514 let prefetch_l2 =
515 ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
516 simd::scan_and_prefetch(ptr.add(pos), prefetch_l1, prefetch_l2, bound)
517 };
518
519 unsafe {
524 let write_head = out.len();
525 let distance = prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE;
526 if write_head + distance <= out.capacity() {
527 prefetch::prefetch_write(out.as_ptr().wrapping_add(write_head + distance));
528 }
529 }
530
531 if mask == 0 {
532 pos += 64;
534 continue;
535 }
536
537 let mut chunk_mask = mask;
539 while chunk_mask != 0 {
540 let bit_pos = chunk_mask.trailing_zeros() as usize;
541 chunk_mask &= chunk_mask.wrapping_sub(1); let byte_pos = chunk_start + bit_pos;
544
545 if byte_pos < last_written {
547 continue;
548 }
549
550 if utf8::is_continuation_byte(bytes[byte_pos]) {
553 continue;
554 }
555
556 let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
558
559 if !composes {
564 let cp = ch as u32;
565 if (cp >= 0x3400 && is_cjk_unified(cp)) || (cp >= 0x10000 && is_supp_safe(cp)) {
567 continue;
568 }
569 if hangul::is_hangul_syllable(ch) {
572 if byte_pos > last_written {
573 state.flush_nfd(&mut out);
574 out.push_str(&input[last_written..byte_pos]);
575 }
576 last_written = byte_pos + width;
577 state.flush_nfd(&mut out);
578 let (l, v, t) = hangul::decompose_hangul(ch);
579 out.push(l);
580 out.push(v);
581 if let Some(t_char) = t {
582 out.push(t_char);
583 }
584 continue;
585 }
586 let tv = if cp >= 0x10000 {
590 unsafe { tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form()) }
592 } else {
593 tables::raw_decomp_trie_value(ch, form.decomp_form())
594 };
595 if !tables::has_decomposition(tv) && tables::ccc_from_trie_value(tv) == 0 {
596 continue; }
598 if byte_pos > last_written {
600 state.flush_nfd(&mut out);
601 out.push_str(&input[last_written..byte_pos]);
602 }
603 last_written = byte_pos + width;
604 process_from_trie_nfd(ch, tv, &mut state, &mut out, form.decomp_form());
605 continue;
606 }
607
608 if byte_pos > last_written {
617 state.flush(&mut out, composes);
618 let pass = &input[last_written..byte_pos];
619 if composes {
620 flush_compose_passthrough(pass, ch, form, &mut state, &mut out);
621 } else {
622 out.push_str(pass);
623 }
624 }
625
626 last_written = byte_pos + width;
627
628 process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
630 }
631
632 pos += 64;
633 }
634
635 if pos < len {
637 let tail_has_work = bytes[pos..].iter().any(|&b| b >= bound);
639
640 if tail_has_work {
641 let mut tail_pos = pos;
643 while tail_pos < len {
644 if tail_pos < last_written {
645 tail_pos += 1;
646 continue;
647 }
648
649 if utf8::is_continuation_byte(bytes[tail_pos]) {
650 tail_pos += 1;
651 continue;
652 }
653
654 let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
655
656 if !composes {
658 let cp = ch as u32;
659 if (cp >= 0x3400 && is_cjk_unified(cp)) || (cp >= 0x10000 && is_supp_safe(cp)) {
660 tail_pos += width;
661 continue;
662 }
663 if hangul::is_hangul_syllable(ch) {
665 if tail_pos > last_written {
666 state.flush_nfd(&mut out);
667 out.push_str(&input[last_written..tail_pos]);
668 }
669 last_written = tail_pos + width;
670 state.flush_nfd(&mut out);
671 let (l, v, t) = hangul::decompose_hangul(ch);
672 out.push(l);
673 out.push(v);
674 if let Some(t_char) = t {
675 out.push(t_char);
676 }
677 tail_pos += width;
678 continue;
679 }
680 let tv = if cp >= 0x10000 {
681 unsafe {
683 tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form())
684 }
685 } else {
686 tables::raw_decomp_trie_value(ch, form.decomp_form())
687 };
688 if !tables::has_decomposition(tv) && tables::ccc_from_trie_value(tv) == 0 {
689 tail_pos += width;
690 continue;
691 }
692 if tail_pos > last_written {
694 state.flush_nfd(&mut out);
695 out.push_str(&input[last_written..tail_pos]);
696 }
697 last_written = tail_pos + width;
698 process_from_trie_nfd(ch, tv, &mut state, &mut out, form.decomp_form());
699 tail_pos += width;
700 continue;
701 }
702
703 if tail_pos > last_written {
705 state.flush(&mut out, composes);
706 let pass = &input[last_written..tail_pos];
707 if composes {
708 flush_compose_passthrough(pass, ch, form, &mut state, &mut out);
709 } else {
710 out.push_str(pass);
711 }
712 }
713
714 last_written = tail_pos + width;
715
716 process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
717
718 tail_pos += width;
719 }
720 }
721 }
722
723 if composes {
725 state.flush(&mut out, true);
726 } else {
727 state.flush_nfd(&mut out);
728 }
729
730 if last_written < len {
732 out.push_str(&input[last_written..len]);
733 }
734
735 if qc == quick_check::IsNormalized::Maybe && out == input {
738 Cow::Borrowed(input)
739 } else {
740 Cow::Owned(out)
741 }
742}
743
/// Normalizer that produces NFC. Zero-sized; carries no configuration.
#[derive(Debug, Clone, Copy)]
pub struct NfcNormalizer;

/// Normalizer that produces NFD. Zero-sized; carries no configuration.
#[derive(Debug, Clone, Copy)]
pub struct NfdNormalizer;

/// Normalizer that produces NFKC. Zero-sized; carries no configuration.
#[derive(Debug, Clone, Copy)]
pub struct NfkcNormalizer;

/// Normalizer that produces NFKD. Zero-sized; carries no configuration.
#[derive(Debug, Clone, Copy)]
pub struct NfkdNormalizer;
759
760impl Default for NfcNormalizer {
761 fn default() -> Self {
762 Self::new()
763 }
764}
765
766impl Default for NfdNormalizer {
767 fn default() -> Self {
768 Self::new()
769 }
770}
771
772impl Default for NfkcNormalizer {
773 fn default() -> Self {
774 Self::new()
775 }
776}
777
778impl Default for NfkdNormalizer {
779 fn default() -> Self {
780 Self::new()
781 }
782}
783
784impl NfcNormalizer {
785 pub fn new() -> Self {
787 NfcNormalizer
788 }
789
790 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
792 quick_check::quick_check_nfc(input)
793 }
794
795 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
799 normalize_impl(input, Form::Nfc)
800 }
801
802 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
806 let result = normalize_impl(input, Form::Nfc);
807 let already_normalized = matches!(&result, Cow::Borrowed(_));
808 out.push_str(&result);
809 already_normalized
810 }
811
812 pub fn is_normalized(&self, input: &str) -> bool {
814 quick_check::is_normalized_nfc(input)
815 }
816}
817
818impl NfdNormalizer {
819 pub fn new() -> Self {
821 NfdNormalizer
822 }
823
824 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
826 quick_check::quick_check_nfd(input)
827 }
828
829 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
833 normalize_impl(input, Form::Nfd)
834 }
835
836 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
840 let result = normalize_impl(input, Form::Nfd);
841 let already_normalized = matches!(&result, Cow::Borrowed(_));
842 out.push_str(&result);
843 already_normalized
844 }
845
846 pub fn is_normalized(&self, input: &str) -> bool {
848 quick_check::is_normalized_nfd(input)
849 }
850}
851
852impl NfkcNormalizer {
853 pub fn new() -> Self {
855 NfkcNormalizer
856 }
857
858 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
860 quick_check::quick_check_nfkc(input)
861 }
862
863 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
867 normalize_impl(input, Form::Nfkc)
868 }
869
870 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
874 let result = normalize_impl(input, Form::Nfkc);
875 let already_normalized = matches!(&result, Cow::Borrowed(_));
876 out.push_str(&result);
877 already_normalized
878 }
879
880 pub fn is_normalized(&self, input: &str) -> bool {
882 quick_check::is_normalized_nfkc(input)
883 }
884}
885
886impl NfkdNormalizer {
887 pub fn new() -> Self {
889 NfkdNormalizer
890 }
891
892 pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
894 quick_check::quick_check_nfkd(input)
895 }
896
897 pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
901 normalize_impl(input, Form::Nfkd)
902 }
903
904 pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
908 let result = normalize_impl(input, Form::Nfkd);
909 let already_normalized = matches!(&result, Cow::Borrowed(_));
910 out.push_str(&result);
911 already_normalized
912 }
913
914 pub fn is_normalized(&self, input: &str) -> bool {
916 quick_check::is_normalized_nfkd(input)
917 }
918}
919
// Unit tests for the private building blocks of this module: the `Form`
// helpers, the `NormState` accumulator, the borrow/own decision of
// `normalize_impl`, and the code point range predicates.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::borrow::Cow;
    use alloc::string::String;
    use alloc::vec::Vec;

    // ---- Form helpers ----

    // Every form uses the same scan threshold.
    #[test]
    fn passthrough_bound_all_forms_return_0xc0() {
        assert_eq!(Form::Nfc.passthrough_bound(), 0xC0);
        assert_eq!(Form::Nfd.passthrough_bound(), 0xC0);
        assert_eq!(Form::Nfkc.passthrough_bound(), 0xC0);
        assert_eq!(Form::Nfkd.passthrough_bound(), 0xC0);
    }

    // Only the "C" forms run the composition step.
    #[test]
    fn composes_nfc_nfkc_true_nfd_nfkd_false() {
        assert!(Form::Nfc.composes());
        assert!(Form::Nfkc.composes());
        assert!(!Form::Nfd.composes());
        assert!(!Form::Nfkd.composes());
    }

    // The "K" forms use the compatibility mapping, the others the canonical one.
    #[test]
    fn decomp_form_canonical_vs_compatible() {
        assert_eq!(Form::Nfc.decomp_form(), DecompForm::Canonical);
        assert_eq!(Form::Nfd.decomp_form(), DecompForm::Canonical);
        assert_eq!(Form::Nfkc.decomp_form(), DecompForm::Compatible);
        assert_eq!(Form::Nfkd.decomp_form(), DecompForm::Compatible);
    }

    // Composing forms reserve the input length, decomposing forms 1.5x.
    #[test]
    fn estimated_capacity_nfc_nfkc_same_nfd_nfkd_larger() {
        let input_len = 100;
        assert_eq!(Form::Nfc.estimated_capacity(input_len), 100);
        assert_eq!(Form::Nfkc.estimated_capacity(input_len), 100);
        assert_eq!(Form::Nfd.estimated_capacity(input_len), 150);
        assert_eq!(Form::Nfkd.estimated_capacity(input_len), 150);
    }

    #[test]
    fn estimated_capacity_zero_length() {
        assert_eq!(Form::Nfc.estimated_capacity(0), 0);
        assert_eq!(Form::Nfd.estimated_capacity(0), 0);
    }

    // Pure ASCII is reported as normalized by the quick check of every form.
    #[test]
    fn quick_check_ascii_is_yes_for_all_forms() {
        let ascii = "Hello, World!";
        assert_eq!(Form::Nfc.quick_check(ascii), quick_check::IsNormalized::Yes);
        assert_eq!(Form::Nfd.quick_check(ascii), quick_check::IsNormalized::Yes);
        assert_eq!(
            Form::Nfkc.quick_check(ascii),
            quick_check::IsNormalized::Yes
        );
        assert_eq!(
            Form::Nfkd.quick_check(ascii),
            quick_check::IsNormalized::Yes
        );
    }

    // ---- NormState::feed_entry / feed_entry_nfd ----

    #[test]
    fn normstate_new_has_no_starter_empty_ccc_buf() {
        let state = NormState::new();
        assert!(state.current_starter.is_none());
        assert!(state.ccc_buf.is_empty());
    }

    // A starter is held back in the state; nothing is written yet.
    #[test]
    fn feed_entry_single_starter_sets_current_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.feed_entry('A', 0, &mut out, false);
        assert_eq!(state.current_starter, Some('A'));
        assert!(state.ccc_buf.is_empty());
        assert!(out.is_empty());
    }

    // A character with non-zero CCC is buffered behind the pending starter.
    #[test]
    fn feed_entry_combining_mark_buffers_in_ccc_buf() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.feed_entry('e', 0, &mut out, false);
        state.feed_entry('\u{0301}', 230, &mut out, false);
        assert_eq!(state.current_starter, Some('e'));
        assert!(!state.ccc_buf.is_empty());
        assert_eq!(state.ccc_buf.len(), 1);
        assert_eq!(state.ccc_buf.as_slice()[0].ch, '\u{0301}');
        assert_eq!(state.ccc_buf.as_slice()[0].ccc, 230);
    }

    // Feeding a second starter writes the first one out.
    #[test]
    fn feed_entry_two_starters_first_gets_flushed() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.feed_entry('A', 0, &mut out, false);
        assert!(out.is_empty());
        state.feed_entry('B', 0, &mut out, false);
        assert_eq!(out, "A");
        assert_eq!(state.current_starter, Some('B'));
    }

    // Two starters that compose (Hangul L + V) are merged inside the state
    // without anything being written.
    #[test]
    fn feed_entry_starter_to_starter_composition_hangul_lv() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.feed_entry('\u{1100}', 0, &mut out, true);
        state.feed_entry('\u{1161}', 0, &mut out, true);
        assert_eq!(state.current_starter, Some('\u{AC00}'));
        assert!(out.is_empty());
    }

    // NOTE(review): despite the name, this exercises starter + combining mark
    // (U+0301 is fed with CCC 230) composed at flush time, not the
    // starter-to-starter path.
    #[test]
    fn feed_entry_starter_to_starter_composition_e_acute() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.feed_entry('e', 0, &mut out, true);
        state.feed_entry('\u{0301}', 230, &mut out, true);
        state.flush(&mut out, true);
        assert_eq!(out, "\u{00E9}");
    }

    // The decomposition-only feeder buffers marks and flushes on the next starter.
    #[test]
    fn feed_entry_nfd_starters_and_combining_marks() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.feed_entry_nfd('A', 0, &mut out);
        assert_eq!(state.current_starter, Some('A'));
        state.feed_entry_nfd('\u{0300}', 230, &mut out);
        assert_eq!(state.ccc_buf.len(), 1);
        state.feed_entry_nfd('B', 0, &mut out);
        assert_eq!(out, "A\u{0300}");
        assert_eq!(state.current_starter, Some('B'));
    }

    // ---- NormState::flush ----

    // Flushing an empty state writes nothing, with or without composition.
    #[test]
    fn flush_no_starter_no_marks_nothing_emitted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.flush(&mut out, false);
        assert!(out.is_empty());
        state.flush(&mut out, true);
        assert!(out.is_empty());
    }

    #[test]
    fn flush_starter_only_emits_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('X');
        state.flush(&mut out, false);
        assert_eq!(out, "X");
    }

    // Without composition the starter and its mark are written as they are.
    #[test]
    fn flush_starter_one_combining_mark_no_compose() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230);
        state.flush(&mut out, false);
        assert_eq!(out, "e\u{0301}");
    }

    // With composition the pair collapses into the precomposed character.
    #[test]
    fn flush_starter_one_combining_mark_with_compose() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230);
        state.flush(&mut out, true);
        assert_eq!(out, "\u{00E9}");
    }

    // Marks pushed in descending CCC order (230, 220, 202) come out ascending.
    #[test]
    fn flush_starter_multiple_ccc_disordered_marks_emits_sorted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('a');
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0323}', 220);
        state.ccc_buf.push('\u{0327}', 202);
        state.flush(&mut out, false);
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars[0], 'a');
        assert_eq!(chars[1], '\u{0327}');
        assert_eq!(chars[2], '\u{0323}');
        assert_eq!(chars[3], '\u{0301}');
    }

    // Marks without a starter are still sorted and written.
    #[test]
    fn flush_orphan_combining_marks_no_starter_emits_sorted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0327}', 202);
        state.flush(&mut out, false);
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0], '\u{0327}');
        assert_eq!(chars[1], '\u{0301}');
    }

    // ---- NormState::flush_nfd ----

    #[test]
    fn flush_nfd_no_starter_no_marks_nothing_emitted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.flush_nfd(&mut out);
        assert!(out.is_empty());
    }

    #[test]
    fn flush_nfd_starter_only_emits_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('Z');
        state.flush_nfd(&mut out);
        assert_eq!(out, "Z");
    }

    // One buffered mark: per the test name this goes through the
    // `take_single_inline` branch; the buffer must be empty afterwards.
    #[test]
    fn flush_nfd_single_mark_fast_path_take_single_inline() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230);
        state.flush_nfd(&mut out);
        assert_eq!(out, "e\u{0301}");
        assert!(state.ccc_buf.is_empty());
    }

    // Two marks pushed as 230 then 202 are written as 202 then 230.
    #[test]
    fn flush_nfd_multiple_marks_sorted() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.current_starter = Some('o');
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0327}', 202);
        state.flush_nfd(&mut out);
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars[0], 'o');
        assert_eq!(chars[1], '\u{0327}');
        assert_eq!(chars[2], '\u{0301}');
    }

    #[test]
    fn flush_nfd_orphan_combining_marks_no_starter() {
        let mut state = NormState::new();
        let mut out = String::new();
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0323}', 220);
        state.flush_nfd(&mut out);
        let chars: Vec<char> = out.chars().collect();
        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0], '\u{0323}');
        assert_eq!(chars[1], '\u{0301}');
    }

    // ---- normalize_impl: borrowed vs. owned result ----

    // Input shorter than 64 bytes, so `normalize_impl` delegates to
    // `normalize_scalar`; the unchanged result must come back as a borrow.
    #[test]
    fn normalize_impl_nfc_already_normalized_returns_borrowed() {
        let input = "\u{00C5}\u{0300}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, input);
    }

    // Same content padded with 60 ASCII bytes to reach 64 bytes, which takes
    // the chunked path of `normalize_impl`.
    #[test]
    fn normalize_impl_nfc_maybe_borrowed_simd_path() {
        let mut input = String::new();
        input.push_str(&"a".repeat(60));
        input.push_str("\u{00C5}\u{0300}");
        assert!(input.len() >= 64, "input must be >= 64 bytes for SIMD path");
        let result = normalize_impl(&input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for >=64 byte already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, &*input);
    }

    #[test]
    fn normalize_impl_ascii_returns_borrowed() {
        let input = "Hello, world!";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(&*result, input);
    }

    // Only the borrow is checked here, not the content.
    #[test]
    fn normalize_impl_nfd_already_decomposed_returns_borrowed() {
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfd);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFD input"
        );
    }

    // Decomposed input must be composed, which forces an owned result.
    #[test]
    fn normalize_impl_nfc_not_normalized_returns_owned() {
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(&*result, "\u{00E9}");
    }

    // ---- is_cjk_unified: range boundaries ----

    #[test]
    fn cjk_unified_extension_a_start() {
        assert!(is_cjk_unified(0x3400));
    }

    #[test]
    fn cjk_unified_extension_a_end() {
        assert!(is_cjk_unified(0x4DBF));
    }

    #[test]
    fn cjk_unified_main_start() {
        assert!(is_cjk_unified(0x4E00));
    }

    #[test]
    fn cjk_unified_main_end() {
        assert!(is_cjk_unified(0x9FFF));
    }

    #[test]
    fn cjk_unified_just_before_extension_a() {
        assert!(!is_cjk_unified(0x33FF));
    }

    #[test]
    fn cjk_unified_gap_between_extension_a_and_main() {
        assert!(!is_cjk_unified(0x4DC0));
    }

    #[test]
    fn cjk_unified_just_after_main() {
        assert!(!is_cjk_unified(0xA000));
    }

    // ---- is_supp_safe: range boundaries ----

    #[test]
    fn supp_safe_plane2_start() {
        assert!(is_supp_safe(0x20000));
    }

    #[test]
    fn supp_safe_cjk_compat_supplement_start() {
        assert!(!is_supp_safe(0x2F800));
    }

    #[test]
    fn supp_safe_cjk_compat_supplement_end() {
        assert!(!is_supp_safe(0x2FA1F));
    }

    #[test]
    fn supp_safe_just_after_compat_supplement() {
        assert!(is_supp_safe(0x2FA20));
    }

    #[test]
    fn supp_safe_plane1_safe_range_start() {
        assert!(is_supp_safe(0x1F252));
    }

    #[test]
    fn supp_safe_plane1_safe_range_end() {
        assert!(is_supp_safe(0x1FBEF));
    }

    #[test]
    fn supp_safe_just_before_plane1_safe_range() {
        assert!(!is_supp_safe(0x1F251));
    }

    #[test]
    fn supp_safe_just_after_plane1_safe_range() {
        assert!(!is_supp_safe(0x1FBF0));
    }

    #[test]
    fn supp_safe_smp_start_before_safe_range() {
        assert!(!is_supp_safe(0x10000));
    }
}