simd-normalizer 0.1.1

SIMD-accelerated Unicode normalization (NFC, NFD, NFKC, NFKD)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
// tests/ccc_ordering_edge_cases.rs
//
// Edge case tests focused on Canonical Combining Class (CCC) ordering.
//
// The CCC sort must be *stable*: combining marks with the same CCC value
// must preserve their relative order from the input. The library uses an
// inline buffer (CccBuffer) for up to 18 entries and overflows to Vec for
// longer sequences; both paths must produce identical, correct results.
//
// Every test cross-validates against ICU4X (icu_normalizer) as the
// reference implementation.

use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};

// ---------------------------------------------------------------------------
// ICU4X reference helpers
// ---------------------------------------------------------------------------

fn icu_nfc(s: &str) -> String {
    ComposingNormalizerBorrowed::new_nfc()
        .normalize(s)
        .into_owned()
}

fn icu_nfd(s: &str) -> String {
    DecomposingNormalizerBorrowed::new_nfd()
        .normalize(s)
        .into_owned()
}

fn icu_nfkc(s: &str) -> String {
    ComposingNormalizerBorrowed::new_nfkc()
        .normalize(s)
        .into_owned()
}

fn icu_nfkd(s: &str) -> String {
    DecomposingNormalizerBorrowed::new_nfkd()
        .normalize(s)
        .into_owned()
}

// ---------------------------------------------------------------------------
// simd-normalizer helpers
// ---------------------------------------------------------------------------

fn our_nfc(s: &str) -> String {
    simd_normalizer::nfc().normalize(s).into_owned()
}

fn our_nfd(s: &str) -> String {
    simd_normalizer::nfd().normalize(s).into_owned()
}

fn our_nfkc(s: &str) -> String {
    simd_normalizer::nfkc().normalize(s).into_owned()
}

fn our_nfkd(s: &str) -> String {
    simd_normalizer::nfkd().normalize(s).into_owned()
}

// ---------------------------------------------------------------------------
// Formatting helper for diagnostics
// ---------------------------------------------------------------------------

fn codepoints(s: &str) -> String {
    s.chars()
        .map(|c| format!("U+{:04X}", c as u32))
        .collect::<Vec<_>>()
        .join(" ")
}

// ---------------------------------------------------------------------------
// Assertion helpers: compare simd-normalizer output against ICU4X for all forms
// ---------------------------------------------------------------------------

fn assert_nfd_matches_icu(label: &str, input: &str) {
    let ours = our_nfd(input);
    let reference = icu_nfd(input);
    assert_eq!(
        ours,
        reference,
        "NFD mismatch [{label}]\n  input: {}\n  ours:  {}\n  icu:   {}",
        codepoints(input),
        codepoints(&ours),
        codepoints(&reference),
    );
}

fn assert_nfc_matches_icu(label: &str, input: &str) {
    let ours = our_nfc(input);
    let reference = icu_nfc(input);
    assert_eq!(
        ours,
        reference,
        "NFC mismatch [{label}]\n  input: {}\n  ours:  {}\n  icu:   {}",
        codepoints(input),
        codepoints(&ours),
        codepoints(&reference),
    );
}

fn assert_nfkd_matches_icu(label: &str, input: &str) {
    let ours = our_nfkd(input);
    let reference = icu_nfkd(input);
    assert_eq!(
        ours,
        reference,
        "NFKD mismatch [{label}]\n  input: {}\n  ours:  {}\n  icu:   {}",
        codepoints(input),
        codepoints(&ours),
        codepoints(&reference),
    );
}

fn assert_nfkc_matches_icu(label: &str, input: &str) {
    let ours = our_nfkc(input);
    let reference = icu_nfkc(input);
    assert_eq!(
        ours,
        reference,
        "NFKC mismatch [{label}]\n  input: {}\n  ours:  {}\n  icu:   {}",
        codepoints(input),
        codepoints(&ours),
        codepoints(&reference),
    );
}

/// Assert all four normalization forms match ICU4X.
fn assert_all_forms_match_icu(label: &str, input: &str) {
    assert_nfd_matches_icu(label, input);
    assert_nfc_matches_icu(label, input);
    assert_nfkd_matches_icu(label, input);
    assert_nfkc_matches_icu(label, input);
}

// ===========================================================================
// 1. Stable sort verification: multiple marks with the same CCC value
// ===========================================================================

#[test]
fn stable_sort_same_ccc_above_marks() {
    // Multiple CCC 230 (above) marks on one base character.
    // The canonical ordering sort must be stable: marks with the same CCC
    // value must preserve their original relative order.
    //
    // U+0300 COMBINING GRAVE ACCENT (CCC 230)
    // U+0301 COMBINING ACUTE ACCENT (CCC 230)
    // U+0302 COMBINING CIRCUMFLEX ACCENT (CCC 230)
    // U+0303 COMBINING TILDE (CCC 230)
    // U+0304 COMBINING MACRON (CCC 230)
    // U+0308 COMBINING DIAERESIS (CCC 230)
    let input = "a\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0308}";
    assert_all_forms_match_icu("stable-sort-6-above-marks", input);

    // Reverse order of the same marks (still all CCC 230, so stable sort
    // must keep them in this reversed order after sorting).
    let input_rev = "a\u{0308}\u{0304}\u{0303}\u{0302}\u{0301}\u{0300}";
    assert_all_forms_match_icu("stable-sort-6-above-marks-reversed", input_rev);
}

#[test]
fn stable_sort_same_ccc_below_marks() {
    // Multiple CCC 220 (below) marks on one base.
    // U+0316 COMBINING GRAVE ACCENT BELOW (CCC 220)
    // U+0317 COMBINING ACUTE ACCENT BELOW (CCC 220)
    // U+0323 COMBINING DOT BELOW (CCC 220)
    // U+0324 COMBINING DIAERESIS BELOW (CCC 220)
    // U+0325 COMBINING RING BELOW (CCC 220)
    let input = "a\u{0316}\u{0317}\u{0323}\u{0324}\u{0325}";
    assert_all_forms_match_icu("stable-sort-5-below-marks", input);
}

#[test]
fn stable_sort_mixed_same_ccc_groups() {
    // Two groups of same-CCC marks interleaved with a different CCC.
    // CCC 230: U+0300, U+0301
    // CCC 220: U+0323, U+0325
    // CCC 230: U+0302, U+0303
    //
    // After sorting by CCC, all CCC 220 marks should come first, then all
    // CCC 230 marks. Within each group, the original relative order must
    // be preserved (stable sort).
    let input = "e\u{0300}\u{0301}\u{0323}\u{0325}\u{0302}\u{0303}";
    assert_all_forms_match_icu("stable-sort-mixed-groups", input);
}

#[test]
fn stable_sort_three_marks_same_ccc_with_composition() {
    // NFC composition with stable ordering.
    // e + U+0323 (CCC 220, dot below) + U+0302 (CCC 230, circumflex) + U+0301 (CCC 230, acute)
    // CCC sort: 220 stays first, then 230s in original order (circumflex then acute).
    // NFC should compose e + U+0323 + U+0302 into U+1EC7, with U+0301 trailing.
    let input = "e\u{0323}\u{0302}\u{0301}";
    assert_all_forms_match_icu("stable-sort-compose-with-trailing", input);
}

// ===========================================================================
// 2. Hebrew combining marks (CCC 10-26 range)
// ===========================================================================

#[test]
fn hebrew_vowel_marks_single_base() {
    // Hebrew letter Bet (U+05D1) with multiple Hebrew vowel points stacked.
    // U+05B0 SHEVA (CCC 10)
    // U+05B1 HATAF SEGOL (CCC 11)
    // U+05B4 HIRIQ (CCC 14)
    // U+05B7 PATAH (CCC 17)
    // U+05B8 QAMATS (CCC 18)
    // U+05BB QUBUTS (CCC 20)
    // U+05BC DAGESH (CCC 21)
    // U+05BD METEG (CCC 22)
    // U+05BF RAFE (CCC 23)
    // U+05C1 SHIN DOT (CCC 24)
    // U+05C2 SIN DOT (CCC 25)

    // Forward order (already sorted by CCC)
    let input_sorted = "\u{05D1}\u{05B0}\u{05B1}\u{05B4}\u{05B7}\u{05B8}\u{05BB}\u{05BC}\u{05BD}\u{05BF}\u{05C1}\u{05C2}";
    assert_all_forms_match_icu("hebrew-all-vowels-sorted", input_sorted);

    // Reverse order (worst-case for sorting)
    let input_reversed = "\u{05D1}\u{05C2}\u{05C1}\u{05BF}\u{05BD}\u{05BC}\u{05BB}\u{05B8}\u{05B7}\u{05B4}\u{05B1}\u{05B0}";
    assert_all_forms_match_icu("hebrew-all-vowels-reversed", input_reversed);
}

#[test]
fn hebrew_dagesh_and_vowel_ordering() {
    // Common Hebrew pattern: letter + dagesh (CCC 21) + vowel (various CCC).
    // The dagesh should sort after vowels with lower CCC.
    //
    // Bet + Dagesh (CCC 21) + Sheva (CCC 10)
    // After canonical ordering: Sheva (10) before Dagesh (21).
    let input = "\u{05D1}\u{05BC}\u{05B0}";
    assert_all_forms_match_icu("hebrew-dagesh-before-sheva", input);

    // Bet + Hiriq (CCC 14) + Dagesh (CCC 21) + Meteg (CCC 22)
    // Already in order; should be stable.
    let input2 = "\u{05D1}\u{05B4}\u{05BC}\u{05BD}";
    assert_all_forms_match_icu("hebrew-hiriq-dagesh-meteg", input2);
}

#[test]
fn hebrew_shin_sin_dot_distinction() {
    // Shin (U+05E9) + Shin Dot (U+05C1, CCC 24) vs Sin Dot (U+05C2, CCC 25)
    // These differ in CCC and should maintain their ordering.
    let shin_with_shin_dot = "\u{05E9}\u{05C1}";
    let shin_with_sin_dot = "\u{05E9}\u{05C2}";

    assert_all_forms_match_icu("hebrew-shin-dot", shin_with_shin_dot);
    assert_all_forms_match_icu("hebrew-sin-dot", shin_with_sin_dot);

    // Both dots plus a vowel: Shin + Sin Dot (CCC 25) + Shin Dot (CCC 24) + Hiriq (CCC 14)
    // After sorting: Hiriq (14) then Shin Dot (24) then Sin Dot (25)
    let input = "\u{05E9}\u{05C2}\u{05C1}\u{05B4}";
    assert_all_forms_match_icu("hebrew-two-dots-plus-vowel", input);
}

#[test]
fn hebrew_realistic_word() {
    // A realistic Hebrew word with multiple marks per letter.
    // Bet + Dagesh + Sheva + Resh + Alef + Shin + Shin Dot + Hiriq + Tav
    let word = "\u{05D1}\u{05BC}\u{05B0}\u{05E8}\u{05D0}\u{05E9}\u{05C1}\u{05B4}\u{05EA}";
    assert_all_forms_match_icu("hebrew-realistic-word", word);
}

// ===========================================================================
// 3. Arabic combining marks (CCC 27-35 range)
// ===========================================================================

#[test]
fn arabic_vowel_marks_single_base() {
    // Arabic letter Ba (U+0628) with various Arabic combining marks.
    // U+064B FATHATAN (CCC 27)
    // U+064C DAMMATAN (CCC 28)
    // U+064D KASRATAN (CCC 29)
    // U+064E FATHA (CCC 30)
    // U+064F DAMMA (CCC 31)
    // U+0650 KASRA (CCC 32)
    // U+0651 SHADDA (CCC 33)
    // U+0652 SUKUN (CCC 34)
    // U+0670 SUPERSCRIPT ALEF (CCC 35)

    // Forward order
    let input_sorted =
        "\u{0628}\u{064B}\u{064C}\u{064D}\u{064E}\u{064F}\u{0650}\u{0651}\u{0652}\u{0670}";
    assert_all_forms_match_icu("arabic-all-marks-sorted", input_sorted);

    // Reverse order
    let input_reversed =
        "\u{0628}\u{0670}\u{0652}\u{0651}\u{0650}\u{064F}\u{064E}\u{064D}\u{064C}\u{064B}";
    assert_all_forms_match_icu("arabic-all-marks-reversed", input_reversed);
}

#[test]
fn arabic_shadda_with_vowel() {
    // Very common Arabic pattern: letter + Shadda (CCC 33) + Fatha (CCC 30)
    // After canonical ordering: Fatha (30) before Shadda (33).
    let input = "\u{0628}\u{0651}\u{064E}";
    assert_all_forms_match_icu("arabic-shadda-fatha", input);

    // Letter + Kasra (CCC 32) + Shadda (CCC 33) -- already sorted
    let input2 = "\u{0628}\u{0650}\u{0651}";
    assert_all_forms_match_icu("arabic-kasra-shadda", input2);

    // Letter + Shadda (33) + Kasra (32) -- needs reorder
    let input3 = "\u{0628}\u{0651}\u{0650}";
    assert_all_forms_match_icu("arabic-shadda-kasra-reorder", input3);
}

#[test]
fn arabic_small_marks() {
    // Arabic small marks with lower CCC values.
    // U+0618 SMALL FATHA (CCC 30)
    // U+0619 SMALL DAMMA (CCC 31)
    // U+061A SMALL KASRA (CCC 32)
    let input = "\u{0628}\u{061A}\u{0619}\u{0618}";
    assert_all_forms_match_icu("arabic-small-marks-reversed", input);
}

#[test]
fn arabic_realistic_word() {
    // Bismillah: Ba + Kasra + Sin + Mim + Alef + Lam + Lam + Ha + ...
    // A realistic Arabic text fragment with diacritics.
    let word = "\u{0628}\u{0650}\u{0633}\u{0652}\u{0645}\u{0650}\u{0627}\u{0644}\u{0644}\u{0651}\u{064E}\u{0647}\u{0650}";
    assert_all_forms_match_icu("arabic-bismillah-fragment", word);
}

// ===========================================================================
// 4. Tibetan combining marks (unusual CCC values)
// ===========================================================================

#[test]
fn tibetan_vowel_signs() {
    // Tibetan base consonant Ka (U+0F40) with various Tibetan combining marks.
    // U+0F71 TIBETAN VOWEL SIGN AA (CCC 129)
    // U+0F72 TIBETAN VOWEL SIGN I (CCC 130)
    // U+0F74 TIBETAN VOWEL SIGN U (CCC 132)
    // U+0F7A TIBETAN VOWEL SIGN E (CCC 130)
    // U+0F7C TIBETAN VOWEL SIGN O (CCC 130)
    // U+0F80 TIBETAN VOWEL SIGN REVERSED I (CCC 130)

    // Mix of CCC 129, 130, 132 marks
    let input = "\u{0F40}\u{0F74}\u{0F72}\u{0F71}";
    assert_all_forms_match_icu("tibetan-vowels-unsorted", input);

    // CCC 130 marks only (stability test)
    let input_same_ccc = "\u{0F40}\u{0F72}\u{0F7A}\u{0F7C}\u{0F80}";
    assert_all_forms_match_icu("tibetan-same-ccc-130", input_same_ccc);
}

#[test]
fn tibetan_subjoined_consonants() {
    // Tibetan subjoined consonants have CCC 0 (they are starters), but
    // Tibetan vowel signs around them stress the sorting.
    // Ka (U+0F40) + subjoined Sa (U+0FB6, CCC 0) + vowel sign I (U+0F72, CCC 130)
    let input = "\u{0F40}\u{0FB6}\u{0F72}";
    assert_all_forms_match_icu("tibetan-subjoined-with-vowel", input);
}

#[test]
fn tibetan_marks_with_latin_marks() {
    // Mix Tibetan CCC values (129, 130, 132) with Latin CCC values (220, 230)
    // on a Latin base to exercise unusual CCC value combinations.
    // 'a' + U+0F39 (CCC 216) + U+0302 (CCC 230) + U+0323 (CCC 220) + U+0F71 (CCC 129)
    // After sorting: CCC 129, 216, 220, 230
    let input = "a\u{0F39}\u{0302}\u{0323}\u{0F71}";
    assert_all_forms_match_icu("tibetan-latin-mixed-ccc", input);
}

// ===========================================================================
// 5. Vietnamese tone marks in combination with other marks
// ===========================================================================

#[test]
fn vietnamese_tone_marks_basic() {
    // Vietnamese uses Latin base + horn/breve + tone mark.
    // U+031B COMBINING HORN (CCC 216)
    // U+0300 COMBINING GRAVE (CCC 230)
    // U+0301 COMBINING ACUTE (CCC 230)
    // U+0303 COMBINING TILDE (CCC 230)
    // U+0309 COMBINING HOOK ABOVE (CCC 230)
    // U+0323 COMBINING DOT BELOW (CCC 220)
    //
    // Common patterns: o + horn + tone mark

    // o + horn (CCC 216) + acute (CCC 230)
    let input1 = "o\u{031B}\u{0301}";
    assert_all_forms_match_icu("vietnamese-o-horn-acute", input1);

    // o + horn (CCC 216) + dot below (CCC 220) + grave (CCC 230)
    let input2 = "o\u{031B}\u{0323}\u{0300}";
    assert_all_forms_match_icu("vietnamese-o-horn-dot-grave", input2);

    // Reversed: o + grave (CCC 230) + dot below (CCC 220) + horn (CCC 216)
    // After sort: horn (216), dot below (220), grave (230)
    let input3 = "o\u{0300}\u{0323}\u{031B}";
    assert_all_forms_match_icu("vietnamese-reversed-marks", input3);
}

#[test]
fn vietnamese_precomposed_base_with_extra_marks() {
    // U+01A1 (o with horn, precomposed) = o + U+031B in NFD.
    // Adding more marks tests composition after decomposition.
    // NFC of (o-horn + acute) should produce U+1EDB.
    let input1 = "\u{01A1}\u{0301}";
    assert_all_forms_match_icu("vietnamese-precomposed-o-horn-acute", input1);

    // U+01A1 + dot below + hook above
    let input2 = "\u{01A1}\u{0323}\u{0309}";
    assert_all_forms_match_icu("vietnamese-precomposed-o-horn-dot-hook", input2);
}

#[test]
fn vietnamese_realistic_word() {
    // "Viet Nam" with full diacritics:
    // V + i + e + circumflex (U+0302) + dot below (U+0323) + t + N + a + breve (U+0306) + m
    // This decomposes/recomposes differently in NFC vs NFD.
    let word = "Vi\u{1EC7}t Nam";
    assert_all_forms_match_icu("vietnamese-viet-nam-precomposed", word);

    // Same word, fully decomposed input
    let word_decomposed = "Vie\u{0323}\u{0302}t Nam";
    assert_all_forms_match_icu("vietnamese-viet-nam-decomposed", word_decomposed);
}

// ===========================================================================
// 6. Very long sequences (50+ marks) to stress CccBuffer overflow to Vec
// ===========================================================================

#[test]
fn long_sequence_50_above_marks() {
    // 50 CCC 230 marks (all COMBINING GRAVE through COMBINING DOUBLE ACUTE)
    // on one base. This exceeds INLINE_CAP=18 and forces Vec overflow.
    let above_marks: &[char] = &[
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{030B}',
    ];
    let mut input = String::from("a");
    for i in 0..50 {
        input.push(above_marks[i % above_marks.len()]);
    }
    assert_all_forms_match_icu("long-50-above-marks", &input);
}

#[test]
fn long_sequence_60_mixed_ccc() {
    // 60 marks with varying CCC values, in reverse order.
    // This tests both the overflow path and correct sorting of many elements.
    let marks_and_ccc: &[(char, u8)] = &[
        ('\u{0327}', 202), // cedilla
        ('\u{0328}', 202), // ogonek
        ('\u{0316}', 220), // grave below
        ('\u{0323}', 220), // dot below
        ('\u{0317}', 220), // acute below
        ('\u{0300}', 230), // grave
        ('\u{0301}', 230), // acute
        ('\u{0302}', 230), // circumflex
        ('\u{0308}', 230), // diaeresis
        ('\u{0303}', 230), // tilde
    ];

    // Build in reverse CCC order (worst case for sorting)
    let mut input = String::from("e");
    for i in (0..60).rev() {
        let (ch, _) = marks_and_ccc[i % marks_and_ccc.len()];
        input.push(ch);
    }
    assert_all_forms_match_icu("long-60-mixed-reverse-ccc", &input);
}

#[test]
fn long_sequence_100_marks_all_same_ccc() {
    // 100 marks all with CCC 230 to test stable sort on the Vec overflow path.
    // We cycle through distinct characters to verify order is preserved.
    let marks: &[char] = &[
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{030B}',
    ];
    let mut input = String::from("x");
    for i in 0..100 {
        input.push(marks[i % marks.len()]);
    }
    assert_all_forms_match_icu("long-100-same-ccc-230", &input);
}

#[test]
fn long_sequence_boundary_at_inline_cap() {
    // Exactly 18 marks (INLINE_CAP boundary) -- should stay inline.
    let mut input_18 = String::from("a");
    for i in 0..18 {
        // Alternate between CCC 220 and CCC 230
        if i % 2 == 0 {
            input_18.push('\u{0323}'); // dot below (220)
        } else {
            input_18.push('\u{0301}'); // acute (230)
        }
    }
    assert_all_forms_match_icu("long-exactly-18-marks", &input_18);

    // 19 marks (just past INLINE_CAP) -- triggers overflow
    let mut input_19 = input_18.clone();
    input_19.push('\u{0300}'); // grave (230)
    assert_all_forms_match_icu("long-exactly-19-marks-overflow", &input_19);
}

#[test]
fn long_sequence_multiple_ccc_bands() {
    // 54 marks spanning many different CCC values, 3 of each.
    // This tests sorting correctness across a wide CCC range.
    let marks_by_ccc: &[(char, u8)] = &[
        ('\u{0334}', 1),   // tilde overlay (CCC 1)
        ('\u{093C}', 7),   // Devanagari nukta (CCC 7)
        ('\u{094D}', 9),   // Devanagari virama (CCC 9)
        ('\u{05B0}', 10),  // Hebrew sheva (CCC 10)
        ('\u{05B4}', 14),  // Hebrew hiriq (CCC 14)
        ('\u{05BC}', 21),  // Hebrew dagesh (CCC 21)
        ('\u{064B}', 27),  // Arabic fathatan (CCC 27)
        ('\u{064E}', 30),  // Arabic fatha (CCC 30)
        ('\u{0651}', 33),  // Arabic shadda (CCC 33)
        ('\u{0670}', 35),  // Arabic superscript alef (CCC 35)
        ('\u{0327}', 202), // cedilla (CCC 202)
        ('\u{031B}', 216), // horn (CCC 216)
        ('\u{0323}', 220), // dot below (CCC 220)
        ('\u{0300}', 230), // grave (CCC 230)
        ('\u{0301}', 230), // acute (CCC 230)
        ('\u{0302}', 230), // circumflex (CCC 230)
        ('\u{0345}', 240), // iota subscript (CCC 240)
        ('\u{0303}', 230), // tilde (CCC 230)
    ];

    // Build input in reverse order (worst case)
    let mut input = String::from("a");
    for i in 0..54 {
        let (ch, _) = marks_by_ccc[(54 - 1 - i) % marks_by_ccc.len()];
        input.push(ch);
    }
    assert_all_forms_match_icu("long-54-multi-ccc-bands", &input);
}

// ===========================================================================
// 7. Worst-case CCC ordering (marks in reverse CCC order)
// ===========================================================================

#[test]
fn worst_case_reverse_ccc_order() {
    // Marks in strictly decreasing CCC order: 240, 230, 220, 202, 216, 1
    // After canonical ordering sort: 1, 202, 216, 220, 230, 240
    let input = "a\u{0345}\u{0300}\u{0323}\u{0327}\u{031B}\u{0334}";
    assert_all_forms_match_icu("worst-case-reverse-6-marks", input);
}

#[test]
fn worst_case_interleaved_ascending_descending() {
    // Interleaved pattern: high, low, high, low, ...
    // CCC: 230, 1, 220, 7, 202, 9, 240
    let input = "a\u{0300}\u{0334}\u{0323}\u{093C}\u{0327}\u{094D}\u{0345}";
    assert_all_forms_match_icu("worst-case-interleaved", input);
}

#[test]
fn worst_case_all_distinct_ccc_values() {
    // One mark from each major CCC group, in reverse order.
    // CCC 240: U+0345 iota subscript
    // CCC 234: U+035D double breve
    // CCC 233: U+0362 double rightwards arrow below
    // CCC 232: U+0350 right arrowhead above
    // CCC 230: U+0300 grave
    // CCC 226: U+1D165 (CCC 216 actually -- let's use real chars)
    // CCC 222: U+0339 right half ring below
    // CCC 220: U+0323 dot below
    // CCC 218: U+031C left half ring below (CCC 220 actually)
    // CCC 202: U+0327 cedilla
    // CCC 9:   U+094D virama
    // CCC 1:   U+0334 tilde overlay
    //
    // Put in reverse CCC order for maximum reordering.
    let input = "a\u{0345}\u{035D}\u{0362}\u{0350}\u{0300}\u{0323}\u{0327}\u{094D}\u{0334}";
    assert_all_forms_match_icu("worst-case-many-distinct-ccc", input);
}

#[test]
fn worst_case_duplicate_ccc_values_reversed() {
    // Same CCC value appears multiple times, in various positions among
    // different CCC values. Tests that stable sort keeps same-CCC marks
    // in original relative order even when heavily interleaved.
    //
    // Sequence: acute(230) dot-below(220) grave(230) cedilla(202) circumflex(230) ogonek(202)
    // After stable sort by CCC: cedilla(202) ogonek(202) dot-below(220) acute(230) grave(230) circumflex(230)
    let input = "e\u{0301}\u{0323}\u{0300}\u{0327}\u{0302}\u{0328}";
    assert_all_forms_match_icu("worst-case-duplicates-interleaved", input);
}

// ===========================================================================
// 8. Cross-form validation: NFD (sort only) vs NFC (sort + compose)
// ===========================================================================

#[test]
fn cross_form_nfd_vs_nfc_cedilla_acute() {
    // e + cedilla (CCC 202) + acute (CCC 230)
    // NFD: e + cedilla + acute (already sorted)
    // NFC: should compose e + cedilla into U+0229, then add acute
    let input = "e\u{0327}\u{0301}";
    assert_nfd_matches_icu("cross-form-nfd-cedilla-acute", input);
    assert_nfc_matches_icu("cross-form-nfc-cedilla-acute", input);

    // Reversed: e + acute (CCC 230) + cedilla (CCC 202)
    // After canonical reordering: e + cedilla + acute (same as above)
    let input_rev = "e\u{0301}\u{0327}";
    assert_nfd_matches_icu("cross-form-nfd-acute-cedilla", input_rev);
    assert_nfc_matches_icu("cross-form-nfc-acute-cedilla", input_rev);
}

#[test]
fn cross_form_blocking_by_intervening_mark() {
    // CCC blocking: e + cedilla (CCC 202) + dot below (CCC 220) + acute (CCC 230)
    // The cedilla (202) comes first. In NFC, e + cedilla composes to U+0229.
    // Then dot below (220) + acute (230) remain as combining marks.
    let input = "e\u{0327}\u{0323}\u{0301}";
    assert_all_forms_match_icu("cross-form-blocking-cedilla-dot-acute", input);

    // Now: e + dot below (220) + cedilla (202) + acute (230)
    // After sort: cedilla (202) + dot below (220) + acute (230)
    // Same result as above.
    let input2 = "e\u{0323}\u{0327}\u{0301}";
    assert_all_forms_match_icu("cross-form-blocking-reordered", input2);
}

// ===========================================================================
// 9. Additional edge cases: overlay marks (CCC 1), nukta (CCC 7), etc.
// ===========================================================================

#[test]
fn overlay_marks_ccc_1() {
    // CCC 1 marks should sort before all other non-zero CCC marks.
    // U+0334 COMBINING TILDE OVERLAY (CCC 1)
    // U+0335 COMBINING SHORT STROKE OVERLAY (CCC 1)
    // U+0338 COMBINING LONG SOLIDUS OVERLAY (CCC 1)
    //
    // Put a CCC 230 mark first, then CCC 1 marks.
    let input = "a\u{0301}\u{0334}\u{0335}\u{0338}";
    assert_all_forms_match_icu("overlay-ccc-1-after-230", input);
}

#[test]
fn nukta_ccc_7() {
    // Devanagari Ka (U+0915) + Nukta (U+093C, CCC 7) + Virama (U+094D, CCC 9)
    // Already in order, should be stable.
    let input = "\u{0915}\u{093C}\u{094D}";
    assert_all_forms_match_icu("nukta-virama-sorted", input);

    // Reversed: Ka + Virama (CCC 9) + Nukta (CCC 7) -> should reorder to Nukta then Virama
    let input_rev = "\u{0915}\u{094D}\u{093C}";
    assert_all_forms_match_icu("nukta-virama-reversed", input_rev);
}

#[test]
fn kana_voicing_ccc_8() {
    // Hiragana Ka (U+304B) + Combining Dakuten (U+3099, CCC 8)
    // This should compose in NFC to U+304C (Ga).
    let input = "\u{304B}\u{3099}";
    assert_all_forms_match_icu("kana-voicing-dakuten", input);

    // Hiragana Ha (U+306F) + Combining Handakuten (U+309A, CCC 8)
    // Should compose in NFC to U+3071 (Pa).
    let input2 = "\u{306F}\u{309A}";
    assert_all_forms_match_icu("kana-voicing-handakuten", input2);
}

#[test]
fn iota_subscript_ccc_240() {
    // Iota subscript (U+0345, CCC 240) is the highest CCC value.
    // It should always sort last among combining marks.
    //
    // Alpha (U+0391) + iota subscript (240) + grave (230) + smooth breathing (230 for U+0313)
    let input = "\u{0391}\u{0345}\u{0300}\u{0313}";
    assert_all_forms_match_icu("iota-subscript-sorts-last", input);

    // Greek lowercase alpha with marks in sorted CCC order
    // smooth breathing (U+0313, CCC 230) + grave (U+0300, CCC 230) + iota subscript (U+0345, CCC 240)
    let input2 = "\u{03B1}\u{0313}\u{0300}\u{0345}";
    assert_all_forms_match_icu("greek-alpha-breathing-grave-iota", input2);
}

// ===========================================================================
// 10. Hebrew and Arabic mixed together
// ===========================================================================

#[test]
fn hebrew_arabic_marks_on_same_base() {
    // Stress test: put Hebrew and Arabic combining marks on a single base.
    // This is linguistically unusual but must sort correctly.
    // Hebrew Sheva (CCC 10) + Arabic Fatha (CCC 30) + Arabic Shadda (CCC 33) + above grave (CCC 230)
    let input = "a\u{0300}\u{0651}\u{064E}\u{05B0}";
    assert_all_forms_match_icu("hebrew-arabic-mixed-on-latin", input);
}

// ===========================================================================
// 11. Multiple starters interspersed with combining sequences
// ===========================================================================

#[test]
fn multiple_clusters_with_marks() {
    // Several base characters each followed by combining marks in various orders.
    // Each cluster's marks should be sorted independently.
    let input = "a\u{0302}\u{0327}b\u{0323}\u{0301}c\u{0345}\u{0300}\u{0334}";
    assert_all_forms_match_icu("multi-cluster-with-marks", input);
}

#[test]
fn alternating_starters_and_marks() {
    // Starter, mark, starter, mark, ... pattern
    // Each mark sequence is length 1, so no sorting needed, but composition may apply.
    let input = "a\u{0301}b\u{0302}c\u{0303}d\u{0327}e\u{0323}";
    assert_all_forms_match_icu("alternating-starter-mark", input);
}

// ===========================================================================
// 12. Edge: empty and single-character inputs
// ===========================================================================

#[test]
fn empty_input() {
    assert_all_forms_match_icu("empty", "");
}

#[test]
fn single_combining_mark_no_base() {
    // A combining mark without a base character.
    let input = "\u{0301}";
    assert_all_forms_match_icu("lone-acute", input);

    // Multiple combining marks without a base.
    let input2 = "\u{0323}\u{0301}\u{0327}";
    assert_all_forms_match_icu("multiple-marks-no-base", input2);
}

// ===========================================================================
// 13. Stress: repeated base+marks pattern at scale
// ===========================================================================

#[test]
fn repeated_cluster_stress() {
    // 100 repetitions of a base + 3 marks in reverse CCC order.
    // This tests sustained correctness over a long string.
    let cluster = "e\u{0301}\u{0323}\u{0327}"; // acute(230) dot-below(220) cedilla(202)
    let input: String = cluster.repeat(100);
    assert_all_forms_match_icu("repeated-cluster-100x", &input);
}

// ===========================================================================
// 14. Combining marks from supplementary planes
// ===========================================================================

#[test]
fn supplementary_plane_combining_marks() {
    // Musical combining marks from the SMP (Supplementary Multilingual Plane).
    // U+1D165 MUSICAL SYMBOL COMBINING STEM (CCC 216)
    // U+1D166 MUSICAL SYMBOL COMBINING SPRECHGESANG STEM (CCC 216)
    // U+1D167 MUSICAL SYMBOL COMBINING TREMOLO-1 (CCC 1)
    // U+1D16D MUSICAL SYMBOL COMBINING AUGMENTATION DOT (CCC 226)

    // Musical void notehead + combining marks in reverse CCC order
    let input = "\u{1D157}\u{1D16D}\u{1D165}\u{1D167}";
    assert_all_forms_match_icu("musical-supplementary-marks", input);
}

// ===========================================================================
// 15. CCC 200 and 202 (attached below left / attached below)
// ===========================================================================

#[test]
fn attached_below_marks() {
    // U+0327 COMBINING CEDILLA (CCC 202)
    // U+0328 COMBINING OGONEK (CCC 202)
    // Both CCC 202: test stability.
    let input = "a\u{0327}\u{0328}";
    assert_all_forms_match_icu("attached-below-cedilla-ogonek", input);

    // Reversed
    let input_rev = "a\u{0328}\u{0327}";
    assert_all_forms_match_icu("attached-below-ogonek-cedilla", input_rev);
}

// ===========================================================================
// 16. Hebrew cantillation marks (CCC 220-230 range accents)
// ===========================================================================

#[test]
fn hebrew_cantillation_marks() {
    // Hebrew has a rich set of cantillation marks (te'amim) with various CCC values.
    // U+0591 ETNAHTA (CCC 220)
    // U+0592 SEGOL (CCC 230)
    // U+0593 SHALSHELET (CCC 230)
    // U+0594 ZAQEF QATAN (CCC 230)
    // U+0596 TIPEHA (CCC 220)
    // U+059A YETIV (CCC 222)
    //
    // Mix of CCC 220, 222, 230 on a Hebrew letter.
    let input = "\u{05D0}\u{0594}\u{0593}\u{0592}\u{059A}\u{0596}\u{0591}";
    assert_all_forms_match_icu("hebrew-cantillation-mixed", input);
}

// ===========================================================================
// 17. Regression guard: exactly INLINE_CAP marks with same CCC
// ===========================================================================

#[test]
fn exactly_inline_cap_same_ccc() {
    // Exactly 18 marks (INLINE_CAP) all with CCC 230.
    // This is the boundary case where inline storage is full but overflow
    // is NOT triggered.
    let marks: &[char] = &[
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{030B}', '\u{030C}', '\u{030D}', '\u{030E}', '\u{030F}',
        '\u{0310}', '\u{0311}', '\u{0312}', '\u{0313}',
    ];
    assert_eq!(marks.len(), 18);

    let mut input = String::from("a");
    for &m in marks {
        input.push(m);
    }
    assert_all_forms_match_icu("exactly-inline-cap-18-same-ccc", &input);
}