simd-normalizer 0.1.1

SIMD-accelerated Unicode normalization (NFC, NFD, NFKC, NFKD)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
//! Table access API -- trie lookup wrappers for decomposition, composition,
//! CCC, and quick-check data.

pub(crate) mod casefold;
pub(crate) mod ccc;
pub(crate) mod ccc_qc;
pub(crate) mod composition;
pub(crate) mod confusable;
pub(crate) mod decomposition;
pub(crate) mod qc;
pub(crate) mod trie;

use trie::CodePointTrie;

// ---------------------------------------------------------------------------
// Bit-field constants for packed decomposition trie values
// ---------------------------------------------------------------------------

/// First character of decomposition combines backwards.
#[allow(dead_code)]
const BACKWARD_COMBINING: u32 = 1 << 31;
/// Decomposition doesn't round-trip via NFC.
#[allow(dead_code)]
const NON_ROUND_TRIP: u32 = 1 << 30;
/// Code point needs decomposition (not a self-mapping).
const HAS_DECOMPOSITION: u32 = 1 << 29;
/// If set, DECOMP_INFO is an offset into the expansion table (not a singleton).
const IS_EXPANSION: u32 = 1 << 24;
/// Shift to extract the CCC byte from a decomposition trie value.
const CCC_SHIFT: u32 = 16;
/// Mask to isolate the CCC byte after shifting.
const CCC_MASK: u32 = 0xFF << CCC_SHIFT;
/// Mask for the 16-bit decomposition info field.
const DECOMP_INFO_MASK: u32 = 0xFFFF;

/// Set iff the codepoint has CCC > 0 (i.e. is any combining mark). Used by the
/// compose-mode passthrough fast path to decide whether the final byte of a
/// passthrough run must be fed through NormState as a potential starter.
/// See `needs_starter_shadow` for why the narrower ASCII-composer rule is unsound.
pub(crate) const NEEDS_STARTER_SHADOW: u32 = 1 << 28;

// ---------------------------------------------------------------------------
// Expansion entry format: (ccc << 21) | code_point
// ---------------------------------------------------------------------------

/// Shift to extract CCC from a packed expansion entry.
pub(crate) const EXPANSION_CCC_SHIFT: u32 = 21;
/// Mask to extract code point from a packed expansion entry.
pub(crate) const EXPANSION_CP_MASK: u32 = 0x1FFFFF;

// ---------------------------------------------------------------------------
// DecompResult
// ---------------------------------------------------------------------------

/// Result of decoding a decomposition trie value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum DecompResult {
    /// No decomposition (character maps to itself).
    None,
    /// Singleton BMP decomposition.
    Singleton(char),
    /// Expansion: offset and length into the relevant expansion table.
    Expansion { offset: usize, length: usize },
}

// ---------------------------------------------------------------------------
// Static trie constructors
// ---------------------------------------------------------------------------

/// Build the canonical decomposition trie from generated data.
#[inline]
pub(crate) fn canonical_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: decomposition::CANONICAL_BMP_INDEX,
        data: decomposition::CANONICAL_TRIE_DATA,
        supp_index1: decomposition::CANONICAL_SUPP_INDEX1,
        supp_index2: decomposition::CANONICAL_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the compatibility decomposition trie from generated data.
#[inline]
pub(crate) fn compat_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: decomposition::COMPAT_BMP_INDEX,
        data: decomposition::COMPAT_TRIE_DATA,
        supp_index1: decomposition::COMPAT_SUPP_INDEX1,
        supp_index2: decomposition::COMPAT_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the CCC trie from generated data.
#[inline]
pub(crate) fn ccc_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: ccc::CCC_BMP_INDEX,
        data: ccc::CCC_TRIE_DATA,
        supp_index1: ccc::CCC_SUPP_INDEX1,
        supp_index2: ccc::CCC_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the NFC quick-check trie.
#[allow(dead_code)]
#[inline]
pub(crate) fn nfc_qc_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: qc::NFC_QC_BMP_INDEX,
        data: qc::NFC_QC_TRIE_DATA,
        supp_index1: qc::NFC_QC_SUPP_INDEX1,
        supp_index2: qc::NFC_QC_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the NFD quick-check trie.
#[allow(dead_code)]
#[inline]
pub(crate) fn nfd_qc_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: qc::NFD_QC_BMP_INDEX,
        data: qc::NFD_QC_TRIE_DATA,
        supp_index1: qc::NFD_QC_SUPP_INDEX1,
        supp_index2: qc::NFD_QC_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the NFKC quick-check trie.
#[allow(dead_code)]
#[inline]
pub(crate) fn nfkc_qc_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: qc::NFKC_QC_BMP_INDEX,
        data: qc::NFKC_QC_TRIE_DATA,
        supp_index1: qc::NFKC_QC_SUPP_INDEX1,
        supp_index2: qc::NFKC_QC_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the NFKD quick-check trie.
#[allow(dead_code)]
#[inline]
pub(crate) fn nfkd_qc_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: qc::NFKD_QC_BMP_INDEX,
        data: qc::NFKD_QC_TRIE_DATA,
        supp_index1: qc::NFKD_QC_SUPP_INDEX1,
        supp_index2: qc::NFKD_QC_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Build the fused CCC+QC trie.
///
/// Packed value: `(ccc << 8) | (nfkd_qc << 6) | (nfkc_qc << 4) | (nfd_qc << 2) | nfc_qc`
#[inline]
pub(crate) fn ccc_qc_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: ccc_qc::CCC_QC_BMP_INDEX,
        data: ccc_qc::CCC_QC_TRIE_DATA,
        supp_index1: ccc_qc::CCC_QC_SUPP_INDEX1,
        supp_index2: ccc_qc::CCC_QC_SUPP_INDEX2,
        default_value: 0,
    }
}

// ---------------------------------------------------------------------------
// Decoding helpers
// ---------------------------------------------------------------------------

/// Extract CCC from a decomposition trie value.
#[inline]
pub(crate) fn ccc_from_trie_value(v: u32) -> u8 {
    ((v & CCC_MASK) >> CCC_SHIFT) as u8
}

/// Check if a trie value indicates the character has a decomposition mapping.
#[inline(always)]
pub(crate) fn has_decomposition(trie_value: u32) -> bool {
    trie_value & HAS_DECOMPOSITION != 0
}

/// Check whether the passthrough fast path must feed this codepoint through
/// NormState as a potential starter. The bit is set iff CCC>0 — i.e. the
/// codepoint is any combining mark. This conservatively preserves the
/// preceding starter across any combining-mark run.
///
/// Note: the narrower rule "CCC>0 AND has an ASCII-starter composition
/// partner" is unsound under canonical reorder: a later combining mark that
/// *does* compose with the starter can be reordered ahead of a non-composing
/// mark in front of it, so we cannot skip feeding *any* CCC>0 codepoint into
/// NormState without losing the starter context.
#[inline(always)]
pub(crate) fn needs_starter_shadow(trie_value: u32) -> bool {
    trie_value & NEEDS_STARTER_SHADOW != 0
}

/// Look up the raw decomposition trie value for a character.
#[inline]
pub(crate) fn raw_decomp_trie_value(c: char, form: crate::decompose::DecompForm) -> u32 {
    match form {
        crate::decompose::DecompForm::Canonical => canonical_trie().get(c as u32),
        crate::decompose::DecompForm::Compatible => compat_trie().get(c as u32),
    }
}

/// Fast supplementary-plane decomposition trie lookup (no BMP check, no bounds checks).
///
/// # Safety
/// `cp` must be a supplementary code point (U+10000..U+10FFFF).
#[inline(always)]
pub(crate) unsafe fn raw_decomp_trie_value_supplementary(
    cp: u32,
    form: crate::decompose::DecompForm,
) -> u32 {
    match form {
        crate::decompose::DecompForm::Canonical => unsafe {
            canonical_trie().get_supplementary_unchecked(cp)
        },
        crate::decompose::DecompForm::Compatible => unsafe {
            compat_trie().get_supplementary_unchecked(cp)
        },
    }
}

/// Pipelined 2-codepoint decomposition trie lookup (BMP only).
///
/// Both code points must be BMP (< 0x10000). Supplementary codepoints must
/// go through `raw_decomp_trie_value_supplementary` individually. The result
/// is bit-identical to two serial `raw_decomp_trie_value` calls, but the two
/// dependent loads overlap so the out-of-order engine can hide an L2 miss.
#[allow(dead_code)]
// retained as a primitive for future pipelining work
#[inline(always)]
pub(crate) fn raw_decomp_trie_values_pipelined<const N: usize>(
    cps: &[u32; N],
    form: crate::decompose::DecompForm,
) -> [u32; N] {
    debug_assert!(N == 2, "only 2-way pipelining is implemented");
    debug_assert!(cps[0] < 0x10000 && cps[1] < 0x10000);
    let trie = match form {
        crate::decompose::DecompForm::Canonical => canonical_trie(),
        crate::decompose::DecompForm::Compatible => compat_trie(),
    };
    // SAFETY: debug_assert above guarantees both cps are BMP; generated tries
    // are well-formed so `get_two_bmp_pipelined_unchecked`'s safety holds.
    let pair = unsafe { trie.get_two_bmp_pipelined_unchecked(cps[0], cps[1]) };
    let mut out = [0u32; N];
    out[0] = pair[0];
    out[1] = pair[1];
    out
}

/// Decode a decomposition trie value into (DecompResult, CCC).
#[inline]
pub(crate) fn decode_trie_value(
    trie_value: u32,
    form: crate::decompose::DecompForm,
) -> (DecompResult, u8) {
    let ccc = ccc_from_trie_value(trie_value);
    let expansion_table = match form {
        crate::decompose::DecompForm::Canonical => decomposition::CANONICAL_EXPANSIONS,
        crate::decompose::DecompForm::Compatible => decomposition::COMPAT_EXPANSIONS,
    };
    let decomp = decode_decomp(trie_value, expansion_table);
    (decomp, ccc)
}

/// Decode decomposition from a trie value.
///
/// For expansions, reads length from `expansion_table[offset]` and data
/// follows at `offset + 1`.
#[inline]
pub(crate) fn decode_decomp(trie_value: u32, expansion_table: &[u32]) -> DecompResult {
    if trie_value & HAS_DECOMPOSITION == 0 {
        return DecompResult::None;
    }
    let info = trie_value & DECOMP_INFO_MASK;
    if trie_value & IS_EXPANSION != 0 {
        // Expansion: info is offset into length-prefixed expansion table.
        let offset = info as usize;
        let length = expansion_table[offset] as usize;
        DecompResult::Expansion {
            offset: offset + 1,
            length,
        }
    } else {
        // Singleton BMP decomposition.
        // SAFETY: The table generator guarantees info is a valid BMP code point.
        debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
        let ch = unsafe { char::from_u32_unchecked(info) };
        DecompResult::Singleton(ch)
    }
}

/// Look up canonical decomposition for a character.
///
/// Returns `(decomp_result, ccc)` -- both extracted from the same trie lookup.
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_canonical_decomp(c: char) -> (DecompResult, u8) {
    let trie = canonical_trie();
    let v = trie.get(c as u32);
    let ccc = ccc_from_trie_value(v);
    let decomp = decode_decomp(v, decomposition::CANONICAL_EXPANSIONS);
    (decomp, ccc)
}

/// Look up compatibility decomposition for a character.
///
/// Returns `(decomp_result, ccc)` -- both extracted from the same trie lookup.
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_compat_decomp(c: char) -> (DecompResult, u8) {
    let trie = compat_trie();
    let v = trie.get(c as u32);
    let ccc = ccc_from_trie_value(v);
    let decomp = decode_decomp(v, decomposition::COMPAT_EXPANSIONS);
    (decomp, ccc)
}

/// Look up the Canonical Combining Class from the dedicated CCC trie.
#[inline]
pub(crate) fn lookup_ccc(c: char) -> u8 {
    let trie = ccc_trie();
    trie.get(c as u32) as u8
}

/// Compose a `(starter, combining)` pair.
///
/// Returns `Some(composed)` if the pair is canonically composable.
#[inline]
pub(crate) fn compose_pair(a: char, b: char) -> Option<char> {
    let key = ((a as u64) << 21) | (b as u64);
    let pairs = composition::COMPOSITION_PAIRS;
    let mut len = pairs.len();
    let mut base = 0usize;

    while len > 1 {
        let half = len / 2;
        // Branchless: if pairs[base + half].0 <= key, advance base.
        // The compiler should emit cmov for this pattern.
        base += (pairs[base + half].0 <= key) as usize * half;
        len -= half;
    }

    if base < pairs.len() && pairs[base].0 == key {
        // SAFETY: composition table only contains valid Unicode scalar values
        debug_assert!(pairs[base].1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&(pairs[base].1)));
        Some(unsafe { char::from_u32_unchecked(pairs[base].1) })
    } else {
        None
    }
}

// ---------------------------------------------------------------------------
// Quick-check lookups (0=Yes, 1=Maybe, 2=No)
// ---------------------------------------------------------------------------

/// NFC quick-check: 0=Yes, 1=Maybe, 2=No.
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfc_qc(c: char) -> u8 {
    nfc_qc_trie().get(c as u32) as u8
}

/// NFD quick-check: 0=Yes, 1=Maybe, 2=No.
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfd_qc(c: char) -> u8 {
    nfd_qc_trie().get(c as u32) as u8
}

/// NFKC quick-check: 0=Yes, 1=Maybe, 2=No.
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfkc_qc(c: char) -> u8 {
    nfkc_qc_trie().get(c as u32) as u8
}

/// NFKD quick-check: 0=Yes, 1=Maybe, 2=No.
#[allow(dead_code)]
#[inline]
pub(crate) fn lookup_nfkd_qc(c: char) -> u8 {
    nfkd_qc_trie().get(c as u32) as u8
}

// ---------------------------------------------------------------------------
// Fused CCC+QC lookup (single trie lookup for both CCC and QC)
// ---------------------------------------------------------------------------

/// Bit shifts to extract each form's 2-bit QC value from the packed CCC+QC trie.
pub(crate) const CCC_QC_NFC_SHIFT: u32 = 0;
pub(crate) const CCC_QC_NFD_SHIFT: u32 = 2;
pub(crate) const CCC_QC_NFKC_SHIFT: u32 = 4;
pub(crate) const CCC_QC_NFKD_SHIFT: u32 = 6;
/// Shift to extract CCC from the packed CCC+QC trie value.
pub(crate) const CCC_QC_CCC_SHIFT: u32 = 8;

/// Look up fused CCC + QC value in a single trie lookup.
///
/// Returns `(ccc, qc)` where qc is 0=Yes, 1=Maybe, 2=No for the given form.
#[inline]
pub(crate) fn lookup_ccc_qc(c: char, qc_shift: u32) -> (u8, u8) {
    let packed = ccc_qc_trie().get(c as u32);
    let ccc = (packed >> CCC_QC_CCC_SHIFT) as u8;
    let qc = ((packed >> qc_shift) & 0x3) as u8;
    (ccc, qc)
}

// ---------------------------------------------------------------------------
// Expansion data accessors
// ---------------------------------------------------------------------------

/// Read expansion data from the canonical expansion table.
#[inline]
pub(crate) fn canonical_expansion_data(offset: usize, length: usize) -> &'static [u32] {
    &decomposition::CANONICAL_EXPANSIONS[offset..offset + length]
}

/// Read expansion data from the compatibility expansion table.
#[inline]
pub(crate) fn compat_expansion_data(offset: usize, length: usize) -> &'static [u32] {
    &decomposition::COMPAT_EXPANSIONS[offset..offset + length]
}

/// Extract expansion data directly from a decomposition trie value.
///
/// Returns `Some(data_slice)` if the trie value encodes an expansion,
/// `None` if it is a singleton or no-decomposition. This avoids constructing
/// a `DecompResult` enum in the hot path.
///
/// The caller must have already verified `has_decomposition(trie_value)`.
#[inline(always)]
pub(crate) fn expansion_data_from_trie_value(
    trie_value: u32,
    form: crate::decompose::DecompForm,
) -> Option<&'static [u32]> {
    if trie_value & IS_EXPANSION == 0 {
        return None;
    }
    let offset = (trie_value & DECOMP_INFO_MASK) as usize;
    let table = match form {
        crate::decompose::DecompForm::Canonical => decomposition::CANONICAL_EXPANSIONS,
        crate::decompose::DecompForm::Compatible => decomposition::COMPAT_EXPANSIONS,
    };
    let length = table[offset] as usize;
    Some(&table[offset + 1..offset + 1 + length])
}

// ---------------------------------------------------------------------------
// Case folding lookups
// ---------------------------------------------------------------------------

/// Build the case folding trie from generated data.
#[inline]
pub(crate) fn casefold_trie() -> CodePointTrie {
    CodePointTrie {
        bmp_index: casefold::CASEFOLD_BMP_INDEX,
        data: casefold::CASEFOLD_TRIE_DATA,
        supp_index1: casefold::CASEFOLD_SUPP_INDEX1,
        supp_index2: casefold::CASEFOLD_SUPP_INDEX2,
        default_value: 0,
    }
}

/// Look up simple case folding for a character.
///
/// Returns `Some(folded)` if the character has a case folding, `None` otherwise.
#[inline]
pub(crate) fn lookup_casefold(c: char) -> Option<char> {
    let trie = casefold_trie();
    let v = trie.get(c as u32);
    if v == 0 {
        None
    } else {
        // SAFETY: case folding trie contains only valid Unicode scalar values.
        debug_assert!(v <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&v));
        Some(unsafe { char::from_u32_unchecked(v) })
    }
}

/// Turkish case folding exception table.
#[inline]
pub(crate) fn turkish_casefold(c: char) -> Option<char> {
    let cp = c as u32;
    for &(src, tgt) in casefold::TURKISH_FOLDS {
        if src == cp {
            // SAFETY: table contains only valid Unicode scalar values.
            return Some(unsafe { char::from_u32_unchecked(tgt) });
        }
    }
    None
}

// ---------------------------------------------------------------------------
// Confusable mapping lookups
// ---------------------------------------------------------------------------

/// Flag bit indicating an expansion in confusable mapping values.
const CONFUSABLE_EXPANSION_FLAG: u32 = 0x80000000;

/// Result of looking up a confusable mapping.
pub(crate) enum ConfusableResult {
    /// No mapping (character is not confusable).
    None,
    /// Single-character mapping.
    Single(char),
    /// Multi-character mapping: offset and length into the expansion table.
    Expansion { offset: usize, length: usize },
}

/// Look up the confusable mapping for a character.
///
/// Uses branchless binary search over the sorted mapping table.
#[inline]
pub(crate) fn lookup_confusable(c: char) -> ConfusableResult {
    let cp = c as u32;
    let pairs = confusable::CONFUSABLE_MAPPINGS;
    let mut len = pairs.len();
    let mut base = 0usize;

    // Branchless binary search.
    while len > 1 {
        let half = len / 2;
        base += (pairs[base + half].0 <= cp) as usize * half;
        len -= half;
    }

    if base < pairs.len() && pairs[base].0 == cp {
        let value = pairs[base].1;
        if value & CONFUSABLE_EXPANSION_FLAG != 0 {
            let length = ((value >> 16) & 0xFF) as usize;
            let offset = (value & 0xFFFF) as usize;
            ConfusableResult::Expansion { offset, length }
        } else {
            // SAFETY: confusable table contains only valid Unicode scalar values.
            debug_assert!(value <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&value));
            ConfusableResult::Single(unsafe { char::from_u32_unchecked(value) })
        }
    } else {
        ConfusableResult::None
    }
}

/// Read expansion data from the confusable expansion table.
#[inline]
pub(crate) fn confusable_expansion_data(offset: usize, length: usize) -> &'static [u32] {
    &confusable::CONFUSABLE_EXPANSIONS[offset..offset + length]
}

// ---------------------------------------------------------------------------
// Confusable bloom filter (Component E)
// ---------------------------------------------------------------------------
//
// 256-byte bloom filter indexed by `(cp >> 8) ^ (cp & 0xFF)`, mapping to a
// single bit. False positives are acceptable; false negatives are not (a
// false negative would skip a required confusable mapping). The bloom is
// computed at compile time from the same `CONFUSABLE_MAPPINGS` list that
// `lookup_confusable` consults, so by construction every source codepoint
// hashes to a set bit.

/// Compute the bloom hash for a codepoint: byte index in `[0, 256)` and bit
/// position in `[0, 8)`.
#[inline(always)]
pub(crate) const fn confusable_bloom_hash(cp: u32) -> (usize, u8) {
    // Index: low 8 bits of `(cp >> 8) ^ (cp & 0xFF)`. The `& 0xFF` mask
    // keeps the index in `[0, 256)` for supplementary codepoints whose
    // `cp >> 8` can exceed 0xFF. The full hash mixes the high nibble of cp
    // into the bit position so distinct codepoints sharing the same byte
    // index typically pick different bits.
    let h = (((cp >> 8) ^ (cp & 0xFF)) & 0xFF) as usize;
    let bit = (((cp >> 16) ^ (cp >> 4)) & 0x07) as u8;
    (h, bit)
}

/// Build the 256-byte confusable bloom filter at compile time.
const fn build_confusable_bloom() -> [u8; 256] {
    let mut bloom = [0u8; 256];
    let mappings = confusable::CONFUSABLE_MAPPINGS;
    let mut i = 0;
    while i < mappings.len() {
        let cp = mappings[i].0;
        let (idx, bit) = confusable_bloom_hash(cp);
        bloom[idx] |= 1u8 << bit;
        i += 1;
    }
    bloom
}

/// 256-byte bloom filter for the confusable mapping table.
///
/// Constructed at compile time from `CONFUSABLE_MAPPINGS`. A clear bit at
/// position `confusable_bloom_hash(cp)` proves the codepoint has no
/// confusable mapping; a set bit means the codepoint *might* have one
/// (proceed to `lookup_confusable`).
pub(crate) const CONFUSABLE_BLOOM: [u8; 256] = build_confusable_bloom();

/// Probabilistic membership test: returns `false` only if the codepoint is
/// provably not in the confusable mapping table.
#[inline(always)]
pub(crate) fn confusable_bloom_might_contain(cp: u32) -> bool {
    let (idx, bit) = confusable_bloom_hash(cp);
    (CONFUSABLE_BLOOM[idx] >> bit) & 1 != 0
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Synthetic unit tests (hand-crafted trie values)
    // -----------------------------------------------------------------------

    #[test]
    fn test_ccc_extraction() {
        // CCC = 0
        assert_eq!(ccc_from_trie_value(0), 0);
        // CCC = 230 (0xE6) at bits 23..16
        let v = 0xE6 << 16;
        assert_eq!(ccc_from_trie_value(v), 230);
        // CCC = 1
        let v = 0x01 << 16;
        assert_eq!(ccc_from_trie_value(v), 1);
        // CCC = 254
        let v = 0xFE << 16;
        assert_eq!(ccc_from_trie_value(v), 254);
    }

    #[test]
    fn test_decode_decomp_none() {
        // HAS_DECOMPOSITION not set -> None
        let dummy: [u32; 0] = [];
        assert_eq!(decode_decomp(0, &dummy), DecompResult::None);
        // Even with other bits set, if HAS_DECOMPOSITION is clear, it's None.
        assert_eq!(
            decode_decomp(BACKWARD_COMBINING | NON_ROUND_TRIP | 0x00FF_FFFF, &dummy),
            DecompResult::None
        );
    }

    #[test]
    fn test_decode_decomp_singleton() {
        // HAS_DECOMPOSITION set, IS_EXPANSION not set -> Singleton
        let v = HAS_DECOMPOSITION | 0x0041; // 'A'
        let dummy: [u32; 0] = [];
        assert_eq!(decode_decomp(v, &dummy), DecompResult::Singleton('A'));
    }

    #[test]
    fn test_decode_decomp_expansion() {
        // HAS_DECOMPOSITION | IS_EXPANSION, info = 3 (offset into expansion table)
        let v = HAS_DECOMPOSITION | IS_EXPANSION | 0x0003;
        // expansion_table[3] = 2 (length), followed by data at [4] and [5].
        let expansion_table: [u32; 6] = [0, 0, 0, 2, 0x0000_0041, 0x0000_0042];
        assert_eq!(
            decode_decomp(v, &expansion_table),
            DecompResult::Expansion {
                offset: 4,
                length: 2
            }
        );
    }

    #[test]
    fn test_ccc_from_trie_value_with_other_bits() {
        // CCC = 202, plus HAS_DECOMPOSITION and DECOMP_INFO bits
        let v = HAS_DECOMPOSITION | (202u32 << 16) | 0x1234;
        assert_eq!(ccc_from_trie_value(v), 202);
    }

    // -----------------------------------------------------------------------
    // Integration tests against real generated data
    // -----------------------------------------------------------------------

    #[test]
    fn test_compose_pair_a_grave() {
        // 'A' (U+0041) + U+0300 (COMBINING GRAVE ACCENT) -> U+00C0 (LATIN CAPITAL LETTER A WITH GRAVE)
        let result = compose_pair('A', '\u{0300}');
        assert_eq!(result, Some('\u{00C0}'));
    }

    #[test]
    fn test_compose_pair_e_acute() {
        // 'E' (U+0045) + U+0301 (COMBINING ACUTE ACCENT) -> U+00C9 (LATIN CAPITAL LETTER E WITH ACUTE)
        let result = compose_pair('E', '\u{0301}');
        assert_eq!(result, Some('\u{00C9}'));
    }

    #[test]
    fn test_compose_pair_nonexistent() {
        // 'Z' + U+0300 -- check if it exists; if it doesn't, should return None.
        // (In Unicode, Z + grave does not compose to a precomposed character.)
        let result = compose_pair('Z', '\u{0300}');
        assert_eq!(result, Option::None);
    }

    #[test]
    fn test_compose_pair_non_composable() {
        // Two random ASCII characters should not compose.
        assert_eq!(compose_pair('x', 'y'), Option::None);
    }

    #[test]
    fn test_lookup_ccc_grave_accent() {
        // U+0300 COMBINING GRAVE ACCENT has CCC = 230.
        assert_eq!(lookup_ccc('\u{0300}'), 230);
    }

    #[test]
    fn test_lookup_ccc_cedilla() {
        // U+0327 COMBINING CEDILLA has CCC = 202.
        assert_eq!(lookup_ccc('\u{0327}'), 202);
    }

    #[test]
    fn test_lookup_ccc_ascii() {
        // ASCII 'A' has CCC = 0.
        assert_eq!(lookup_ccc('A'), 0);
    }

    #[test]
    fn test_canonical_decomp_ascii() {
        // ASCII 'A' should have no canonical decomposition.
        let (decomp, ccc) = lookup_canonical_decomp('A');
        assert_eq!(decomp, DecompResult::None);
        assert_eq!(ccc, 0);
    }

    #[test]
    fn test_canonical_decomp_a_grave() {
        // U+00C0 (LATIN CAPITAL LETTER A WITH GRAVE) decomposes canonically
        // to 'A' (U+0041) + U+0300.
        let (decomp, _ccc) = lookup_canonical_decomp('\u{00C0}');
        match decomp {
            DecompResult::Expansion { offset, length } => {
                let data = canonical_expansion_data(offset, length);
                // Should be 2 entries: A (CCC=0) and U+0300 (CCC=230)
                assert_eq!(data.len(), 2);
                assert_eq!(data[0] & EXPANSION_CP_MASK, 0x0041); // 'A'
                assert_eq!(data[0] >> EXPANSION_CCC_SHIFT, 0); // CCC=0
                assert_eq!(data[1] & EXPANSION_CP_MASK, 0x0300); // combining grave accent
                assert_eq!(data[1] >> EXPANSION_CCC_SHIFT, 230); // CCC=230
            },
            DecompResult::Singleton(ch) => {
                // Some generators produce singleton for single-char decomp.
                // But U+00C0 decomposes to two characters, so this shouldn't happen.
                panic!("Expected Expansion for U+00C0, got Singleton({ch:?})");
            },
            DecompResult::None => {
                panic!("Expected decomposition for U+00C0, got None");
            },
        }
    }

    #[test]
    fn test_nfc_qc_ascii() {
        // ASCII characters are NFC quick-check Yes (0).
        assert_eq!(lookup_nfc_qc('A'), 0);
        assert_eq!(lookup_nfc_qc('z'), 0);
    }

    #[test]
    fn test_nfd_qc_ascii() {
        // ASCII characters are NFD quick-check Yes (0).
        assert_eq!(lookup_nfd_qc('A'), 0);
    }

    #[test]
    fn test_nfd_qc_precomposed() {
        // U+00C0 (A-grave) is NFD_QC = No (2) -- it has a canonical decomposition.
        let v = lookup_nfd_qc('\u{00C0}');
        assert_eq!(v, 2);
    }

    #[test]
    fn test_nfc_qc_combining_mark() {
        // U+0300 (COMBINING GRAVE ACCENT) -- not quick-check Yes (either Maybe or No
        // depending on generator encoding).
        let v = lookup_nfc_qc('\u{0300}');
        assert_ne!(v, 0, "U+0300 should not be NFC_QC=Yes");
    }

    #[test]
    fn test_nfkd_qc_ascii() {
        // ASCII is NFKD_QC Yes.
        assert_eq!(lookup_nfkd_qc('a'), 0);
    }

    #[test]
    fn test_nfkc_qc_ascii() {
        // ASCII is NFKC_QC Yes.
        assert_eq!(lookup_nfkc_qc('a'), 0);
    }

    #[test]
    fn test_trie_constructors_dont_panic() {
        // Smoke test: constructing each trie should not panic.
        let _ = canonical_trie();
        let _ = compat_trie();
        let _ = ccc_trie();
        let _ = nfc_qc_trie();
        let _ = nfd_qc_trie();
        let _ = nfkc_qc_trie();
        let _ = nfkd_qc_trie();
    }

    #[test]
    fn test_backward_combining_and_non_round_trip_bits() {
        // Verify the bit constants don't overlap and are correctly positioned.
        assert_eq!(BACKWARD_COMBINING & NON_ROUND_TRIP, 0);
        assert_eq!(BACKWARD_COMBINING & HAS_DECOMPOSITION, 0);
        assert_eq!(NON_ROUND_TRIP & HAS_DECOMPOSITION, 0);
        assert_eq!(HAS_DECOMPOSITION & IS_EXPANSION, 0);
        assert_eq!(IS_EXPANSION & CCC_MASK, 0);
        assert_eq!(CCC_MASK & DECOMP_INFO_MASK, 0);
        assert_eq!(NEEDS_STARTER_SHADOW & BACKWARD_COMBINING, 0);
        assert_eq!(NEEDS_STARTER_SHADOW & NON_ROUND_TRIP, 0);
        assert_eq!(NEEDS_STARTER_SHADOW & HAS_DECOMPOSITION, 0);
        assert_eq!(NEEDS_STARTER_SHADOW & IS_EXPANSION, 0);
        assert_eq!(NEEDS_STARTER_SHADOW & CCC_MASK, 0);
        assert_eq!(NEEDS_STARTER_SHADOW & DECOMP_INFO_MASK, 0);
    }

    #[test]
    fn pipelined_trie_walk_matches_serial() {
        use crate::decompose::DecompForm;
        for cp0 in (0u32..=0xFFFE).step_by(2) {
            let cp1 = cp0 + 1;
            if (0xD800..=0xDFFF).contains(&cp0) || (0xD800..=0xDFFF).contains(&cp1) {
                continue;
            }
            let ch0 = char::from_u32(cp0).unwrap();
            let ch1 = char::from_u32(cp1).unwrap();
            let serial = [
                raw_decomp_trie_value(ch0, DecompForm::Canonical),
                raw_decomp_trie_value(ch1, DecompForm::Canonical),
            ];
            let pipelined =
                raw_decomp_trie_values_pipelined::<2>(&[cp0, cp1], DecompForm::Canonical);
            assert_eq!(pipelined, serial, "mismatch at cp=U+{:04X}", cp0);
        }
    }
}