simd-normalizer 0.1.1

SIMD-accelerated Unicode normalization (NFC, NFD, NFKC, NFKD)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
//! Fused normalization pipeline for case-insensitive, confusable-aware matching.
//!
//! Pipeline: **NFKC → CaseFold → Confusable Skeleton** (NFD → confusable_map → NFD).
//!
//! Two strings that produce the same [`normalize_for_matching`] output are
//! equivalent for matching purposes: they share the same compatibility
//! decomposition, the same case folding, and the same confusable prototype.
//!
//! ## Optimization summary (Component E)
//!
//! The matching pipeline composes four conceptually-distinct stages
//! (`NFKC → casefold → skeleton → casefold`). A naive implementation walks
//! the input four times with three string allocations between stages. We
//! preserve that staged structure for correctness — full-fusion attempts
//! produced subtle parity divergences against the legacy chain on
//! cross-codepoint canonical reorder cases — but every individual stage is
//! optimized:
//!
//! * **NFKC** is the existing fused decomposer/composer (Component D),
//!   running at peak SIMD throughput on the hot ASCII / Latin-1 path.
//! * **Casefold** has a SIMD-driven ASCII fast path that scans 64-byte
//!   chunks for non-ASCII / uppercase bytes and lowercases via `b | 0x20`,
//!   avoiding per-byte trie lookups on pure-ASCII regions
//!   (see [`mod@crate::casefold`]).
//! * **Skeleton** uses a 256-byte bloom filter to skip the binary search
//!   into the confusable mapping table for the vast majority of codepoints
//!   that have no mapping (see `tables::confusable_bloom_might_contain`,
//!   wired into [`crate::confusable::skeleton`]).
//! * **Outer fixed-point** loop runs at most 4 iterations; in practice it
//!   converges after 1.

use alloc::string::String;
use alloc::vec::Vec;

use crate::casefold::{self, CaseFoldMode};
use crate::confusable;

/// Options for the matching normalization pipeline.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct MatchingOptions {
    /// Case folding mode. Defaults to [`CaseFoldMode::Standard`].
    pub case_fold: CaseFoldMode,
}

impl Default for MatchingOptions {
    fn default() -> Self {
        MatchingOptions {
            case_fold: CaseFoldMode::Standard,
        }
    }
}

/// Normalize input for matching: NFKC → CaseFold → Confusable Skeleton.
///
/// Returns a canonical matching form where:
/// - Compatibility equivalents are unified (NFKC)
/// - Case differences are eliminated (Unicode case folding)
/// - Visually confusable characters map to the same prototype (UTS #39 skeleton)
///
/// Two strings produce the same result if and only if they should be
/// treated as equivalent for keyword detection and anti-spoofing.
///
/// # Examples
///
/// ```
/// use simd_normalizer::matching::{normalize_for_matching, MatchingOptions};
///
/// let opts = MatchingOptions::default();
///
/// // Case folding
/// assert_eq!(
///     normalize_for_matching("File", &opts),
///     normalize_for_matching("file", &opts),
/// );
///
/// // Turkish dotless-I
/// assert_eq!(
///     normalize_for_matching("file", &opts),
///     normalize_for_matching("f\u{0131}le", &opts),
/// );
/// ```
pub fn normalize_for_matching(input: &str, opts: &MatchingOptions) -> String {
    if input.is_empty() {
        return String::new();
    }

    // Iterate the full pipeline to a fixed point. Each `one_pass` is a
    // NFKC → casefold → skeleton → casefold chain; convergence typically
    // occurs in 1–2 outer iterations.
    let mut current = one_pass(input, opts);
    for _ in 0..3 {
        let next = one_pass(&current, opts);
        if next == current {
            return current;
        }
        current = next;
    }
    current
}

/// Single pass of the matching pipeline: NFKC → casefold → skeleton → casefold.
///
/// The NFKC-first ordering is parity-critical. NFKC canonically composes
/// before casefold, hiding code points like U+0345 (COMBINING GREEK
/// YPOGEGRAMMENI) inside precomposed starters (e.g. U+1F80 `ᾀ`). A per-char
/// pipeline that decomposed first and casefolded the exposed combining mark
/// (→ U+03B9) would produce a different skeleton.
///
/// Per-stage optimizations are documented in the module-level comment.
#[inline]
fn one_pass(input: &str, opts: &MatchingOptions) -> String {
    let nfkc = crate::nfkc().normalize(input);
    let folded = casefold::casefold(&nfkc, opts.case_fold);
    let skel = confusable::skeleton(&folded);
    let final_folded = casefold::casefold(&skel, opts.case_fold);
    final_folded.into_owned()
}

/// Reference implementation of the matching pipeline, preserved for parity
/// testing against any alternative composition order.
#[cfg(any(test, feature = "internal-test-api"))]
pub fn normalize_for_matching_legacy(input: &str, opts: &MatchingOptions) -> String {
    if input.is_empty() {
        return String::new();
    }
    let mut current = one_pass_legacy(input, opts);
    for _ in 0..3 {
        let next = one_pass_legacy(&current, opts);
        if next == current {
            return current;
        }
        current = next;
    }
    current
}

/// Single legacy pass: NFKC → casefold → skeleton → casefold.
#[cfg(any(test, feature = "internal-test-api"))]
fn one_pass_legacy(input: &str, opts: &MatchingOptions) -> String {
    let nfkc = crate::nfkc().normalize(input);
    let folded = casefold::casefold(&nfkc, opts.case_fold);
    let skel = confusable::skeleton(&folded);
    let final_folded = casefold::casefold(&skel, opts.case_fold);
    final_folded.into_owned()
}

/// Normalize input for matching and encode the result as UTF-16.
///
/// Useful for interoperability with systems that use UTF-16 keyword tables.
pub fn normalize_for_matching_utf16(input: &str, opts: &MatchingOptions) -> Vec<u16> {
    normalize_for_matching(input, opts).encode_utf16().collect()
}

/// Check whether two strings match after full normalization.
///
/// Returns `true` if both strings produce the same matching form after
/// NFKC normalization, case folding, and confusable skeleton mapping.
///
/// # Examples
///
/// ```
/// use simd_normalizer::matching::{matches_normalized, MatchingOptions};
///
/// let opts = MatchingOptions::default();
///
/// // "File" and "file" match (case folding)
/// assert!(matches_normalized("File", "file", &opts));
///
/// // Latin 'a' and Cyrillic 'а' match (confusable mapping)
/// assert!(matches_normalized("a", "\u{0430}", &opts));
/// ```
pub fn matches_normalized(a: &str, b: &str, opts: &MatchingOptions) -> bool {
    // Fast path: identical strings always match.
    if a == b {
        return true;
    }
    normalize_for_matching(a, opts) == normalize_for_matching(b, opts)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tables;

    fn default_opts() -> MatchingOptions {
        MatchingOptions::default()
    }

    fn turkish_opts() -> MatchingOptions {
        MatchingOptions {
            case_fold: CaseFoldMode::Turkish,
        }
    }

    // ---- Bloom filter coverage ----

    #[test]
    fn confusable_bloom_covers_every_source() {
        // Every source codepoint in the confusable mapping table must hash
        // to a set bit in the bloom. False negatives are unsound (would
        // skip required mappings); false positives are fine.
        for &(source_cp, _) in tables::confusable::CONFUSABLE_MAPPINGS {
            assert!(
                tables::confusable_bloom_might_contain(source_cp),
                "confusable source U+{:06X} hashed to a clear bit",
                source_cp,
            );
        }
    }

    // ---- Basic tests ----

    #[test]
    fn empty_input() {
        assert_eq!(normalize_for_matching("", &default_opts()), "");
    }

    #[test]
    fn ascii_lowercase_unchanged() {
        let result = normalize_for_matching("hello", &default_opts());
        assert!(!result.is_empty());
    }

    #[test]
    fn identical_strings_match() {
        assert!(matches_normalized("test", "test", &default_opts()));
    }

    #[test]
    fn different_strings_dont_match() {
        assert!(!matches_normalized("hello", "world", &default_opts()));
    }

    // ---- Case folding tests ----

    #[test]
    fn case_insensitive_ascii() {
        let opts = default_opts();
        assert!(matches_normalized("File", "file", &opts));
        assert!(matches_normalized("FILE", "file", &opts));
        assert!(matches_normalized("FiLe", "file", &opts));
    }

    #[test]
    fn case_insensitive_extended() {
        let opts = default_opts();
        // Ö (U+00D6) case folds to ö (U+00F6)
        assert!(matches_normalized("Ströme", "ströme", &opts));
    }

    // ---- Confusable detection tests ----

    #[test]
    fn confusable_latin_cyrillic_a() {
        let opts = default_opts();
        // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430)
        assert!(matches_normalized("a", "\u{0430}", &opts));
    }

    #[test]
    fn confusable_latin_cyrillic_word() {
        let opts = default_opts();
        // "apple" in Latin vs mixed Latin/Cyrillic
        // Cyrillic: а=U+0430, р=U+0440, е=U+0435
        let latin = "apple";
        let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
        assert!(matches_normalized(latin, mixed, &opts));
    }

    // ---- Combined case + confusable tests (the key requirement) ----

    #[test]
    fn file_variants_all_match() {
        let opts = default_opts();
        let canonical = normalize_for_matching("file", &opts);

        // Case variant
        assert_eq!(normalize_for_matching("File", &opts), canonical);
        assert_eq!(normalize_for_matching("FILE", &opts), canonical);

        // Turkish dotless-ı (U+0131) — in standard mode, ı case-folds to itself (ı),
        // but it's confusable with 'i' via the confusable mapping.
        // The matching pipeline handles this through the confusable skeleton step.
        let fıle = "f\u{0131}le";
        assert!(
            matches_normalized("file", fıle, &opts),
            "'file' and 'fıle' should match: file={:?}, fıle={:?}",
            normalize_for_matching("file", &opts),
            normalize_for_matching(fıle, &opts),
        );
    }

    #[test]
    fn file_mixed_case_and_confusable() {
        let opts = default_opts();
        // "FıLE" — uppercase + Turkish dotless-ı
        let input = "F\u{0131}LE";
        assert!(
            matches_normalized("file", input, &opts),
            "'file' and 'FıLE' should match: file={:?}, FıLE={:?}",
            normalize_for_matching("file", &opts),
            normalize_for_matching(input, &opts),
        );
    }

    // ---- NFKC compatibility tests ----

    #[test]
    fn nfkc_fullwidth() {
        let opts = default_opts();
        // Fullwidth 'A' (U+FF21) should NFKC-normalize to 'A', then case-fold to 'a'
        let fullwidth_a = "\u{FF21}";
        assert!(matches_normalized(fullwidth_a, "a", &opts));
    }

    #[test]
    fn nfkc_superscript() {
        let opts = default_opts();
        // Superscript '2' (U+00B2) NFKC-normalizes to '2'
        assert_eq!(
            normalize_for_matching("\u{00B2}", &opts),
            normalize_for_matching("2", &opts),
        );
    }

    // ---- Turkish mode tests ----

    #[test]
    fn turkish_mode_dotless_i() {
        let opts = turkish_opts();
        // In Turkish mode: I → ı (U+0131), not i
        // So "Istanbul" in Turkish mode has ı as first char
        let a = normalize_for_matching("Istanbul", &opts);
        let b = normalize_for_matching("\u{0131}stanbul", &opts);
        assert_eq!(a, b);
    }

    #[test]
    fn turkish_mode_dotted_i() {
        let opts = turkish_opts();
        // In Turkish mode: İ (U+0130) → i
        assert!(matches_normalized("\u{0130}stanbul", "istanbul", &opts));
    }

    // ---- UTF-16 encoding test ----

    #[test]
    fn utf16_encoding() {
        let opts = default_opts();
        let utf16 = normalize_for_matching_utf16("hello", &opts);
        assert!(!utf16.is_empty());
        // Should round-trip back to a valid string
        let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
        assert_eq!(decoded, normalize_for_matching("hello", &opts));
    }

    #[test]
    fn utf16_supplementary() {
        let opts = default_opts();
        // U+1F600 (emoji) — supplementary character, encodes as surrogate pair in UTF-16
        let utf16 = normalize_for_matching_utf16("\u{1F600}", &opts);
        assert!(!utf16.is_empty());
        let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
        assert_eq!(decoded, normalize_for_matching("\u{1F600}", &opts));
    }

    // ---- Stability tests ----

    #[test]
    fn matching_idempotent() {
        let opts = default_opts();
        let inputs = [
            "hello",
            "File",
            "\u{0430}\u{0440}\u{0440}l\u{0435}",
            "\u{00C0}",
        ];
        for input in &inputs {
            let once = normalize_for_matching(input, &opts);
            let twice = normalize_for_matching(&once, &opts);
            assert_eq!(
                once, twice,
                "normalize_for_matching should be idempotent for {:?}",
                input
            );
        }
    }

    #[test]
    fn matching_not_confusable_different_words() {
        let opts = default_opts();
        assert!(!matches_normalized("hello", "world", &opts));
        assert!(!matches_normalized("file", "pile", &opts));
    }

    // ---- Parity with legacy implementation ----

    #[test]
    fn fused_matches_legacy_on_fixtures() {
        let opts = default_opts();
        let fixtures = [
            "",
            "hello",
            "File",
            "FILE",
            "FiLe",
            "Ströme",
            "ströme",
            "a",
            "\u{0430}",
            "\u{0430}\u{0440}\u{0440}l\u{0435}",
            "f\u{0131}le",
            "F\u{0131}LE",
            "\u{FF21}",
            "\u{00B2}",
            "\u{00C0}",
            "Hel\u{0430}",
            "\u{1D0E}\u{326}\u{306}",
            "\u{1F600}",
            "Istanbul",
            "test mixing\u{0430}cyrillic",
            // Long input mixing scripts:
            "The quick brown FOX jumps over the lazy DOG (Привет, Мир!) Καλημέρα",
        ];
        for input in &fixtures {
            let fused = normalize_for_matching(input, &opts);
            let legacy = normalize_for_matching_legacy(input, &opts);
            assert_eq!(
                fused, legacy,
                "fused vs legacy diverged for {:?}: fused={:?}, legacy={:?}",
                input, fused, legacy,
            );
        }
    }

    #[test]
    fn fused_matches_legacy_turkish() {
        let opts = turkish_opts();
        let fixtures = [
            "Istanbul",
            "\u{0130}stanbul",
            "\u{0131}stanbul",
            "FILE",
            "fıle",
        ];
        for input in &fixtures {
            let fused = normalize_for_matching(input, &opts);
            let legacy = normalize_for_matching_legacy(input, &opts);
            assert_eq!(
                fused, legacy,
                "fused vs legacy diverged for {:?} (Turkish): fused={:?}, legacy={:?}",
                input, fused, legacy,
            );
        }
    }
}