Skip to main content

simd_normalizer/
matching.rs

1//! Fused normalization pipeline for case-insensitive, confusable-aware matching.
2//!
3//! Pipeline: **NFKC → CaseFold → Confusable Skeleton** (NFD → confusable_map → NFD).
4//!
5//! Two strings that produce the same [`normalize_for_matching`] output are
6//! equivalent for matching purposes: they share the same compatibility
7//! decomposition, the same case folding, and the same confusable prototype.
8//!
9//! ## Optimization summary (Component E)
10//!
11//! The matching pipeline composes four conceptually-distinct stages
12//! (`NFKC → casefold → skeleton → casefold`). A naive implementation walks
13//! the input four times with three string allocations between stages. We
14//! preserve that staged structure for correctness — full-fusion attempts
15//! produced subtle parity divergences against the legacy chain on
16//! cross-codepoint canonical reorder cases — but every individual stage is
17//! optimized:
18//!
19//! * **NFKC** is the existing fused decomposer/composer (Component D),
20//!   running at peak SIMD throughput on the hot ASCII / Latin-1 path.
21//! * **Casefold** has a SIMD-driven ASCII fast path that scans 64-byte
22//!   chunks for non-ASCII / uppercase bytes and lowercases via `b | 0x20`,
23//!   avoiding per-byte trie lookups on pure-ASCII regions
24//!   (see [`mod@crate::casefold`]).
25//! * **Skeleton** uses a 256-byte bloom filter to skip the binary search
26//!   into the confusable mapping table for the vast majority of codepoints
27//!   that have no mapping (see `tables::confusable_bloom_might_contain`,
28//!   wired into [`crate::confusable::skeleton`]).
29//! * **Outer fixed-point** loop runs at most 4 iterations; in practice it
30//!   converges after 1.
31
32use alloc::string::String;
33use alloc::vec::Vec;
34
35use crate::casefold::{self, CaseFoldMode};
36use crate::confusable;
37
38/// Options for the matching normalization pipeline.
39#[derive(Clone, Copy, Debug, PartialEq, Eq)]
40pub struct MatchingOptions {
41    /// Case folding mode. Defaults to [`CaseFoldMode::Standard`].
42    pub case_fold: CaseFoldMode,
43}
44
45impl Default for MatchingOptions {
46    fn default() -> Self {
47        MatchingOptions {
48            case_fold: CaseFoldMode::Standard,
49        }
50    }
51}
52
53/// Normalize input for matching: NFKC → CaseFold → Confusable Skeleton.
54///
55/// Returns a canonical matching form where:
56/// - Compatibility equivalents are unified (NFKC)
57/// - Case differences are eliminated (Unicode case folding)
58/// - Visually confusable characters map to the same prototype (UTS #39 skeleton)
59///
60/// Two strings produce the same result if and only if they should be
61/// treated as equivalent for keyword detection and anti-spoofing.
62///
63/// # Examples
64///
65/// ```
66/// use simd_normalizer::matching::{normalize_for_matching, MatchingOptions};
67///
68/// let opts = MatchingOptions::default();
69///
70/// // Case folding
71/// assert_eq!(
72///     normalize_for_matching("File", &opts),
73///     normalize_for_matching("file", &opts),
74/// );
75///
76/// // Turkish dotless-I
77/// assert_eq!(
78///     normalize_for_matching("file", &opts),
79///     normalize_for_matching("f\u{0131}le", &opts),
80/// );
81/// ```
82pub fn normalize_for_matching(input: &str, opts: &MatchingOptions) -> String {
83    if input.is_empty() {
84        return String::new();
85    }
86
87    // Iterate the full pipeline to a fixed point. Each `one_pass` is a
88    // NFKC → casefold → skeleton → casefold chain; convergence typically
89    // occurs in 1–2 outer iterations.
90    let mut current = one_pass(input, opts);
91    for _ in 0..3 {
92        let next = one_pass(&current, opts);
93        if next == current {
94            return current;
95        }
96        current = next;
97    }
98    current
99}
100
101/// Single pass of the matching pipeline: NFKC → casefold → skeleton → casefold.
102///
103/// The NFKC-first ordering is parity-critical. NFKC canonically composes
104/// before casefold, hiding code points like U+0345 (COMBINING GREEK
105/// YPOGEGRAMMENI) inside precomposed starters (e.g. U+1F80 `ᾀ`). A per-char
106/// pipeline that decomposed first and casefolded the exposed combining mark
107/// (→ U+03B9) would produce a different skeleton.
108///
109/// Per-stage optimizations are documented in the module-level comment.
110#[inline]
111fn one_pass(input: &str, opts: &MatchingOptions) -> String {
112    let nfkc = crate::nfkc().normalize(input);
113    let folded = casefold::casefold(&nfkc, opts.case_fold);
114    let skel = confusable::skeleton(&folded);
115    let final_folded = casefold::casefold(&skel, opts.case_fold);
116    final_folded.into_owned()
117}
118
119/// Reference implementation of the matching pipeline, preserved for parity
120/// testing against any alternative composition order.
121#[cfg(any(test, feature = "internal-test-api"))]
122pub fn normalize_for_matching_legacy(input: &str, opts: &MatchingOptions) -> String {
123    if input.is_empty() {
124        return String::new();
125    }
126    let mut current = one_pass_legacy(input, opts);
127    for _ in 0..3 {
128        let next = one_pass_legacy(&current, opts);
129        if next == current {
130            return current;
131        }
132        current = next;
133    }
134    current
135}
136
137/// Single legacy pass: NFKC → casefold → skeleton → casefold.
138#[cfg(any(test, feature = "internal-test-api"))]
139fn one_pass_legacy(input: &str, opts: &MatchingOptions) -> String {
140    let nfkc = crate::nfkc().normalize(input);
141    let folded = casefold::casefold(&nfkc, opts.case_fold);
142    let skel = confusable::skeleton(&folded);
143    let final_folded = casefold::casefold(&skel, opts.case_fold);
144    final_folded.into_owned()
145}
146
147/// Normalize input for matching and encode the result as UTF-16.
148///
149/// Useful for interoperability with systems that use UTF-16 keyword tables.
150pub fn normalize_for_matching_utf16(input: &str, opts: &MatchingOptions) -> Vec<u16> {
151    normalize_for_matching(input, opts).encode_utf16().collect()
152}
153
154/// Check whether two strings match after full normalization.
155///
156/// Returns `true` if both strings produce the same matching form after
157/// NFKC normalization, case folding, and confusable skeleton mapping.
158///
159/// # Examples
160///
161/// ```
162/// use simd_normalizer::matching::{matches_normalized, MatchingOptions};
163///
164/// let opts = MatchingOptions::default();
165///
166/// // "File" and "file" match (case folding)
167/// assert!(matches_normalized("File", "file", &opts));
168///
169/// // Latin 'a' and Cyrillic 'а' match (confusable mapping)
170/// assert!(matches_normalized("a", "\u{0430}", &opts));
171/// ```
172pub fn matches_normalized(a: &str, b: &str, opts: &MatchingOptions) -> bool {
173    // Fast path: identical strings always match.
174    if a == b {
175        return true;
176    }
177    normalize_for_matching(a, opts) == normalize_for_matching(b, opts)
178}
179
180#[cfg(test)]
181mod tests {
182    use super::*;
183    use crate::tables;
184
185    fn default_opts() -> MatchingOptions {
186        MatchingOptions::default()
187    }
188
189    fn turkish_opts() -> MatchingOptions {
190        MatchingOptions {
191            case_fold: CaseFoldMode::Turkish,
192        }
193    }
194
195    // ---- Bloom filter coverage ----
196
197    #[test]
198    fn confusable_bloom_covers_every_source() {
199        // Every source codepoint in the confusable mapping table must hash
200        // to a set bit in the bloom. False negatives are unsound (would
201        // skip required mappings); false positives are fine.
202        for &(source_cp, _) in tables::confusable::CONFUSABLE_MAPPINGS {
203            assert!(
204                tables::confusable_bloom_might_contain(source_cp),
205                "confusable source U+{:06X} hashed to a clear bit",
206                source_cp,
207            );
208        }
209    }
210
211    // ---- Basic tests ----
212
213    #[test]
214    fn empty_input() {
215        assert_eq!(normalize_for_matching("", &default_opts()), "");
216    }
217
218    #[test]
219    fn ascii_lowercase_unchanged() {
220        let result = normalize_for_matching("hello", &default_opts());
221        assert!(!result.is_empty());
222    }
223
224    #[test]
225    fn identical_strings_match() {
226        assert!(matches_normalized("test", "test", &default_opts()));
227    }
228
229    #[test]
230    fn different_strings_dont_match() {
231        assert!(!matches_normalized("hello", "world", &default_opts()));
232    }
233
234    // ---- Case folding tests ----
235
236    #[test]
237    fn case_insensitive_ascii() {
238        let opts = default_opts();
239        assert!(matches_normalized("File", "file", &opts));
240        assert!(matches_normalized("FILE", "file", &opts));
241        assert!(matches_normalized("FiLe", "file", &opts));
242    }
243
244    #[test]
245    fn case_insensitive_extended() {
246        let opts = default_opts();
247        // Ö (U+00D6) case folds to ö (U+00F6)
248        assert!(matches_normalized("Ströme", "ströme", &opts));
249    }
250
251    // ---- Confusable detection tests ----
252
253    #[test]
254    fn confusable_latin_cyrillic_a() {
255        let opts = default_opts();
256        // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430)
257        assert!(matches_normalized("a", "\u{0430}", &opts));
258    }
259
260    #[test]
261    fn confusable_latin_cyrillic_word() {
262        let opts = default_opts();
263        // "apple" in Latin vs mixed Latin/Cyrillic
264        // Cyrillic: а=U+0430, р=U+0440, е=U+0435
265        let latin = "apple";
266        let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
267        assert!(matches_normalized(latin, mixed, &opts));
268    }
269
270    // ---- Combined case + confusable tests (the key requirement) ----
271
272    #[test]
273    fn file_variants_all_match() {
274        let opts = default_opts();
275        let canonical = normalize_for_matching("file", &opts);
276
277        // Case variant
278        assert_eq!(normalize_for_matching("File", &opts), canonical);
279        assert_eq!(normalize_for_matching("FILE", &opts), canonical);
280
281        // Turkish dotless-ı (U+0131) — in standard mode, ı case-folds to itself (ı),
282        // but it's confusable with 'i' via the confusable mapping.
283        // The matching pipeline handles this through the confusable skeleton step.
284        let fıle = "f\u{0131}le";
285        assert!(
286            matches_normalized("file", fıle, &opts),
287            "'file' and 'fıle' should match: file={:?}, fıle={:?}",
288            normalize_for_matching("file", &opts),
289            normalize_for_matching(fıle, &opts),
290        );
291    }
292
293    #[test]
294    fn file_mixed_case_and_confusable() {
295        let opts = default_opts();
296        // "FıLE" — uppercase + Turkish dotless-ı
297        let input = "F\u{0131}LE";
298        assert!(
299            matches_normalized("file", input, &opts),
300            "'file' and 'FıLE' should match: file={:?}, FıLE={:?}",
301            normalize_for_matching("file", &opts),
302            normalize_for_matching(input, &opts),
303        );
304    }
305
306    // ---- NFKC compatibility tests ----
307
308    #[test]
309    fn nfkc_fullwidth() {
310        let opts = default_opts();
311        // Fullwidth 'A' (U+FF21) should NFKC-normalize to 'A', then case-fold to 'a'
312        let fullwidth_a = "\u{FF21}";
313        assert!(matches_normalized(fullwidth_a, "a", &opts));
314    }
315
316    #[test]
317    fn nfkc_superscript() {
318        let opts = default_opts();
319        // Superscript '2' (U+00B2) NFKC-normalizes to '2'
320        assert_eq!(
321            normalize_for_matching("\u{00B2}", &opts),
322            normalize_for_matching("2", &opts),
323        );
324    }
325
326    // ---- Turkish mode tests ----
327
328    #[test]
329    fn turkish_mode_dotless_i() {
330        let opts = turkish_opts();
331        // In Turkish mode: I → ı (U+0131), not i
332        // So "Istanbul" in Turkish mode has ı as first char
333        let a = normalize_for_matching("Istanbul", &opts);
334        let b = normalize_for_matching("\u{0131}stanbul", &opts);
335        assert_eq!(a, b);
336    }
337
338    #[test]
339    fn turkish_mode_dotted_i() {
340        let opts = turkish_opts();
341        // In Turkish mode: İ (U+0130) → i
342        assert!(matches_normalized("\u{0130}stanbul", "istanbul", &opts));
343    }
344
345    // ---- UTF-16 encoding test ----
346
347    #[test]
348    fn utf16_encoding() {
349        let opts = default_opts();
350        let utf16 = normalize_for_matching_utf16("hello", &opts);
351        assert!(!utf16.is_empty());
352        // Should round-trip back to a valid string
353        let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
354        assert_eq!(decoded, normalize_for_matching("hello", &opts));
355    }
356
357    #[test]
358    fn utf16_supplementary() {
359        let opts = default_opts();
360        // U+1F600 (emoji) — supplementary character, encodes as surrogate pair in UTF-16
361        let utf16 = normalize_for_matching_utf16("\u{1F600}", &opts);
362        assert!(!utf16.is_empty());
363        let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
364        assert_eq!(decoded, normalize_for_matching("\u{1F600}", &opts));
365    }
366
367    // ---- Stability tests ----
368
369    #[test]
370    fn matching_idempotent() {
371        let opts = default_opts();
372        let inputs = [
373            "hello",
374            "File",
375            "\u{0430}\u{0440}\u{0440}l\u{0435}",
376            "\u{00C0}",
377        ];
378        for input in &inputs {
379            let once = normalize_for_matching(input, &opts);
380            let twice = normalize_for_matching(&once, &opts);
381            assert_eq!(
382                once, twice,
383                "normalize_for_matching should be idempotent for {:?}",
384                input
385            );
386        }
387    }
388
389    #[test]
390    fn matching_not_confusable_different_words() {
391        let opts = default_opts();
392        assert!(!matches_normalized("hello", "world", &opts));
393        assert!(!matches_normalized("file", "pile", &opts));
394    }
395
396    // ---- Parity with legacy implementation ----
397
398    #[test]
399    fn fused_matches_legacy_on_fixtures() {
400        let opts = default_opts();
401        let fixtures = [
402            "",
403            "hello",
404            "File",
405            "FILE",
406            "FiLe",
407            "Ströme",
408            "ströme",
409            "a",
410            "\u{0430}",
411            "\u{0430}\u{0440}\u{0440}l\u{0435}",
412            "f\u{0131}le",
413            "F\u{0131}LE",
414            "\u{FF21}",
415            "\u{00B2}",
416            "\u{00C0}",
417            "Hel\u{0430}",
418            "\u{1D0E}\u{326}\u{306}",
419            "\u{1F600}",
420            "Istanbul",
421            "test mixing\u{0430}cyrillic",
422            // Long input mixing scripts:
423            "The quick brown FOX jumps over the lazy DOG (Привет, Мир!) Καλημέρα",
424        ];
425        for input in &fixtures {
426            let fused = normalize_for_matching(input, &opts);
427            let legacy = normalize_for_matching_legacy(input, &opts);
428            assert_eq!(
429                fused, legacy,
430                "fused vs legacy diverged for {:?}: fused={:?}, legacy={:?}",
431                input, fused, legacy,
432            );
433        }
434    }
435
436    #[test]
437    fn fused_matches_legacy_turkish() {
438        let opts = turkish_opts();
439        let fixtures = [
440            "Istanbul",
441            "\u{0130}stanbul",
442            "\u{0131}stanbul",
443            "FILE",
444            "fıle",
445        ];
446        for input in &fixtures {
447            let fused = normalize_for_matching(input, &opts);
448            let legacy = normalize_for_matching_legacy(input, &opts);
449            assert_eq!(
450                fused, legacy,
451                "fused vs legacy diverged for {:?} (Turkish): fused={:?}, legacy={:?}",
452                input, fused, legacy,
453            );
454        }
455    }
456}