Skip to main content

simd_normalizer/
matching.rs

//! Fused normalization pipeline for case-insensitive, confusable-aware matching.
//!
//! Pipeline: **NFKC → CaseFold → Confusable Skeleton** (NFD → confusable_map → NFD)
//! **→ CaseFold**. The whole chain is re-applied until its output stops
//! changing, up to a small fixed number of passes.
//!
//! Two strings that produce the same [`normalize_for_matching`] output are
//! equivalent for matching purposes: they share the same compatibility
//! decomposition, the same case folding, and the same confusable prototype.
8
9use alloc::string::String;
10use alloc::vec::Vec;
11
12use crate::casefold::{self, CaseFoldMode};
13use crate::confusable;
14
15/// Options for the matching normalization pipeline.
16#[derive(Clone, Copy, Debug, PartialEq, Eq)]
17pub struct MatchingOptions {
18    /// Case folding mode. Defaults to [`CaseFoldMode::Standard`].
19    pub case_fold: CaseFoldMode,
20}
21
22impl Default for MatchingOptions {
23    fn default() -> Self {
24        MatchingOptions {
25            case_fold: CaseFoldMode::Standard,
26        }
27    }
28}
29
30/// Normalize input for matching: NFKC → CaseFold → Confusable Skeleton.
31///
32/// Returns a canonical matching form where:
33/// - Compatibility equivalents are unified (NFKC)
34/// - Case differences are eliminated (Unicode case folding)
35/// - Visually confusable characters map to the same prototype (UTS #39 skeleton)
36///
37/// Two strings produce the same result if and only if they should be
38/// treated as equivalent for keyword detection and anti-spoofing.
39///
40/// # Examples
41///
42/// ```
43/// use simd_normalizer::matching::{normalize_for_matching, MatchingOptions};
44///
45/// let opts = MatchingOptions::default();
46///
47/// // Case folding
48/// assert_eq!(
49///     normalize_for_matching("File", &opts),
50///     normalize_for_matching("file", &opts),
51/// );
52///
53/// // Turkish dotless-I
54/// assert_eq!(
55///     normalize_for_matching("file", &opts),
56///     normalize_for_matching("f\u{0131}le", &opts),
57/// );
58/// ```
59pub fn normalize_for_matching(input: &str, opts: &MatchingOptions) -> String {
60    if input.is_empty() {
61        return String::new();
62    }
63
64    // Iterate the full pipeline to a fixed point. Each `one_pass` is a
65    // NFKC → casefold → skeleton → casefold chain; convergence typically
66    // occurs in 1–2 outer iterations.
67    let mut current = one_pass(input, opts);
68    for _ in 0..3 {
69        let next = one_pass(&current, opts);
70        if next == current {
71            return current;
72        }
73        current = next;
74    }
75    current
76}
77
78/// Single pass of the matching pipeline: NFKC → casefold → skeleton → casefold.
79///
80/// The NFKC-first ordering is parity-critical. NFKC canonically composes
81/// before casefold, hiding code points like U+0345 (COMBINING GREEK
82/// YPOGEGRAMMENI) inside precomposed starters (e.g. U+1F80 `ᾀ`). A per-char
83/// pipeline that decomposed first and casefolded the exposed combining mark
84/// (→ U+03B9) would produce a different skeleton. The
85/// `normalize_for_matching_legacy` reference function and
86/// `tests/perf_regression.rs` are kept as regression infrastructure for any
87/// future change to this ordering.
88fn one_pass(input: &str, opts: &MatchingOptions) -> String {
89    let nfkc = crate::nfkc().normalize(input);
90    let folded = casefold::casefold(&nfkc, opts.case_fold);
91    let skel = confusable::skeleton(&folded);
92    let final_folded = casefold::casefold(&skel, opts.case_fold);
93    final_folded.into_owned()
94}
95
/// Reference version of the matching pipeline, retained so that any
/// alternative composition order can be parity-tested against it.
///
/// Applies `one_pass_legacy` once and then up to three more times, stopping
/// as soon as a pass leaves the text unchanged. An empty `input` yields an
/// empty `String`.
#[cfg(any(test, feature = "internal-test-api"))]
pub fn normalize_for_matching_legacy(input: &str, opts: &MatchingOptions) -> String {
    if input.is_empty() {
        return String::new();
    }

    let mut current_form = one_pass_legacy(input, opts);
    for _pass in 1..=3 {
        let refined = one_pass_legacy(&current_form, opts);
        if refined != current_form {
            // Still changing: adopt the new text and try again.
            current_form = refined;
            continue;
        }
        return current_form;
    }
    current_form
}
113
/// One pass of the legacy pipeline: NFKC, case folding, confusable skeleton,
/// then case folding again, each stage consuming the previous stage's output.
#[cfg(any(test, feature = "internal-test-api"))]
fn one_pass_legacy(input: &str, opts: &MatchingOptions) -> String {
    // Each stage shadows the previous binding; the earlier values stay alive
    // until the end of the function, so later stages may borrow from them.
    let text = crate::nfkc().normalize(input);
    let text = casefold::casefold(&text, opts.case_fold);
    let text = confusable::skeleton(&text);
    let text = casefold::casefold(&text, opts.case_fold);
    text.into_owned()
}
123
124/// Normalize input for matching and encode the result as UTF-16.
125///
126/// Useful for interoperability with systems that use UTF-16 keyword tables.
127pub fn normalize_for_matching_utf16(input: &str, opts: &MatchingOptions) -> Vec<u16> {
128    normalize_for_matching(input, opts).encode_utf16().collect()
129}
130
131/// Check whether two strings match after full normalization.
132///
133/// Returns `true` if both strings produce the same matching form after
134/// NFKC normalization, case folding, and confusable skeleton mapping.
135///
136/// # Examples
137///
138/// ```
139/// use simd_normalizer::matching::{matches_normalized, MatchingOptions};
140///
141/// let opts = MatchingOptions::default();
142///
143/// // "File" and "file" match (case folding)
144/// assert!(matches_normalized("File", "file", &opts));
145///
146/// // Latin 'a' and Cyrillic 'а' match (confusable mapping)
147/// assert!(matches_normalized("a", "\u{0430}", &opts));
148/// ```
149pub fn matches_normalized(a: &str, b: &str, opts: &MatchingOptions) -> bool {
150    // Fast path: identical strings always match.
151    if a == b {
152        return true;
153    }
154    normalize_for_matching(a, opts) == normalize_for_matching(b, opts)
155}
156
#[cfg(test)]
mod tests {
    use super::*;

    /// Options with the default (standard) case folding mode.
    fn default_opts() -> MatchingOptions {
        MatchingOptions::default()
    }

    /// Options with Turkish case folding.
    fn turkish_opts() -> MatchingOptions {
        MatchingOptions {
            case_fold: CaseFoldMode::Turkish,
        }
    }

    // ---- Basic tests ----

    #[test]
    fn empty_input() {
        assert_eq!(normalize_for_matching("", &default_opts()), "");
    }

    #[test]
    fn ascii_lowercase_unchanged() {
        // NOTE(review): despite the name, this only checks that the output is
        // non-empty; it does not pin the output to "hello". Tighten it once
        // the expected skeleton of plain ASCII text is confirmed.
        let result = normalize_for_matching("hello", &default_opts());
        assert!(!result.is_empty());
    }

    #[test]
    fn identical_strings_match() {
        assert!(matches_normalized("test", "test", &default_opts()));
    }

    #[test]
    fn different_strings_dont_match() {
        assert!(!matches_normalized("hello", "world", &default_opts()));
    }

    // ---- Case folding tests ----

    #[test]
    fn case_insensitive_ascii() {
        let opts = default_opts();
        assert!(matches_normalized("File", "file", &opts));
        assert!(matches_normalized("FILE", "file", &opts));
        assert!(matches_normalized("FiLe", "file", &opts));
    }

    #[test]
    fn case_insensitive_extended() {
        let opts = default_opts();
        // ASCII case difference next to a non-ASCII letter; both inputs
        // already contain lowercase ö, so only `S`/`s` is folded here.
        assert!(matches_normalized("Ströme", "ströme", &opts));
        // Ö (U+00D6) case folds to ö (U+00F6). Escapes keep the code points
        // explicit so the non-ASCII fold is actually exercised.
        assert!(matches_normalized("STR\u{00D6}ME", "str\u{00F6}me", &opts));
    }

    // ---- Confusable detection tests ----

    #[test]
    fn confusable_latin_cyrillic_a() {
        let opts = default_opts();
        // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430)
        assert!(matches_normalized("a", "\u{0430}", &opts));
    }

    #[test]
    fn confusable_latin_cyrillic_word() {
        let opts = default_opts();
        // "apple" in Latin vs mixed Latin/Cyrillic
        // Cyrillic: а=U+0430, р=U+0440, е=U+0435
        let latin = "apple";
        let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
        assert!(matches_normalized(latin, mixed, &opts));
    }

    // ---- Combined case + confusable tests (the key requirement) ----

    #[test]
    fn file_variants_all_match() {
        let opts = default_opts();
        let canonical = normalize_for_matching("file", &opts);

        // Case variants
        assert_eq!(normalize_for_matching("File", &opts), canonical);
        assert_eq!(normalize_for_matching("FILE", &opts), canonical);

        // Turkish dotless-ı (U+0131) — in standard mode, ı case-folds to itself (ı),
        // but it's confusable with 'i' via the confusable mapping.
        // The matching pipeline handles this through the confusable skeleton step.
        // The local has an ASCII name so the identifier itself is not
        // confusable with `file`.
        let dotless = "f\u{0131}le";
        assert!(
            matches_normalized("file", dotless, &opts),
            "'file' and 'fıle' should match: file={:?}, fıle={:?}",
            normalize_for_matching("file", &opts),
            normalize_for_matching(dotless, &opts),
        );
    }

    #[test]
    fn file_mixed_case_and_confusable() {
        let opts = default_opts();
        // "FıLE" — uppercase + Turkish dotless-ı
        let input = "F\u{0131}LE";
        assert!(
            matches_normalized("file", input, &opts),
            "'file' and 'FıLE' should match: file={:?}, FıLE={:?}",
            normalize_for_matching("file", &opts),
            normalize_for_matching(input, &opts),
        );
    }

    // ---- NFKC compatibility tests ----

    #[test]
    fn nfkc_fullwidth() {
        let opts = default_opts();
        // Fullwidth 'A' (U+FF21) should NFKC-normalize to 'A', then case-fold to 'a'
        let fullwidth_a = "\u{FF21}";
        assert!(matches_normalized(fullwidth_a, "a", &opts));
    }

    #[test]
    fn nfkc_superscript() {
        let opts = default_opts();
        // Superscript '2' (U+00B2) NFKC-normalizes to '2'
        assert_eq!(
            normalize_for_matching("\u{00B2}", &opts),
            normalize_for_matching("2", &opts),
        );
    }

    // ---- Turkish mode tests ----

    #[test]
    fn turkish_mode_dotless_i() {
        let opts = turkish_opts();
        // In Turkish mode: I → ı (U+0131), not i, so "Istanbul" and
        // "ıstanbul" must end up with the same matching form.
        let a = normalize_for_matching("Istanbul", &opts);
        let b = normalize_for_matching("\u{0131}stanbul", &opts);
        assert_eq!(a, b);
    }

    #[test]
    fn turkish_mode_dotted_i() {
        let opts = turkish_opts();
        // In Turkish mode: İ (U+0130) → i
        assert!(matches_normalized("\u{0130}stanbul", "istanbul", &opts));
    }

    // ---- UTF-16 encoding tests ----

    #[test]
    fn utf16_encoding() {
        let opts = default_opts();
        let utf16 = normalize_for_matching_utf16("hello", &opts);
        assert!(!utf16.is_empty());
        // Should round-trip back to a valid string
        let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
        assert_eq!(decoded, normalize_for_matching("hello", &opts));
    }

    #[test]
    fn utf16_supplementary() {
        let opts = default_opts();
        // U+1F600 (emoji) — supplementary character, encodes as surrogate pair in UTF-16
        let utf16 = normalize_for_matching_utf16("\u{1F600}", &opts);
        assert!(!utf16.is_empty());
        let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
        assert_eq!(decoded, normalize_for_matching("\u{1F600}", &opts));
    }

    // ---- Stability tests ----

    #[test]
    fn matching_idempotent() {
        let opts = default_opts();
        // Plain ASCII, mixed case, mixed Latin/Cyrillic, and a precomposed
        // accented letter.
        let inputs = [
            "hello",
            "File",
            "\u{0430}\u{0440}\u{0440}l\u{0435}",
            "\u{00C0}",
        ];
        for input in &inputs {
            let once = normalize_for_matching(input, &opts);
            let twice = normalize_for_matching(&once, &opts);
            assert_eq!(
                once, twice,
                "normalize_for_matching should be idempotent for {:?}",
                input
            );
        }
    }

    #[test]
    fn matching_not_confusable_different_words() {
        let opts = default_opts();
        assert!(!matches_normalized("hello", "world", &opts));
        assert!(!matches_normalized("file", "pile", &opts));
    }
}