1use alloc::string::String;
33use alloc::vec::Vec;
34
35use crate::casefold::{self, CaseFoldMode};
36use crate::confusable;
37
38#[derive(Clone, Copy, Debug, PartialEq, Eq)]
40pub struct MatchingOptions {
41 pub case_fold: CaseFoldMode,
43}
44
45impl Default for MatchingOptions {
46 fn default() -> Self {
47 MatchingOptions {
48 case_fold: CaseFoldMode::Standard,
49 }
50 }
51}
52
53pub fn normalize_for_matching(input: &str, opts: &MatchingOptions) -> String {
83 if input.is_empty() {
84 return String::new();
85 }
86
87 let mut current = one_pass(input, opts);
91 for _ in 0..3 {
92 let next = one_pass(¤t, opts);
93 if next == current {
94 return current;
95 }
96 current = next;
97 }
98 current
99}
100
101#[inline]
111fn one_pass(input: &str, opts: &MatchingOptions) -> String {
112 let nfkc = crate::nfkc().normalize(input);
113 let folded = casefold::casefold(&nfkc, opts.case_fold);
114 let skel = confusable::skeleton(&folded);
115 let final_folded = casefold::casefold(&skel, opts.case_fold);
116 final_folded.into_owned()
117}
118
119#[cfg(any(test, feature = "internal-test-api"))]
122pub fn normalize_for_matching_legacy(input: &str, opts: &MatchingOptions) -> String {
123 if input.is_empty() {
124 return String::new();
125 }
126 let mut current = one_pass_legacy(input, opts);
127 for _ in 0..3 {
128 let next = one_pass_legacy(¤t, opts);
129 if next == current {
130 return current;
131 }
132 current = next;
133 }
134 current
135}
136
137#[cfg(any(test, feature = "internal-test-api"))]
139fn one_pass_legacy(input: &str, opts: &MatchingOptions) -> String {
140 let nfkc = crate::nfkc().normalize(input);
141 let folded = casefold::casefold(&nfkc, opts.case_fold);
142 let skel = confusable::skeleton(&folded);
143 let final_folded = casefold::casefold(&skel, opts.case_fold);
144 final_folded.into_owned()
145}
146
147pub fn normalize_for_matching_utf16(input: &str, opts: &MatchingOptions) -> Vec<u16> {
151 normalize_for_matching(input, opts).encode_utf16().collect()
152}
153
154pub fn matches_normalized(a: &str, b: &str, opts: &MatchingOptions) -> bool {
173 if a == b {
175 return true;
176 }
177 normalize_for_matching(a, opts) == normalize_for_matching(b, opts)
178}
179
180#[cfg(test)]
181mod tests {
182 use super::*;
183 use crate::tables;
184
185 fn default_opts() -> MatchingOptions {
186 MatchingOptions::default()
187 }
188
189 fn turkish_opts() -> MatchingOptions {
190 MatchingOptions {
191 case_fold: CaseFoldMode::Turkish,
192 }
193 }
194
195 #[test]
198 fn confusable_bloom_covers_every_source() {
199 for &(source_cp, _) in tables::confusable::CONFUSABLE_MAPPINGS {
203 assert!(
204 tables::confusable_bloom_might_contain(source_cp),
205 "confusable source U+{:06X} hashed to a clear bit",
206 source_cp,
207 );
208 }
209 }
210
211 #[test]
214 fn empty_input() {
215 assert_eq!(normalize_for_matching("", &default_opts()), "");
216 }
217
218 #[test]
219 fn ascii_lowercase_unchanged() {
220 let result = normalize_for_matching("hello", &default_opts());
221 assert!(!result.is_empty());
222 }
223
224 #[test]
225 fn identical_strings_match() {
226 assert!(matches_normalized("test", "test", &default_opts()));
227 }
228
229 #[test]
230 fn different_strings_dont_match() {
231 assert!(!matches_normalized("hello", "world", &default_opts()));
232 }
233
234 #[test]
237 fn case_insensitive_ascii() {
238 let opts = default_opts();
239 assert!(matches_normalized("File", "file", &opts));
240 assert!(matches_normalized("FILE", "file", &opts));
241 assert!(matches_normalized("FiLe", "file", &opts));
242 }
243
244 #[test]
245 fn case_insensitive_extended() {
246 let opts = default_opts();
247 assert!(matches_normalized("Ströme", "ströme", &opts));
249 }
250
251 #[test]
254 fn confusable_latin_cyrillic_a() {
255 let opts = default_opts();
256 assert!(matches_normalized("a", "\u{0430}", &opts));
258 }
259
260 #[test]
261 fn confusable_latin_cyrillic_word() {
262 let opts = default_opts();
263 let latin = "apple";
266 let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
267 assert!(matches_normalized(latin, mixed, &opts));
268 }
269
270 #[test]
273 fn file_variants_all_match() {
274 let opts = default_opts();
275 let canonical = normalize_for_matching("file", &opts);
276
277 assert_eq!(normalize_for_matching("File", &opts), canonical);
279 assert_eq!(normalize_for_matching("FILE", &opts), canonical);
280
281 let fıle = "f\u{0131}le";
285 assert!(
286 matches_normalized("file", fıle, &opts),
287 "'file' and 'fıle' should match: file={:?}, fıle={:?}",
288 normalize_for_matching("file", &opts),
289 normalize_for_matching(fıle, &opts),
290 );
291 }
292
293 #[test]
294 fn file_mixed_case_and_confusable() {
295 let opts = default_opts();
296 let input = "F\u{0131}LE";
298 assert!(
299 matches_normalized("file", input, &opts),
300 "'file' and 'FıLE' should match: file={:?}, FıLE={:?}",
301 normalize_for_matching("file", &opts),
302 normalize_for_matching(input, &opts),
303 );
304 }
305
306 #[test]
309 fn nfkc_fullwidth() {
310 let opts = default_opts();
311 let fullwidth_a = "\u{FF21}";
313 assert!(matches_normalized(fullwidth_a, "a", &opts));
314 }
315
316 #[test]
317 fn nfkc_superscript() {
318 let opts = default_opts();
319 assert_eq!(
321 normalize_for_matching("\u{00B2}", &opts),
322 normalize_for_matching("2", &opts),
323 );
324 }
325
326 #[test]
329 fn turkish_mode_dotless_i() {
330 let opts = turkish_opts();
331 let a = normalize_for_matching("Istanbul", &opts);
334 let b = normalize_for_matching("\u{0131}stanbul", &opts);
335 assert_eq!(a, b);
336 }
337
338 #[test]
339 fn turkish_mode_dotted_i() {
340 let opts = turkish_opts();
341 assert!(matches_normalized("\u{0130}stanbul", "istanbul", &opts));
343 }
344
345 #[test]
348 fn utf16_encoding() {
349 let opts = default_opts();
350 let utf16 = normalize_for_matching_utf16("hello", &opts);
351 assert!(!utf16.is_empty());
352 let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
354 assert_eq!(decoded, normalize_for_matching("hello", &opts));
355 }
356
357 #[test]
358 fn utf16_supplementary() {
359 let opts = default_opts();
360 let utf16 = normalize_for_matching_utf16("\u{1F600}", &opts);
362 assert!(!utf16.is_empty());
363 let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
364 assert_eq!(decoded, normalize_for_matching("\u{1F600}", &opts));
365 }
366
367 #[test]
370 fn matching_idempotent() {
371 let opts = default_opts();
372 let inputs = [
373 "hello",
374 "File",
375 "\u{0430}\u{0440}\u{0440}l\u{0435}",
376 "\u{00C0}",
377 ];
378 for input in &inputs {
379 let once = normalize_for_matching(input, &opts);
380 let twice = normalize_for_matching(&once, &opts);
381 assert_eq!(
382 once, twice,
383 "normalize_for_matching should be idempotent for {:?}",
384 input
385 );
386 }
387 }
388
389 #[test]
390 fn matching_not_confusable_different_words() {
391 let opts = default_opts();
392 assert!(!matches_normalized("hello", "world", &opts));
393 assert!(!matches_normalized("file", "pile", &opts));
394 }
395
396 #[test]
399 fn fused_matches_legacy_on_fixtures() {
400 let opts = default_opts();
401 let fixtures = [
402 "",
403 "hello",
404 "File",
405 "FILE",
406 "FiLe",
407 "Ströme",
408 "ströme",
409 "a",
410 "\u{0430}",
411 "\u{0430}\u{0440}\u{0440}l\u{0435}",
412 "f\u{0131}le",
413 "F\u{0131}LE",
414 "\u{FF21}",
415 "\u{00B2}",
416 "\u{00C0}",
417 "Hel\u{0430}",
418 "\u{1D0E}\u{326}\u{306}",
419 "\u{1F600}",
420 "Istanbul",
421 "test mixing\u{0430}cyrillic",
422 "The quick brown FOX jumps over the lazy DOG (Привет, Мир!) Καλημέρα",
424 ];
425 for input in &fixtures {
426 let fused = normalize_for_matching(input, &opts);
427 let legacy = normalize_for_matching_legacy(input, &opts);
428 assert_eq!(
429 fused, legacy,
430 "fused vs legacy diverged for {:?}: fused={:?}, legacy={:?}",
431 input, fused, legacy,
432 );
433 }
434 }
435
436 #[test]
437 fn fused_matches_legacy_turkish() {
438 let opts = turkish_opts();
439 let fixtures = [
440 "Istanbul",
441 "\u{0130}stanbul",
442 "\u{0131}stanbul",
443 "FILE",
444 "fıle",
445 ];
446 for input in &fixtures {
447 let fused = normalize_for_matching(input, &opts);
448 let legacy = normalize_for_matching_legacy(input, &opts);
449 assert_eq!(
450 fused, legacy,
451 "fused vs legacy diverged for {:?} (Turkish): fused={:?}, legacy={:?}",
452 input, fused, legacy,
453 );
454 }
455 }
456}