use alloc::string::String;
use alloc::vec::Vec;
use crate::casefold::{self, CaseFoldMode};
use crate::confusable;
/// Options that control how strings are normalized before they are compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MatchingOptions {
    /// Case-folding mode handed to `casefold::casefold` during normalization.
    pub case_fold: CaseFoldMode,
}
impl Default for MatchingOptions {
    /// Builds the default options, which select `CaseFoldMode::Standard`.
    fn default() -> Self {
        Self {
            case_fold: CaseFoldMode::Standard,
        }
    }
}
/// Normalizes `input` into the form used for matching comparisons.
///
/// The single-pass pipeline (`one_pass`) is applied once and then re-applied
/// up to three more times. As soon as a pass leaves the string unchanged that
/// string is returned; if no fixed point is reached within the budget, the
/// output of the last pass is returned as-is.
///
/// An empty `input` short-circuits to an empty `String` without running the
/// pipeline.
pub fn normalize_for_matching(input: &str, opts: &MatchingOptions) -> String {
    if input.is_empty() {
        return String::new();
    }

    let mut current = one_pass(input, opts);
    // Bounded re-application: presumably a single pass is not guaranteed to be
    // idempotent, so iterate towards a fixed point but never more than three
    // extra times.
    for _ in 0..3 {
        // Borrow the previous result; the source had this borrow garbled into
        // `¤t`, which does not compile.
        let next = one_pass(&current, opts);
        if next == current {
            return current;
        }
        current = next;
    }
    current
}
/// Runs one round of the matching pipeline over `input`:
/// NFKC normalization, case fold, confusable skeleton, then a second case fold.
///
/// Both case-fold steps use the mode taken from `opts.case_fold`.
#[inline]
fn one_pass(input: &str, opts: &MatchingOptions) -> String {
    let mode = opts.case_fold;

    let compat = crate::nfkc().normalize(input);
    let lowered = casefold::casefold(&compat, mode);
    let skeleton = confusable::skeleton(&lowered);
    // Fold again after the skeleton step, mirroring the original step order.
    let refolded = casefold::casefold(&skeleton, mode);

    refolded.into_owned()
}
/// Legacy counterpart of `normalize_for_matching`, compiled only for tests or
/// when the `internal-test-api` feature is enabled.
///
/// Applies `one_pass_legacy` once and then up to three more times, returning
/// as soon as a pass leaves the string unchanged. If no fixed point is reached
/// within that budget, the output of the last pass is returned.
///
/// An empty `input` short-circuits to an empty `String`.
#[cfg(any(test, feature = "internal-test-api"))]
pub fn normalize_for_matching_legacy(input: &str, opts: &MatchingOptions) -> String {
    if input.is_empty() {
        return String::new();
    }

    let mut current = one_pass_legacy(input, opts);
    // Same bounded fixed-point iteration as the non-legacy entry point.
    for _ in 0..3 {
        // Borrow the previous result; the source had this borrow garbled into
        // `¤t`, which does not compile.
        let next = one_pass_legacy(&current, opts);
        if next == current {
            return current;
        }
        current = next;
    }
    current
}
/// One round of the legacy pipeline: NFKC normalization, case fold,
/// confusable skeleton, then a final case fold.
///
/// NOTE(review): as written in this file, this is the same sequence of calls
/// as `one_pass`; confirm whether the two are meant to differ.
#[cfg(any(test, feature = "internal-test-api"))]
fn one_pass_legacy(input: &str, opts: &MatchingOptions) -> String {
    // Each stage shadows the previous binding; the earlier values stay alive
    // until the end of the function, so the borrows remain valid.
    let text = crate::nfkc().normalize(input);
    let text = casefold::casefold(&text, opts.case_fold);
    let text = confusable::skeleton(&text);
    let text = casefold::casefold(&text, opts.case_fold);
    text.into_owned()
}
/// Normalizes `input` for matching and returns the result as UTF-16 code units.
///
/// This is `normalize_for_matching` followed by a UTF-16 re-encoding of its
/// output.
pub fn normalize_for_matching_utf16(input: &str, opts: &MatchingOptions) -> Vec<u16> {
    let normalized = normalize_for_matching(input, opts);
    normalized.encode_utf16().collect()
}
/// Reports whether `a` and `b` are equal after normalization for matching
/// under `opts`.
///
/// Byte-identical inputs are accepted immediately, without normalizing either
/// side.
pub fn matches_normalized(a: &str, b: &str, opts: &MatchingOptions) -> bool {
    // `||` short-circuits, so normalization only runs when the raw strings differ.
    a == b || normalize_for_matching(a, opts) == normalize_for_matching(b, opts)
}
// Unit tests for the matching-normalization entry points: case folding,
// confusable handling, NFKC compatibility forms, Turkish mode, UTF-16 output,
// and agreement between the regular and legacy entry points.
#[cfg(test)]
mod tests {
use super::*;
use crate::tables;
// Options built via `MatchingOptions::default()` (Standard case folding).
fn default_opts() -> MatchingOptions {
MatchingOptions::default()
}
// Options that select Turkish case folding.
fn turkish_opts() -> MatchingOptions {
MatchingOptions {
case_fold: CaseFoldMode::Turkish,
}
}
// Every source code point listed in the confusable mapping table must be
// reported as "might contain" by the bloom check (i.e. no false negatives).
#[test]
fn confusable_bloom_covers_every_source() {
for &(source_cp, _) in tables::confusable::CONFUSABLE_MAPPINGS {
assert!(
tables::confusable_bloom_might_contain(source_cp),
"confusable source U+{:06X} hashed to a clear bit",
source_cp,
);
}
}
// Empty input takes the early-return path and yields an empty string.
#[test]
fn empty_input() {
assert_eq!(normalize_for_matching("", &default_opts()), "");
}
// NOTE(review): despite the name, this only asserts that the result is
// non-empty; it does not check that "hello" comes back unchanged.
#[test]
fn ascii_lowercase_unchanged() {
let result = normalize_for_matching("hello", &default_opts());
assert!(!result.is_empty());
}
// Identical inputs hit the `a == b` fast path in `matches_normalized`.
#[test]
fn identical_strings_match() {
assert!(matches_normalized("test", "test", &default_opts()));
}
// Unrelated words must not compare equal after normalization.
#[test]
fn different_strings_dont_match() {
assert!(!matches_normalized("hello", "world", &default_opts()));
}
// ASCII strings that differ only in letter case must match.
#[test]
fn case_insensitive_ascii() {
let opts = default_opts();
assert!(matches_normalized("File", "file", &opts));
assert!(matches_normalized("FILE", "file", &opts));
assert!(matches_normalized("FiLe", "file", &opts));
}
// Case-insensitivity must also hold when the string contains a non-ASCII letter.
#[test]
fn case_insensitive_extended() {
let opts = default_opts();
assert!(matches_normalized("Ströme", "ströme", &opts));
}
// Latin "a" must match U+0430 (Cyrillic small letter a).
#[test]
fn confusable_latin_cyrillic_a() {
let opts = default_opts();
assert!(matches_normalized("a", "\u{0430}", &opts));
}
// "apple" must match a look-alike spelled with Cyrillic U+0430, U+0440, U+0440,
// a Latin "l", and Cyrillic U+0435.
#[test]
fn confusable_latin_cyrillic_word() {
let opts = default_opts();
let latin = "apple";
let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
assert!(matches_normalized(latin, mixed, &opts));
}
// Case variants of "file" must normalize to the same string, and the variant
// spelled with U+0131 (dotless i) must match "file" in default mode.
#[test]
fn file_variants_all_match() {
let opts = default_opts();
let canonical = normalize_for_matching("file", &opts);
assert_eq!(normalize_for_matching("File", &opts), canonical);
assert_eq!(normalize_for_matching("FILE", &opts), canonical);
let fıle = "f\u{0131}le";
assert!(
matches_normalized("file", fıle, &opts),
"'file' and 'fıle' should match: file={:?}, fıle={:?}",
normalize_for_matching("file", &opts),
normalize_for_matching(fıle, &opts),
);
}
// Upper-case letters combined with U+0131 (dotless i) must still match "file".
#[test]
fn file_mixed_case_and_confusable() {
let opts = default_opts();
let input = "F\u{0131}LE";
assert!(
matches_normalized("file", input, &opts),
"'file' and 'FıLE' should match: file={:?}, FıLE={:?}",
normalize_for_matching("file", &opts),
normalize_for_matching(input, &opts),
);
}
// U+FF21 (fullwidth Latin capital A) must match plain "a".
#[test]
fn nfkc_fullwidth() {
let opts = default_opts();
let fullwidth_a = "\u{FF21}";
assert!(matches_normalized(fullwidth_a, "a", &opts));
}
// U+00B2 (superscript two) must normalize to the same string as "2".
#[test]
fn nfkc_superscript() {
let opts = default_opts();
assert_eq!(
normalize_for_matching("\u{00B2}", &opts),
normalize_for_matching("2", &opts),
);
}
// In Turkish mode, "Istanbul" and the spelling starting with U+0131
// (dotless i) must normalize to the same string.
#[test]
fn turkish_mode_dotless_i() {
let opts = turkish_opts();
let a = normalize_for_matching("Istanbul", &opts);
let b = normalize_for_matching("\u{0131}stanbul", &opts);
assert_eq!(a, b);
}
// In Turkish mode, the spelling starting with U+0130 (capital I with dot
// above) must match "istanbul".
#[test]
fn turkish_mode_dotted_i() {
let opts = turkish_opts();
assert!(matches_normalized("\u{0130}stanbul", "istanbul", &opts));
}
// The UTF-16 entry point must decode back to the `String` entry point's output.
#[test]
fn utf16_encoding() {
let opts = default_opts();
let utf16 = normalize_for_matching_utf16("hello", &opts);
assert!(!utf16.is_empty());
let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
assert_eq!(decoded, normalize_for_matching("hello", &opts));
}
// Same round-trip check for U+1F600, a code point outside the Basic
// Multilingual Plane.
#[test]
fn utf16_supplementary() {
let opts = default_opts();
let utf16 = normalize_for_matching_utf16("\u{1F600}", &opts);
assert!(!utf16.is_empty());
let decoded = String::from_utf16(&utf16).expect("valid UTF-16");
assert_eq!(decoded, normalize_for_matching("\u{1F600}", &opts));
}
// Normalizing an already-normalized string must not change it, for each of
// the listed inputs.
#[test]
fn matching_idempotent() {
let opts = default_opts();
let inputs = [
"hello",
"File",
"\u{0430}\u{0440}\u{0440}l\u{0435}",
"\u{00C0}",
];
for input in &inputs {
let once = normalize_for_matching(input, &opts);
let twice = normalize_for_matching(&once, &opts);
assert_eq!(
once, twice,
"normalize_for_matching should be idempotent for {:?}",
input
);
}
}
// Guards against over-matching: distinct words must stay distinct.
#[test]
fn matching_not_confusable_different_words() {
let opts = default_opts();
assert!(!matches_normalized("hello", "world", &opts));
assert!(!matches_normalized("file", "pile", &opts));
}
// Differential test: the regular and legacy entry points must produce the
// same output for every fixture in default mode.
// NOTE(review): as written in this file, `one_pass` and `one_pass_legacy` have
// the same body, so this comparison cannot currently diverge.
#[test]
fn fused_matches_legacy_on_fixtures() {
let opts = default_opts();
let fixtures = [
"",
"hello",
"File",
"FILE",
"FiLe",
"Ströme",
"ströme",
"a",
"\u{0430}",
"\u{0430}\u{0440}\u{0440}l\u{0435}",
"f\u{0131}le",
"F\u{0131}LE",
"\u{FF21}",
"\u{00B2}",
"\u{00C0}",
"Hel\u{0430}",
"\u{1D0E}\u{326}\u{306}",
"\u{1F600}",
"Istanbul",
"test mixing\u{0430}cyrillic",
"The quick brown FOX jumps over the lazy DOG (Привет, Мир!) Καλημέρα",
];
for input in &fixtures {
let fused = normalize_for_matching(input, &opts);
let legacy = normalize_for_matching_legacy(input, &opts);
assert_eq!(
fused, legacy,
"fused vs legacy diverged for {:?}: fused={:?}, legacy={:?}",
input, fused, legacy,
);
}
}
// Same differential test, run with Turkish case folding.
#[test]
fn fused_matches_legacy_turkish() {
let opts = turkish_opts();
let fixtures = [
"Istanbul",
"\u{0130}stanbul",
"\u{0131}stanbul",
"FILE",
"fıle",
];
for input in &fixtures {
let fused = normalize_for_matching(input, &opts);
let legacy = normalize_for_matching_legacy(input, &opts);
assert_eq!(
fused, legacy,
"fused vs legacy diverged for {:?} (Turkish): fused={:?}, legacy={:?}",
input, fused, legacy,
);
}
}
}