use std::borrow::Cow;
use unicode_normalization::UnicodeNormalization;
use crate::{case_fold, confusables, emoji, transliterate, whitespace, zalgo};
/// Applies Unicode NFKC normalization to `text`.
///
/// Pure-ASCII input is handed back as `Cow::Borrowed` without running the
/// normalizer; any other input is normalized and collected into a freshly
/// allocated `String` (`Cow::Owned`).
#[inline]
fn nfkc_normalize(text: &str) -> Cow<'_, str> {
    // Slow path first: anything containing a non-ASCII byte goes through nfkc().
    if !text.is_ascii() {
        return Cow::Owned(text.nfkc().collect());
    }
    Cow::Borrowed(text)
}
pub(crate) fn strip_bidi(text: &str) -> String {
let mut out = String::new();
strip_bidi_into(text, &mut out);
out
}
/// Buffer-reusing variant of `strip_bidi`.
///
/// Clears `out`, then appends every character of `text` that
/// `is_bidi_or_format` does not flag. Any previous content of `out` is discarded.
pub(crate) fn strip_bidi_into(text: &str, out: &mut String) {
    out.clear();
    for ch in text.chars() {
        if !is_bidi_or_format(ch) {
            out.push(ch);
        }
    }
}
/// Rewrites `text` so it contains no path separators and no runs of dots.
///
/// * `/` and `\` are each replaced by `_`.
/// * A run of consecutive `.` characters is collapsed to a single `.`.
/// * Every other character is copied through unchanged.
fn neutralize_path_separators(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut last_was_dot = false;
    for c in text.chars() {
        let is_dot = c == '.';
        if is_dot && last_was_dot {
            // Second or later dot of a run: drop it; the run is still in progress.
            continue;
        }
        cleaned.push(if matches!(c, '/' | '\\') { '_' } else { c });
        last_was_dot = is_dot;
    }
    cleaned
}
/// Reports whether `ch` is one of the invisible bidi/format code points that
/// `strip_bidi` removes.
///
/// Matched code points:
/// * U+00AD (soft hyphen) and U+061C (Arabic letter mark)
/// * U+200E..=U+200F (left-to-right / right-to-left marks)
/// * U+202A..=U+202E (bidi embeddings, overrides and pop)
/// * U+2066..=U+2069 (bidi isolates)
/// * U+206A..=U+206F (deprecated format characters)
/// * U+FFF9..=U+FFFB (interlinear annotation characters)
#[inline]
fn is_bidi_or_format(ch: char) -> bool {
    matches!(
        ch,
        '\u{00AD}'
            | '\u{061C}'
            | '\u{200E}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2066}'..='\u{2069}'
            | '\u{206A}'..='\u{206F}'
            | '\u{FFF9}'..='\u{FFFB}'
    )
}
/// Cleans `text` for security-sensitive comparison or storage.
///
/// Stages, in order: NFKC normalization, confusable normalization targeting
/// `"latin"`, removal of bidi/format characters, whitespace collapsing (both
/// option flags enabled), and finally path-separator neutralization.
///
/// # Errors
/// Propagates any error returned by `confusables::normalize_confusables`.
pub(crate) fn security_clean(text: &str) -> Result<String, crate::ErrorRepr> {
    let normalized = nfkc_normalize(text);
    let deconfused = confusables::normalize_confusables(&normalized, "latin")?;
    let visible_only = strip_bidi(&deconfused);
    let collapsed = whitespace::collapse_whitespace(&visible_only, true, true);
    Ok(neutralize_path_separators(&collapsed))
}
/// Normalizes `text` into a lowercase, accent-free form for ML preprocessing.
///
/// Stages, in order:
/// 1. validate `lang`, then validate `emoji_style` (`"cldr"` or `"none"`);
/// 2. NFKC normalization;
/// 3. when `emoji_style` is `"cldr"`, run `emoji::demojize_rust`;
/// 4. when `lang` is `Some`, transliterate with `ErrorMode::Ignore`;
/// 5. strip accents, case-fold, collapse whitespace.
///
/// # Errors
/// Propagates the error from `validate_lang`, or returns
/// `ErrorRepr::InvalidEmojiStyle` carrying the rejected `emoji_style`.
pub(crate) fn ml_normalize(
    text: &str,
    lang: Option<&str>,
    emoji_style: &str,
) -> Result<String, crate::ErrorRepr> {
    crate::transliterate::validate_lang(lang)?;
    let demojize = match emoji_style {
        "cldr" => true,
        "none" => false,
        unknown => {
            return Err(crate::ErrorRepr::InvalidEmojiStyle {
                got: unknown.to_owned(),
            });
        }
    };

    let normalized = nfkc_normalize(text);
    let without_emoji = if demojize {
        emoji::demojize_rust(&normalized, false)
    } else {
        normalized.into_owned()
    };

    // Transliteration only runs when the caller named a language.
    let romanized = match lang {
        Some(_) => transliterate::transliterate_impl(
            &without_emoji,
            lang,
            crate::ErrorMode::Ignore,
            "",
            false,
            false,
            false,
        )
        .into_owned(),
        None => without_emoji,
    };

    let unaccented = transliterate::strip_accents(&romanized);
    let folded = case_fold::fold_case_impl(&unaccented);
    Ok(whitespace::collapse_whitespace(&folded, true, true))
}
/// Builds a key under which catalog entries can be compared for equality.
///
/// Stages, in order: validate `lang`, NFKC normalization, removal of
/// bidi/format characters, transliteration (`ErrorMode::Preserve`, with
/// `strict_iso9` forwarded), confusable normalization targeting `"latin"`,
/// accent stripping, case folding, whitespace collapsing.
///
/// # Errors
/// Propagates errors from `validate_lang` and
/// `confusables::normalize_confusables`.
pub(crate) fn catalog_key(
    text: &str,
    lang: Option<&str>,
    strict_iso9: bool,
) -> Result<String, crate::ErrorRepr> {
    crate::transliterate::validate_lang(lang)?;

    let normalized = nfkc_normalize(text);
    let visible_only = strip_bidi(&normalized);
    let romanized = transliterate::transliterate_impl(
        &visible_only,
        lang,
        crate::ErrorMode::Preserve,
        "",
        strict_iso9,
        false,
        false,
    )
    .into_owned();
    let deconfused = confusables::normalize_confusables(&romanized, "latin")?;
    let unaccented = transliterate::strip_accents(&deconfused);
    let folded = case_fold::fold_case_impl(&unaccented);
    let collapsed = whitespace::collapse_whitespace(&folded, true, true);

    Ok(collapsed)
}
/// Builds an accent- and case-insensitive key for search matching.
///
/// Stages, in order: validate `lang`, NFKC normalization, removal of
/// bidi/format characters, transliteration (`ErrorMode::Preserve`), accent
/// stripping, case folding, whitespace collapsing.
///
/// # Errors
/// Propagates the error from `validate_lang`.
pub(crate) fn search_key(text: &str, lang: Option<&str>) -> Result<String, crate::ErrorRepr> {
    crate::transliterate::validate_lang(lang)?;

    let normalized = nfkc_normalize(text);
    let visible_only = strip_bidi(&normalized);
    let romanized = transliterate::transliterate_impl(
        &visible_only,
        lang,
        crate::ErrorMode::Preserve,
        "",
        false,
        false,
        false,
    )
    .into_owned();
    let unaccented = transliterate::strip_accents(&romanized);
    let folded = case_fold::fold_case_impl(&unaccented);
    let collapsed = whitespace::collapse_whitespace(&folded, true, true);

    Ok(collapsed)
}
/// Builds a case-insensitive key for ordering strings.
///
/// Stages, in order: validate `lang`, NFKC normalization, removal of
/// bidi/format characters, transliteration (`ErrorMode::Preserve`), case
/// folding, whitespace collapsing. Unlike `search_key`, there is no separate
/// accent-stripping stage.
///
/// # Errors
/// Propagates the error from `validate_lang`.
pub(crate) fn sort_key(text: &str, lang: Option<&str>) -> Result<String, crate::ErrorRepr> {
    crate::transliterate::validate_lang(lang)?;

    let normalized = nfkc_normalize(text);
    let visible_only = strip_bidi(&normalized);
    let romanized = transliterate::transliterate_impl(
        &visible_only,
        lang,
        crate::ErrorMode::Preserve,
        "",
        false,
        false,
        false,
    )
    .into_owned();
    let folded = case_fold::fold_case_impl(&romanized);
    let collapsed = whitespace::collapse_whitespace(&folded, true, true);

    Ok(collapsed)
}
/// Prepares `text` for display: removes bidi/format characters, then collapses
/// whitespace (both option flags enabled). No Unicode normalization is applied.
pub(crate) fn display_clean(text: &str) -> String {
    whitespace::collapse_whitespace(&strip_bidi(text), true, true)
}
/// Sanitizes free-form user input.
///
/// Stages, in order: NFKC normalization, removal of bidi/format characters,
/// zero-width character removal, control character removal, zalgo stripping
/// (called with `2` — presumably the number of combining marks tolerated per
/// base character; confirm against `zalgo::strip_zalgo`), confusable
/// normalization targeting `"latin"`, whitespace collapsing, and
/// path-separator neutralization.
///
/// # Errors
/// Propagates any error returned by `confusables::normalize_confusables`.
pub(crate) fn normalize_user_input(text: &str) -> Result<String, crate::ErrorRepr> {
    let normalized = nfkc_normalize(text);
    let no_bidi = strip_bidi(&normalized);
    let no_zero_width = whitespace::strip_zero_width_chars(&no_bidi);
    let no_controls = whitespace::strip_control_chars(&no_zero_width);
    let dezalgoed = zalgo::strip_zalgo(&no_controls, 2);
    let deconfused = confusables::normalize_confusables(&dezalgoed, "latin")?;
    let collapsed = whitespace::collapse_whitespace(&deconfused, true, true);
    Ok(neutralize_path_separators(&collapsed))
}
/// Removes common text-obfuscation layers from `text`.
///
/// Stages, in order: NFKC normalization, zalgo stripping (called with `0`),
/// removal of bidi/format characters, zero-width character removal,
/// `emoji::demojize_rust`, confusable normalization targeting `"latin"`,
/// accent stripping, and whitespace collapsing.
///
/// # Errors
/// Propagates any error returned by `confusables::normalize_confusables`.
pub(crate) fn strip_obfuscation(text: &str) -> Result<String, crate::ErrorRepr> {
    let normalized = nfkc_normalize(text);
    let dezalgoed = zalgo::strip_zalgo(&normalized, 0);
    let no_bidi = strip_bidi(&dezalgoed);
    let no_zero_width = whitespace::strip_zero_width_chars(&no_bidi);
    let demojized = emoji::demojize_rust(&no_zero_width, false);
    let deconfused = confusables::normalize_confusables(&demojized, "latin")?;
    let unaccented = transliterate::strip_accents(&deconfused);
    let collapsed = whitespace::collapse_whitespace(&unaccented, true, true);
    Ok(collapsed)
}
// Unit and property tests for the normalization pipelines defined above.
//
// Inputs that must contain compatibility characters (fullwidth letters, Latin
// ligatures, the fraction slash) are spelled as `\u{...}` escapes so the
// literal cannot silently degrade to ASCII when the file passes through text
// tooling.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- neutralize_path_separators -------------------------------------

    #[test]
    fn test_neutralize_path_separators() {
        assert_eq!(neutralize_path_separators("etc/passwd"), "etc_passwd");
        assert_eq!(neutralize_path_separators("a\\b"), "a_b");
        assert_eq!(neutralize_path_separators("../../etc"), "._._etc");
        assert_eq!(neutralize_path_separators("a..b"), "a.b");
        assert_eq!(neutralize_path_separators("file.tar.gz"), "file.tar.gz");
        assert_eq!(neutralize_path_separators("hello world"), "hello world");
        // U+2044 FRACTION SLASH is only a look-alike; the ASCII separators must be gone.
        assert!(!neutralize_path_separators("x\u{2044}y/z\\w").contains(['/', '\\']));
    }

    #[test]
    fn test_neutralize_path_separators_idempotent() {
        for s in ["etc/passwd", "../../x", "a..b/c\\d", "plain text"] {
            let once = neutralize_path_separators(s);
            assert_eq!(
                neutralize_path_separators(&once),
                once,
                "not idempotent: {s:?}"
            );
        }
    }

    // ---- nfkc_normalize --------------------------------------------------

    #[test]
    fn test_nfkc_normalize_matches_reference() {
        let cases = [
            "",
            "hello world 123",
            "!@#$%^&*()_+-=[]{}",
            "\u{007F}\u{0000}",
            // Fullwidth "Fullwidth".
            "\u{FF26}\u{FF55}\u{FF4C}\u{FF4C}\u{FF57}\u{FF49}\u{FF44}\u{FF54}\u{FF48}",
            // Latin ligatures fi, fl, ffi.
            "\u{FB01} \u{FB02} \u{FB03}",
            "x²y³",
            "café e\u{0301}",
            "ⅣⅧ",
            "Москва 日本語 αβγ",
        ];
        for s in cases {
            let reference: String = s.nfkc().collect();
            assert_eq!(
                nfkc_normalize(s).as_ref(),
                reference.as_str(),
                "nfkc_normalize diverged from nfkc() on {s:?}"
            );
        }
    }

    #[test]
    fn test_nfkc_normalize_borrows_ascii_owns_nonascii() {
        assert!(matches!(nfkc_normalize("plain ascii"), Cow::Borrowed(_)));
        assert!(matches!(nfkc_normalize(""), Cow::Borrowed(_)));
        // Fullwidth "Full": must be non-ASCII, otherwise the borrow fast path is taken.
        assert!(matches!(
            nfkc_normalize("\u{FF26}\u{FF55}\u{FF4C}\u{FF4C}"),
            Cow::Owned(_)
        ));
        assert!(matches!(nfkc_normalize("café"), Cow::Owned(_)));
    }

    // ---- strip_bidi --------------------------------------------------------

    #[test]
    fn test_strip_bidi_soft_hyphen() {
        assert_eq!(strip_bidi("pass\u{00AD}word"), "password");
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        assert_eq!(strip_bidi("hello\u{061C}world"), "helloworld");
    }

    #[test]
    fn test_strip_bidi_marks() {
        assert_eq!(strip_bidi("a\u{200E}b"), "ab");
        assert_eq!(strip_bidi("a\u{200F}b"), "ab");
    }

    #[test]
    fn test_strip_bidi_embeddings_overrides() {
        assert_eq!(strip_bidi("a\u{202A}b"), "ab");
        assert_eq!(strip_bidi("a\u{202B}b"), "ab");
        assert_eq!(strip_bidi("a\u{202C}b"), "ab");
        assert_eq!(strip_bidi("a\u{202D}b"), "ab");
        assert_eq!(strip_bidi("a\u{202E}b"), "ab");
    }

    #[test]
    fn test_strip_bidi_isolates() {
        assert_eq!(strip_bidi("a\u{2066}b"), "ab");
        assert_eq!(strip_bidi("a\u{2067}b"), "ab");
        assert_eq!(strip_bidi("a\u{2068}b"), "ab");
        assert_eq!(strip_bidi("a\u{2069}b"), "ab");
    }

    #[test]
    fn test_strip_bidi_all_at_once() {
        let all_bidi = "\u{00AD}\u{061C}\u{200E}\u{200F}\
                        \u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\
                        \u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi(&format!("x{all_bidi}y")), "xy");
        // Guards against a code point being dropped from the literal above.
        assert_eq!(all_bidi.chars().count(), 13);
    }

    #[test]
    fn test_strip_bidi_preserves_normal() {
        assert_eq!(strip_bidi("hello world"), "hello world");
        assert_eq!(strip_bidi("café"), "café");
        assert_eq!(strip_bidi("مرحبا"), "مرحبا");
    }

    // ---- security_clean ----------------------------------------------------

    #[test]
    fn test_security_clean_homoglyph() {
        let result = security_clean("\u{0440}\u{0430}ypal").unwrap();
        assert_eq!(result, "paypal");
    }

    #[test]
    fn test_security_clean_bidi() {
        let result = security_clean("admin\u{202E}user").unwrap();
        assert_eq!(result, "adminuser");
    }

    #[test]
    fn test_security_clean_arabic_letter_mark() {
        let result = security_clean("admin\u{061C}user").unwrap();
        assert_eq!(result, "adminuser");
    }

    #[test]
    fn test_security_clean_invisible_math_operators() {
        let result = security_clean("pass\u{2061}word").unwrap();
        assert_eq!(result, "password");
    }

    #[test]
    fn test_security_clean_soft_hyphen() {
        let result = security_clean("pass\u{00AD}word").unwrap();
        assert_eq!(result, "password");
    }

    #[test]
    fn test_security_clean_zwsp() {
        let result = security_clean("admin\u{200B}user").unwrap();
        assert_eq!(result, "adminuser");
    }

    // ---- ml_normalize ------------------------------------------------------

    #[test]
    fn test_ml_normalize_basic() {
        let result = ml_normalize("Café Résumé", None, "cldr").unwrap();
        assert_eq!(result, "cafe resume");
    }

    #[test]
    fn test_ml_normalize_ligature() {
        let result = ml_normalize("\u{FB01}lter", None, "cldr").unwrap();
        assert_eq!(result, "filter");
    }

    // ---- catalog_key / search_key / sort_key -------------------------------

    #[test]
    fn test_catalog_key_dedup() {
        let a = catalog_key("Café", None, false).unwrap();
        let b = catalog_key("café", None, false).unwrap();
        let c = catalog_key("CAFÉ", None, false).unwrap();
        assert_eq!(a, b);
        assert_eq!(b, c);
    }

    #[test]
    fn test_catalog_key_iso9() {
        let result = catalog_key("\u{0419}\u{043E}\u{0433}\u{0430}", None, true).unwrap();
        assert_eq!(result, "joga");
    }

    #[test]
    fn test_search_key_accent_insensitive() {
        let a = search_key("Café", None).unwrap();
        let b = search_key("cafe", None).unwrap();
        let c = search_key("CAFÉ", None).unwrap();
        assert_eq!(a, "cafe");
        assert_eq!(a, b);
        assert_eq!(b, c);
    }

    #[test]
    fn test_search_key_cyrillic() {
        assert_eq!(search_key("Москва", None).unwrap(), "moskva");
    }

    #[test]
    fn test_search_key_greek() {
        assert_eq!(search_key("ΩMEGA", None).unwrap(), "omega");
    }

    #[test]
    fn test_sort_key_preserves_accents_as_base() {
        let result = sort_key("Über", None).unwrap();
        assert_eq!(result, "uber");
    }

    #[test]
    fn test_sort_key_cyrillic() {
        assert_eq!(sort_key("Война и мир", None).unwrap(), "voyna i mir");
    }

    #[test]
    fn test_sort_key_vs_search_key() {
        assert_eq!(
            sort_key("Москва", None).unwrap(),
            search_key("Москва", None).unwrap()
        );
    }

    // Every key function must map a string containing invisible characters to
    // the same key as its visually identical clean counterpart.
    #[test]
    fn test_key_functions_strip_bidi_and_soft_hyphen() {
        for (stored, clean) in [
            ("pass\u{00AD}word", "password"),
            ("user\u{202E}txt", "usertxt"),
            ("a\u{200E}b", "ab"),
            ("x\u{061C}y", "xy"),
        ] {
            assert_eq!(
                search_key(stored, None).unwrap(),
                search_key(clean, None).unwrap(),
                "search_key must collide for {stored:?} vs {clean:?}"
            );
            assert_eq!(
                catalog_key(stored, None, false).unwrap(),
                catalog_key(clean, None, false).unwrap(),
                "catalog_key must collide for {stored:?} vs {clean:?}"
            );
            assert_eq!(
                sort_key(stored, None).unwrap(),
                sort_key(clean, None).unwrap(),
                "sort_key must collide for {stored:?} vs {clean:?}"
            );
        }
    }

    // ---- display_clean -----------------------------------------------------

    #[test]
    fn test_display_clean_basic() {
        assert_eq!(display_clean("hello world"), "hello world");
        assert_eq!(display_clean("hello\x00world"), "helloworld");
        assert_eq!(display_clean("hello\u{200B}world"), "helloworld");
    }

    #[test]
    fn test_display_clean_strips_bidi() {
        assert_eq!(display_clean("admin\u{202E}user"), "adminuser");
        assert_eq!(display_clean("pass\u{00AD}word"), "password");
        assert_eq!(display_clean("hello\u{061C}world"), "helloworld");
    }

    // ---- normalize_user_input ----------------------------------------------

    #[test]
    fn test_normalize_user_input_clean_text() {
        assert_eq!(
            normalize_user_input("Hello, world!").unwrap(),
            "Hello, world!"
        );
    }

    #[test]
    fn test_normalize_user_input_preserves_script() {
        let result = normalize_user_input("Москва").unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_normalize_user_input_strips_zalgo() {
        // "hello" followed by 20 combining grave accents, then " world".
        let mut zalgo = String::from("hello");
        for _ in 0..20 {
            zalgo.push('\u{0300}');
        }
        zalgo.push_str(" world");
        let result = normalize_user_input(&zalgo).unwrap();
        assert!(result.len() < zalgo.len());
        assert!(result.contains("world"));
    }

    #[test]
    fn test_normalize_user_input_strips_bidi() {
        assert_eq!(
            normalize_user_input("admin\u{202E}user").unwrap(),
            "adminuser"
        );
    }

    #[test]
    fn test_normalize_user_input_strips_zero_width() {
        assert_eq!(
            normalize_user_input("pass\u{200B}word").unwrap(),
            "password"
        );
    }

    #[test]
    fn test_normalize_user_input_preserves_accents() {
        assert_eq!(normalize_user_input("café").unwrap(), "café");
        assert_eq!(normalize_user_input("résumé").unwrap(), "résumé");
    }

    #[test]
    fn test_normalize_user_input_homoglyph() {
        let result = normalize_user_input("p\u{0430}ypal").unwrap();
        assert_eq!(result, "paypal");
    }

    // Property tests: idempotence of the pipelines and absence of bidi/format
    // characters in their output, driven by adversarial random strings.
    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        // Characters over-sampled by `adversarial()`: bidi/format controls,
        // zero-width characters, combining marks, Cyrillic look-alikes, a
        // fullwidth letter and an emoji.
        const SPECIAL: &[char] = &[
            '\u{200E}',
            '\u{200F}',
            '\u{202A}',
            '\u{202B}',
            '\u{202C}',
            '\u{202D}',
            '\u{202E}',
            '\u{061C}',
            '\u{2066}',
            '\u{2067}',
            '\u{2068}',
            '\u{2069}',
            '\u{00AD}',
            '\u{200B}',
            '\u{200C}',
            '\u{200D}',
            '\u{2060}',
            '\u{FEFF}',
            '\u{0301}',
            '\u{0300}',
            '\u{0489}',
            '\u{0430}',
            '\u{0440}',
            '\u{0441}',
            '\u{0435}',
            '\u{043E}',
            '\u{FF41}',
            '\u{1F452}',
        ];

        // Strings of 0..40 chars mixing arbitrary chars (weight 4), entries of
        // `SPECIAL` (weight 3) and lowercase ASCII letters (weight 2).
        fn adversarial() -> impl Strategy<Value = String> {
            let special = proptest::sample::select(SPECIAL.to_vec());
            proptest::collection::vec(
                prop_oneof![4 => any::<char>(), 3 => special, 2 => prop::char::range('a', 'z')],
                0..40,
            )
            .prop_map(|cs| cs.into_iter().collect())
        }

        // NFC-normalizes `s`; the idempotence checks compare results modulo NFC.
        fn nfc(s: &str) -> String {
            s.nfc().collect()
        }

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            #[test]
            fn security_clean_idempotent(s in adversarial()) {
                let once = security_clean(&s).unwrap();
                let twice = security_clean(&once).unwrap();
                prop_assert_eq!(nfc(&once), nfc(&twice));
            }

            #[test]
            fn strip_obfuscation_idempotent(s in adversarial()) {
                let once = strip_obfuscation(&s).unwrap();
                let twice = strip_obfuscation(&once).unwrap();
                prop_assert_eq!(nfc(&once), nfc(&twice));
            }

            #[test]
            fn normalize_user_input_idempotent(s in adversarial()) {
                let once = normalize_user_input(&s).unwrap();
                let twice = normalize_user_input(&once).unwrap();
                prop_assert_eq!(nfc(&once), nfc(&twice));
            }

            #[test]
            fn strip_bidi_idempotent(s in adversarial()) {
                let once = strip_bidi(&s);
                prop_assert_eq!(&once, &strip_bidi(&once));
            }

            #[test]
            fn no_bidi_after_strip_bidi(s in adversarial()) {
                prop_assert!(!strip_bidi(&s).chars().any(is_bidi_or_format));
            }

            #[test]
            fn no_bidi_after_security_clean(s in adversarial()) {
                prop_assert!(!security_clean(&s).unwrap().chars().any(is_bidi_or_format));
            }

            #[test]
            fn no_bidi_after_strip_obfuscation(s in adversarial()) {
                prop_assert!(!strip_obfuscation(&s).unwrap().chars().any(is_bidi_or_format));
            }

            #[test]
            fn no_bidi_after_normalize_user_input(s in adversarial()) {
                prop_assert!(!normalize_user_input(&s).unwrap().chars().any(is_bidi_or_format));
            }
        }
    }
}