use unicode_normalization::UnicodeNormalization;
/// Identifies which kind of text field a value belongs to.
///
/// The selector is accepted by [`normalize_for_field`], which currently
/// applies the same normalization to every variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TextField {
    /// Plain text field (presumably matched as a whole string — confirm against callers).
    Text,
    /// "Contains" text field (presumably a substring match — confirm against callers).
    TextContains,
    /// Pattern text field (pattern syntax is not visible here — confirm against callers).
    TextPattern,
    /// ARIA label field (presumably the `aria-label` attribute value — confirm against callers).
    AriaLabel,
}
/// Normalizes `input` into a canonical form suitable for text comparison.
///
/// The pipeline, in order:
/// 1. decode HTML entities (`&amp;` becomes `&`, `&#34;` becomes `"`);
/// 2. drop invisible format characters (see `is_invisible_format`);
/// 3. compose to Unicode NFC (canonical only — no NFKC compatibility folding);
/// 4. collapse whitespace runs to a single space and trim both ends
///    (see `fold_whitespace`);
/// 5. lowercase with `str::to_lowercase`.
///
/// Returns a freshly allocated `String`.
pub fn normalize_text(input: &str) -> String {
    // Borrow the decoded text directly; an owned copy is not needed just to
    // iterate over its characters.
    let decoded = html_escape::decode_html_entities(input);

    // Strip invisible characters *before* composing. A zero-width character
    // sitting between a base letter and a combining mark blocks composition,
    // so stripping afterwards would leave a decomposed pair in the output
    // (e.g. "e\u{200B}\u{0301}" would yield "e\u{0301}" instead of "é"),
    // making the result non-NFC and the function non-idempotent.
    let composed: String = decoded
        .chars()
        .filter(|&ch| !is_invisible_format(ch))
        .nfc()
        .collect();

    fold_whitespace(&composed).to_lowercase()
}
/// Normalizes `input` for the given text field kind.
///
/// Every `TextField` variant currently goes through the same pipeline:
/// the `_field` argument is ignored and the call is forwarded to
/// `normalize_text`.
///
/// NOTE(review): this means `TextField::TextPattern` values are lowercased
/// and whitespace-folded like any other field. If patterns can contain
/// case-sensitive syntax, confirm that this is intended.
pub fn normalize_for_field(input: &str, _field: TextField) -> String {
// No per-field behavior yet; the selector is accepted but unused.
normalize_text(input)
}
/// Returns `true` when `c` is one of the invisible format characters that
/// normalization removes outright.
///
/// The set is fixed: zero-width space/joiners, directional marks, bidi
/// embedding/override/isolate controls, and the byte-order mark.
fn is_invisible_format(c: char) -> bool {
    matches!(
        c,
        // U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ, U+200E LRM, U+200F RLM
        '\u{200B}'..='\u{200F}'
        // U+202A LRE, U+202B RLE, U+202C PDF, U+202D LRO, U+202E RLO
        | '\u{202A}'..='\u{202E}'
        // U+2066 LRI, U+2067 RLI, U+2068 FSI, U+2069 PDI
        | '\u{2066}'..='\u{2069}'
        // U+FEFF byte-order mark / zero-width no-break space
        | '\u{FEFF}'
    )
}
/// Collapses every run of whitespace in `input` into a single ASCII space
/// and removes leading and trailing whitespace.
///
/// Only this fixed set counts as whitespace: space, tab, CR, LF, vertical
/// tab (U+000B), form feed (U+000C), no-break space (U+00A0) and narrow
/// no-break space (U+202F). Any other character is copied through unchanged.
fn fold_whitespace(input: &str) -> String {
    let is_foldable = |ch: char| {
        matches!(
            ch,
            ' ' | '\t' | '\r' | '\n' | '\u{000B}' | '\u{000C}' | '\u{00A0}' | '\u{202F}'
        )
    };

    let mut folded = String::with_capacity(input.len());
    // Splitting on each whitespace char yields empty pieces for leading,
    // trailing and repeated whitespace; skipping them leaves just the words,
    // which are then re-joined with exactly one space.
    for word in input.split(is_foldable).filter(|piece| !piece.is_empty()) {
        if !folded.is_empty() {
            folded.push(' ');
        }
        folded.push_str(word);
    }
    folded
}
/// Unit tests for `normalize_text`, one per pipeline step plus an end-to-end case.
#[cfg(test)]
mod tests {
    use super::*;

    // --- HTML entity decoding -------------------------------------------
    // The inputs must contain real entities; a bare `&` or `"` would pass
    // without exercising the decoder at all.

    #[test]
    fn decodes_named_html_entity() {
        assert_eq!(normalize_text("Save &amp; Quit"), "save & quit");
    }

    #[test]
    fn decodes_numeric_html_entity() {
        assert_eq!(normalize_text("&#34;hi&#34;"), "\"hi\"");
    }

    // --- Unicode normalization form -------------------------------------

    #[test]
    fn applies_nfc() {
        assert_eq!(normalize_text("e\u{0301}"), "é");
    }

    #[test]
    fn does_not_apply_nfkc() {
        // NFKC would turn the circled digit into a plain "1".
        assert_eq!(normalize_text("①"), "①");
    }

    // --- Invisible format characters ------------------------------------

    #[test]
    fn strips_zero_width_space() {
        assert_eq!(normalize_text("a\u{200B}b"), "ab");
    }

    #[test]
    fn strips_zero_width_joiner() {
        assert_eq!(normalize_text("a\u{200D}b"), "ab");
    }

    #[test]
    fn strips_bom() {
        assert_eq!(normalize_text("\u{FEFF}hi"), "hi");
    }

    #[test]
    fn strips_lrm_rlm() {
        assert_eq!(normalize_text("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn strips_bidi_isolate_marks() {
        assert_eq!(normalize_text("a\u{2066}b\u{2069}c"), "abc");
    }

    // --- Whitespace folding ---------------------------------------------

    #[test]
    fn folds_nbsp_to_space() {
        assert_eq!(normalize_text("a\u{00A0}b"), "a b");
    }

    #[test]
    fn folds_narrow_nbsp_to_space() {
        assert_eq!(normalize_text("a\u{202F}b"), "a b");
    }

    #[test]
    fn collapses_runs_of_whitespace() {
        assert_eq!(normalize_text("a \t b"), "a b");
    }

    #[test]
    fn trims_leading_trailing_whitespace() {
        assert_eq!(normalize_text(" hi "), "hi");
    }

    // --- Case folding ---------------------------------------------------

    #[test]
    fn case_folds_basic_ascii() {
        assert_eq!(normalize_text("Hello"), "hello");
    }

    #[test]
    fn case_folds_german_sharp_s() {
        assert_eq!(normalize_text("Straße"), "straße");
    }

    #[test]
    fn case_folds_turkish_capital_i_with_dot() {
        // `to_lowercase` maps U+0130 to "i" followed by U+0307.
        assert_eq!(normalize_text("İ"), "i\u{0307}");
    }

    // --- Typographic characters are left untouched ----------------------

    #[test]
    fn preserves_smart_quotes() {
        assert_eq!(normalize_text("\u{201C}hi\u{201D}"), "\u{201C}hi\u{201D}");
    }

    #[test]
    fn preserves_em_dash() {
        assert_eq!(normalize_text("a — b"), "a — b");
    }

    #[test]
    fn preserves_ellipsis_char() {
        assert_eq!(normalize_text("a…"), "a…");
    }

    // --- End to end -----------------------------------------------------

    #[test]
    fn full_pipeline_composes() {
        // Entity decode + whitespace fold/trim + invisible strip + lowercase.
        assert_eq!(
            normalize_text(" &amp; \u{00A0}HELLO\u{200B} "),
            "& hello"
        );
    }
}