use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
/// Normalizes a name into a canonical key for accent-, case- and
/// punctuation-insensitive comparison.
///
/// The transformation applies, in order:
///
/// 1. Canonical decomposition (NFD), after which every combining mark is
///    dropped, so `"Khârn"` and `"Kharn"` produce the same key.
/// 2. Per-character lowercasing via [`char::to_lowercase`].
/// 3. Removal of apostrophes, backticks and double quotes, in both their
///    ASCII and curly (U+2018, U+2019, U+201C, U+201D) forms, so `"T'au"`
///    becomes `"tau"`.
/// 4. Separator folding: any run of whitespace and/or hyphens collapses to a
///    single ASCII space, and leading or trailing runs are discarded, so
///    `"space--marines"` becomes `"space marines"`. Hyphens are ASCII `-`
///    plus the typographic hyphens and dashes U+2010..=U+2015.
///
/// Returns a newly allocated `String`; an input with no surviving characters
/// yields the empty string.
pub fn normalize_name(input: &str) -> String {
    // `input.len()` is only a starting estimate: decomposition can lengthen
    // the text and the stripping steps shorten it.
    let mut out = String::with_capacity(input.len());
    // True while a separator run has been seen since the last emitted
    // character. The space is written lazily, just before the next kept
    // character, so trailing separators never reach the output.
    let mut pending_space = false;

    let folded = input
        .nfd()
        .filter(|c| !is_combining_mark(*c))
        .flat_map(char::to_lowercase);

    for c in folded {
        match c {
            // Quote-like characters are deleted outright; they neither emit
            // anything nor interrupt a pending separator run.
            '\'' | '`' | '"' | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' => {}
            // Word separators. U+2010..=U+2015 covers HYPHEN, NON-BREAKING
            // HYPHEN, FIGURE DASH, EN DASH, EM DASH and HORIZONTAL BAR, so
            // typographic hyphens behave like the ASCII one.
            _ if c.is_whitespace() || matches!(c, '-' | '\u{2010}'..='\u{2015}') => {
                // Separators before the first kept character are ignored.
                if !out.is_empty() {
                    pending_space = true;
                }
            }
            _ => {
                if pending_space {
                    out.push(' ');
                    pending_space = false;
                }
                out.push(c);
            }
        }
    }

    out
}
/// Unit tests for [`normalize_name`].
#[cfg(test)]
mod tests {
    use super::normalize_name;

    /// Accented letters compare equal to their unaccented base letters,
    /// whether the input is precomposed or already decomposed.
    #[test]
    fn strips_diacritics() {
        assert_eq!(normalize_name("Khârn the Betrayer"), "kharn the betrayer");
        assert_eq!(normalize_name("Magnús"), "magnus");
        // Base letter followed by U+0302 COMBINING CIRCUMFLEX ACCENT.
        assert_eq!(normalize_name("Kha\u{0302}rn"), "kharn");
    }

    /// Output is lowercased, including letters that carried an accent.
    #[test]
    fn lowercases() {
        assert_eq!(normalize_name("KHORNE"), "khorne");
        assert_eq!(normalize_name("KHÂRN"), "kharn");
    }

    /// Apostrophes, backticks and double quotes (ASCII and curly) are removed
    /// without introducing a word break.
    #[test]
    fn removes_quote_variants() {
        assert_eq!(normalize_name("T'au"), "tau");
        assert_eq!(normalize_name("Be'lakor"), "belakor");
        assert_eq!(normalize_name("\u{2018}quoted\u{2019}"), "quoted");
        assert_eq!(normalize_name("\u{201C}quoted\u{201D}"), "quoted");
        assert_eq!(normalize_name("\"quoted\""), "quoted");
        assert_eq!(normalize_name("`quoted`"), "quoted");
    }

    /// Runs of whitespace and/or hyphens fold to one space; leading and
    /// trailing runs vanish.
    #[test]
    fn collapses_whitespace_and_hyphens() {
        assert_eq!(normalize_name("the betrayer"), "the betrayer");
        assert_eq!(normalize_name("the   betrayer"), "the betrayer");
        assert_eq!(normalize_name("the\t\n betrayer"), "the betrayer");
        assert_eq!(normalize_name("space--marines"), "space marines");
        assert_eq!(normalize_name("space - marines"), "space marines");
        assert_eq!(
            normalize_name(" leading and trailing "),
            "leading and trailing"
        );
        assert_eq!(
            normalize_name("  \tleading and trailing \n "),
            "leading and trailing"
        );
        assert_eq!(normalize_name("--leading-hyphen-"), "leading hyphen");
    }

    /// Inputs with nothing left after stripping normalize to the empty string.
    #[test]
    fn empty_and_separator_only_inputs() {
        assert_eq!(normalize_name(""), "");
        assert_eq!(normalize_name("   "), "");
        assert_eq!(normalize_name("-'-"), "");
    }

    /// Normalization must not merge names that differ in their base letters.
    #[test]
    fn distinct_names_stay_distinct() {
        assert_ne!(normalize_name("Khorne"), normalize_name("Khârn"));
    }
}