/// Returns a copy of `string` in which every character has been passed through
/// `escape_diacritic`.
///
/// Characters that have a replacement there are substituted (possibly by more than one
/// character, or by nothing at all); every other character is copied unchanged.
pub fn remove_diacritics(string: &str) -> String {
    // The input's byte length is only an initial capacity; the buffer grows on demand when a
    // replacement is longer than the character it replaces.
    let mut stripped = String::with_capacity(string.len());
    for symbol in string.chars() {
        escape_diacritic(&mut stripped, symbol);
    }
    stripped
}
/// Appends the diacritic-free replacement for `current` to `acc`.
///
/// * Latin letters carrying diacritics, their circled forms and their fullwidth forms are
///   replaced by the plain ASCII base letter of the same case.
/// * Ligatures and digraphs that have an arm of their own are expanded to several ASCII
///   letters (for example `Œ` becomes `"OE"` and U+01C7 becomes `"LJ"`).
/// * Combining marks in U+0300–U+036F, U+1AB0–U+1AFF and U+1DC0–U+1DFF are dropped, so
///   decomposed input such as `"e\u{0300}"` collapses to `"e"`.
/// * Any other character is appended unchanged.
///
/// Fullwidth letters and the single-code-point digraphs are written as `\u{…}` escapes on
/// purpose: tools that normalise text rewrite those characters into plain ASCII (or into two
/// characters), which silently turns the pattern into a duplicate or into an invalid `char`
/// literal.
fn escape_diacritic(acc: &mut String, current: char) {
    match current {
        'A' | 'Ⓐ' | '\u{FF21}' | 'À' | 'Á' | 'Â' | 'Ầ' | 'Ấ' | 'Ẫ' | 'Ẩ' | 'Ã' | 'Ā' | 'Ă'
        | 'Ằ' | 'Ắ' | 'Ẵ' | 'Ẳ' | 'Ȧ' | 'Ǡ' | 'Ä' | 'Ǟ' | 'Ả' | 'Å' | 'Ǻ' | 'Ǎ' | 'Ȁ' | 'Ȃ'
        | 'Ạ' | 'Ậ' | 'Ặ' | 'Ḁ' | 'Ą' | 'Ⱥ' | 'Ɐ' => acc.push('A'),
        'Ꜳ' => acc.push_str("AA"),
        // The AE ligatures intentionally collapse to a single letter, unlike the other
        // ligatures in this table.
        'Æ' | 'Ǽ' | 'Ǣ' => acc.push('A'),
        'Ꜵ' => acc.push_str("AO"),
        'Ꜷ' => acc.push_str("AU"),
        'Ꜹ' | 'Ꜻ' => acc.push_str("AV"),
        'Ꜽ' => acc.push_str("AY"),
        'B' | 'Ⓑ' | '\u{FF22}' | 'Ḃ' | 'Ḅ' | 'Ḇ' | 'Ƀ' | 'Ƃ' | 'Ɓ' => acc.push('B'),
        'C' | 'Ⓒ' | '\u{FF23}' | 'Ć' | 'Ĉ' | 'Ċ' | 'Č' | 'Ç' | 'Ḉ' | 'Ƈ' | 'Ȼ' | 'Ꜿ' => {
            acc.push('C')
        }
        'D' | 'Ⓓ' | '\u{FF24}' | 'Ḋ' | 'Ď' | 'Ḍ' | 'Ḑ' | 'Ḓ' | 'Ḏ' | 'Đ' | 'Ƌ' | 'Ɗ' | 'Ɖ'
        | 'Ꝺ' => acc.push('D'),
        // U+01F1 (DZ digraph) and U+01C4 (DZ-with-caron digraph).
        '\u{01F1}' | '\u{01C4}' => acc.push_str("DZ"),
        // U+01F2 and U+01C5: the title-case forms of the two digraphs above.
        '\u{01F2}' | '\u{01C5}' => acc.push_str("Dz"),
        'E' | 'Ⓔ' | '\u{FF25}' | 'È' | 'É' | 'Ê' | 'Ề' | 'Ế' | 'Ễ' | 'Ể' | 'Ẽ' | 'Ē' | 'Ḕ'
        | 'Ḗ' | 'Ĕ' | 'Ė' | 'Ë' | 'Ẻ' | 'Ě' | 'Ȅ' | 'Ȇ' | 'Ẹ' | 'Ệ' | 'Ȩ' | 'Ḝ' | 'Ę' | 'Ḙ'
        | 'Ḛ' | 'Ɛ' | 'Ǝ' => acc.push('E'),
        'F' | 'Ⓕ' | '\u{FF26}' | 'Ḟ' | 'Ƒ' | 'Ꝼ' => acc.push('F'),
        'G' | 'Ⓖ' | '\u{FF27}' | 'Ǵ' | 'Ĝ' | 'Ḡ' | 'Ğ' | 'Ġ' | 'Ǧ' | 'Ģ' | 'Ǥ' | 'Ɠ' | 'Ꞡ'
        | 'Ᵹ' | 'Ꝿ' => acc.push('G'),
        'H' | 'Ⓗ' | '\u{FF28}' | 'Ĥ' | 'Ḣ' | 'Ḧ' | 'Ȟ' | 'Ḥ' | 'Ḩ' | 'Ḫ' | 'Ħ' | 'Ⱨ' | 'Ⱶ'
        | 'Ɥ' => acc.push('H'),
        'I' | 'Ⓘ' | '\u{FF29}' | 'Ì' | 'Í' | 'Î' | 'Ĩ' | 'Ī' | 'Ĭ' | 'İ' | 'Ï' | 'Ḯ' | 'Ỉ'
        | 'Ǐ' | 'Ȉ' | 'Ȋ' | 'Ị' | 'Į' | 'Ḭ' | 'Ɨ' => acc.push('I'),
        'J' | 'Ⓙ' | '\u{FF2A}' | 'Ĵ' | 'Ɉ' => acc.push('J'),
        'K' | 'Ⓚ' | '\u{FF2B}' | 'Ḱ' | 'Ǩ' | 'Ḳ' | 'Ķ' | 'Ḵ' | 'Ƙ' | 'Ⱪ' | 'Ꝁ' | 'Ꝃ' | 'Ꝅ'
        | 'Ꞣ' => acc.push('K'),
        'L' | 'Ⓛ' | '\u{FF2C}' | 'Ŀ' | 'Ĺ' | 'Ľ' | 'Ḷ' | 'Ḹ' | 'Ļ' | 'Ḽ' | 'Ḻ' | 'Ł' | 'Ƚ'
        | 'Ɫ' | 'Ⱡ' | 'Ꝉ' | 'Ꝇ' | 'Ꞁ' => acc.push('L'),
        // U+01C7 (LJ digraph) and U+01C8 (its title-case form).
        '\u{01C7}' => acc.push_str("LJ"),
        '\u{01C8}' => acc.push_str("Lj"),
        'M' | 'Ⓜ' | '\u{FF2D}' | 'Ḿ' | 'Ṁ' | 'Ṃ' | 'Ɱ' | 'Ɯ' => acc.push('M'),
        'N' | 'Ⓝ' | '\u{FF2E}' | 'Ǹ' | 'Ń' | 'Ñ' | 'Ṅ' | 'Ň' | 'Ṇ' | 'Ņ' | 'Ṋ' | 'Ṉ' | 'Ƞ'
        | 'Ɲ' | 'Ꞑ' | 'Ꞥ' => acc.push('N'),
        // U+01CA (NJ digraph) and U+01CB (its title-case form).
        '\u{01CA}' => acc.push_str("NJ"),
        '\u{01CB}' => acc.push_str("Nj"),
        'O' | 'Ⓞ' | '\u{FF2F}' | 'Ò' | 'Ó' | 'Ô' | 'Ồ' | 'Ố' | 'Ỗ' | 'Ổ' | 'Õ' | 'Ṍ' | 'Ȭ'
        | 'Ṏ' | 'Ō' | 'Ṑ' | 'Ṓ' | 'Ŏ' | 'Ȯ' | 'Ȱ' | 'Ö' | 'Ȫ' | 'Ỏ' | 'Ő' | 'Ǒ' | 'Ȍ' | 'Ȏ'
        | 'Ơ' | 'Ờ' | 'Ớ' | 'Ỡ' | 'Ở' | 'Ợ' | 'Ọ' | 'Ộ' | 'Ǫ' | 'Ǭ' | 'Ø' | 'Ǿ' | 'Ɔ' | 'Ɵ'
        | 'Ꝋ' | 'Ꝍ' => acc.push('O'),
        'Ƣ' => acc.push_str("OI"),
        'Ꝏ' => acc.push_str("OO"),
        'Ȣ' => acc.push_str("OU"),
        '\u{008C}' | 'Œ' => acc.push_str("OE"),
        '\u{009C}' | 'œ' => acc.push_str("oe"),
        'P' | 'Ⓟ' | '\u{FF30}' | 'Ṕ' | 'Ṗ' | 'Ƥ' | 'Ᵽ' | 'Ꝑ' | 'Ꝓ' | 'Ꝕ' => acc.push('P'),
        'Q' | 'Ⓠ' | '\u{FF31}' | 'Ꝗ' | 'Ꝙ' | 'Ɋ' => acc.push('Q'),
        'R' | 'Ⓡ' | '\u{FF32}' | 'Ŕ' | 'Ṙ' | 'Ř' | 'Ȑ' | 'Ȓ' | 'Ṛ' | 'Ṝ' | 'Ŗ' | 'Ṟ' | 'Ɍ'
        | 'Ɽ' | 'Ꝛ' | 'Ꞧ' | 'Ꞃ' => acc.push('R'),
        'S' | 'Ⓢ' | '\u{FF33}' | 'ẞ' | 'Ś' | 'Ṥ' | 'Ŝ' | 'Ṡ' | 'Š' | 'Ṧ' | 'Ṣ' | 'Ṩ' | 'Ș'
        | 'Ş' | 'Ȿ' | 'Ꞩ' | 'Ꞅ' => acc.push('S'),
        'T' | 'Ⓣ' | '\u{FF34}' | 'Ṫ' | 'Ť' | 'Ṭ' | 'Ț' | 'Ţ' | 'Ṱ' | 'Ṯ' | 'Ŧ' | 'Ƭ' | 'Ʈ'
        | 'Ⱦ' | 'Ꞇ' => acc.push('T'),
        'Ꜩ' => acc.push_str("TZ"),
        'U' | 'Ⓤ' | '\u{FF35}' | 'Ù' | 'Ú' | 'Û' | 'Ũ' | 'Ṹ' | 'Ū' | 'Ṻ' | 'Ŭ' | 'Ü' | 'Ǜ'
        | 'Ǘ' | 'Ǖ' | 'Ǚ' | 'Ủ' | 'Ů' | 'Ű' | 'Ǔ' | 'Ȕ' | 'Ȗ' | 'Ư' | 'Ừ' | 'Ứ' | 'Ữ' | 'Ử'
        | 'Ự' | 'Ụ' | 'Ṳ' | 'Ų' | 'Ṷ' | 'Ṵ' | 'Ʉ' => acc.push('U'),
        'V' | 'Ⓥ' | '\u{FF36}' | 'Ṽ' | 'Ṿ' | 'Ʋ' | 'Ꝟ' | 'Ʌ' => acc.push('V'),
        'Ꝡ' => acc.push_str("VY"),
        'W' | 'Ⓦ' | '\u{FF37}' | 'Ẁ' | 'Ẃ' | 'Ŵ' | 'Ẇ' | 'Ẅ' | 'Ẉ' | 'Ⱳ' => acc.push('W'),
        'X' | 'Ⓧ' | '\u{FF38}' | 'Ẋ' | 'Ẍ' => acc.push('X'),
        'Y' | 'Ⓨ' | '\u{FF39}' | 'Ỳ' | 'Ý' | 'Ŷ' | 'Ỹ' | 'Ȳ' | 'Ẏ' | 'Ÿ' | 'Ỷ' | 'Ỵ' | 'Ƴ'
        | 'Ɏ' | 'Ỿ' => acc.push('Y'),
        'Z' | 'Ⓩ' | '\u{FF3A}' | 'Ź' | 'Ẑ' | 'Ż' | 'Ž' | 'Ẓ' | 'Ẕ' | 'Ƶ' | 'Ȥ' | 'Ɀ' | 'Ⱬ'
        | 'Ꝣ' => acc.push('Z'),
        'a' | 'ⓐ' | '\u{FF41}' | 'ẚ' | 'à' | 'á' | 'â' | 'ầ' | 'ấ' | 'ẫ' | 'ẩ' | 'ã' | 'ā'
        | 'ă' | 'ằ' | 'ắ' | 'ẵ' | 'ẳ' | 'ȧ' | 'ǡ' | 'ä' | 'ǟ' | 'ả' | 'å' | 'ǻ' | 'ǎ' | 'ȁ'
        | 'ȃ' | 'ạ' | 'ậ' | 'ặ' | 'ḁ' | 'ą' | 'ⱥ' | 'ɐ' => acc.push('a'),
        'ꜳ' => acc.push_str("aa"),
        // Lowercase counterpart of the single-letter AE mapping above.
        'æ' | 'ǽ' | 'ǣ' => acc.push('a'),
        'ꜵ' => acc.push_str("ao"),
        'ꜷ' => acc.push_str("au"),
        'ꜹ' | 'ꜻ' => acc.push_str("av"),
        'ꜽ' => acc.push_str("ay"),
        'b' | 'ⓑ' | '\u{FF42}' | 'ḃ' | 'ḅ' | 'ḇ' | 'ƀ' | 'ƃ' | 'ɓ' | 'þ' => acc.push('b'),
        'c' | 'ⓒ' | '\u{FF43}' | 'ć' | 'ĉ' | 'ċ' | 'č' | 'ç' | 'ḉ' | 'ƈ' | 'ȼ' | 'ꜿ' | 'ↄ' => {
            acc.push('c')
        }
        'd' | 'ⓓ' | '\u{FF44}' | 'ḋ' | 'ď' | 'ḍ' | 'ḑ' | 'ḓ' | 'ḏ' | 'đ' | 'ƌ' | 'ɖ' | 'ɗ'
        | 'ꝺ' => acc.push('d'),
        // U+01F3 (dz digraph) and U+01C6 (dz-with-caron digraph).
        '\u{01F3}' | '\u{01C6}' => acc.push_str("dz"),
        'e' | 'ⓔ' | '\u{FF45}' | 'è' | 'é' | 'ê' | 'ề' | 'ế' | 'ễ' | 'ể' | 'ẽ' | 'ē' | 'ḕ'
        | 'ḗ' | 'ĕ' | 'ė' | 'ë' | 'ẻ' | 'ě' | 'ȅ' | 'ȇ' | 'ẹ' | 'ệ' | 'ȩ' | 'ḝ' | 'ę' | 'ḙ'
        | 'ḛ' | 'ɇ' | 'ɛ' | 'ǝ' => acc.push('e'),
        'f' | 'ⓕ' | '\u{FF46}' | 'ḟ' | 'ƒ' | 'ꝼ' => acc.push('f'),
        'g' | 'ⓖ' | '\u{FF47}' | 'ǵ' | 'ĝ' | 'ḡ' | 'ğ' | 'ġ' | 'ǧ' | 'ģ' | 'ǥ' | 'ɠ' | 'ꞡ'
        | 'ᵹ' | 'ꝿ' => acc.push('g'),
        'h' | 'ⓗ' | '\u{FF48}' | 'ĥ' | 'ḣ' | 'ḧ' | 'ȟ' | 'ḥ' | 'ḩ' | 'ḫ' | 'ẖ' | 'ħ' | 'ⱨ'
        | 'ⱶ' | 'ɥ' => acc.push('h'),
        'ƕ' => acc.push_str("hv"),
        'i' | 'ⓘ' | '\u{FF49}' | 'ì' | 'í' | 'î' | 'ĩ' | 'ī' | 'ĭ' | 'ï' | 'ḯ' | 'ỉ' | 'ǐ'
        | 'ȉ' | 'ȋ' | 'ị' | 'į' | 'ḭ' | 'ɨ' | 'ı' => acc.push('i'),
        'j' | 'ⓙ' | '\u{FF4A}' | 'ĵ' | 'ǰ' | 'ɉ' => acc.push('j'),
        'k' | 'ⓚ' | '\u{FF4B}' | 'ḱ' | 'ǩ' | 'ḳ' | 'ķ' | 'ḵ' | 'ƙ' | 'ⱪ' | 'ꝁ' | 'ꝃ' | 'ꝅ'
        | 'ꞣ' => acc.push('k'),
        'l' | 'ⓛ' | '\u{FF4C}' | 'ŀ' | 'ĺ' | 'ľ' | 'ḷ' | 'ḹ' | 'ļ' | 'ḽ' | 'ḻ' | 'ſ' | 'ł'
        | 'ƚ' | 'ɫ' | 'ⱡ' | 'ꝉ' | 'ꞁ' | 'ꝇ' => acc.push('l'),
        // U+01C9 (lj digraph).
        '\u{01C9}' => acc.push_str("lj"),
        'm' | 'ⓜ' | '\u{FF4D}' | 'ḿ' | 'ṁ' | 'ṃ' | 'ɱ' | 'ɯ' => acc.push('m'),
        // U+0149 is the single-code-point "n preceded by apostrophe".
        'n' | 'ⓝ' | '\u{FF4E}' | 'ǹ' | 'ń' | 'ñ' | 'ṅ' | 'ň' | 'ṇ' | 'ņ' | 'ṋ' | 'ṉ' | 'ƞ'
        | 'ɲ' | '\u{0149}' | 'ꞑ' | 'ꞥ' => acc.push('n'),
        // U+01CC (nj digraph).
        '\u{01CC}' => acc.push_str("nj"),
        'o' | 'ⓞ' | '\u{FF4F}' | 'ò' | 'ó' | 'ô' | 'ồ' | 'ố' | 'ỗ' | 'ổ' | 'õ' | 'ṍ' | 'ȭ'
        | 'ṏ' | 'ō' | 'ṑ' | 'ṓ' | 'ŏ' | 'ȯ' | 'ȱ' | 'ö' | 'ȫ' | 'ỏ' | 'ő' | 'ǒ' | 'ȍ' | 'ȏ'
        | 'ơ' | 'ờ' | 'ớ' | 'ỡ' | 'ở' | 'ợ' | 'ọ' | 'ộ' | 'ǫ' | 'ǭ' | 'ø' | 'ǿ' | 'ɔ' | 'ꝋ'
        | 'ꝍ' | 'ɵ' => acc.push('o'),
        'ƣ' => acc.push_str("oi"),
        'ȣ' => acc.push_str("ou"),
        'ꝏ' => acc.push_str("oo"),
        'p' | 'ⓟ' | '\u{FF50}' | 'ṕ' | 'ṗ' | 'ƥ' | 'ᵽ' | 'ꝑ' | 'ꝓ' | 'ꝕ' => acc.push('p'),
        'q' | 'ⓠ' | '\u{FF51}' | 'ɋ' | 'ꝗ' | 'ꝙ' => acc.push('q'),
        'r' | 'ⓡ' | '\u{FF52}' | 'ŕ' | 'ṙ' | 'ř' | 'ȑ' | 'ȓ' | 'ṛ' | 'ṝ' | 'ŗ' | 'ṟ' | 'ɍ'
        | 'ɽ' | 'ꝛ' | 'ꞧ' | 'ꞃ' => acc.push('r'),
        's' | 'ⓢ' | '\u{FF53}' | 'ß' | 'ś' | 'ṥ' | 'ŝ' | 'ṡ' | 'š' | 'ṧ' | 'ṣ' | 'ṩ' | 'ș'
        | 'ş' | 'ȿ' | 'ꞩ' | 'ꞅ' | 'ẛ' => acc.push('s'),
        't' | 'ⓣ' | '\u{FF54}' | 'ṫ' | 'ẗ' | 'ť' | 'ṭ' | 'ț' | 'ţ' | 'ṱ' | 'ṯ' | 'ŧ' | 'ƭ'
        | 'ʈ' | 'ⱦ' | 'ꞇ' => acc.push('t'),
        'ꜩ' => acc.push_str("tz"),
        'u' | 'ⓤ' | '\u{FF55}' | 'ù' | 'ú' | 'û' | 'ũ' | 'ṹ' | 'ū' | 'ṻ' | 'ŭ' | 'ü' | 'ǜ'
        | 'ǘ' | 'ǖ' | 'ǚ' | 'ủ' | 'ů' | 'ű' | 'ǔ' | 'ȕ' | 'ȗ' | 'ư' | 'ừ' | 'ứ' | 'ữ' | 'ử'
        | 'ự' | 'ụ' | 'ṳ' | 'ų' | 'ṷ' | 'ṵ' | 'ʉ' => acc.push('u'),
        'v' | 'ⓥ' | '\u{FF56}' | 'ṽ' | 'ṿ' | 'ʋ' | 'ꝟ' | 'ʌ' => acc.push('v'),
        'ꝡ' => acc.push_str("vy"),
        'w' | 'ⓦ' | '\u{FF57}' | 'ẁ' | 'ẃ' | 'ŵ' | 'ẇ' | 'ẅ' | 'ẘ' | 'ẉ' | 'ⱳ' => {
            acc.push('w')
        }
        'x' | 'ⓧ' | '\u{FF58}' | 'ẋ' | 'ẍ' => acc.push('x'),
        'y' | 'ⓨ' | '\u{FF59}' | 'ỳ' | 'ý' | 'ŷ' | 'ỹ' | 'ȳ' | 'ẏ' | 'ÿ' | 'ỷ' | 'ẙ' | 'ỵ'
        | 'ƴ' | 'ɏ' | 'ỿ' => acc.push('y'),
        'z' | 'ⓩ' | '\u{FF5A}' | 'ź' | 'ẑ' | 'ż' | 'ž' | 'ẓ' | 'ẕ' | 'ƶ' | 'ȥ' | 'ɀ' | 'ⱬ'
        | 'ꝣ' => acc.push('z'),
        // Combining marks contribute nothing to the output, which strips the accents from
        // decomposed (base letter + mark) input.
        '\u{0300}'..='\u{036F}' | '\u{1AB0}'..='\u{1AFF}' | '\u{1DC0}'..='\u{1DFF}' => {}
        _ => acc.push(current),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Accented and ligature capitals reduce to their plain ASCII capitals.
    #[test]
    fn test_uppercase() {
        let stripped = remove_diacritics("TÅRÖÄÆØ");
        assert_eq!(stripped, "TAROAAO");
    }

    /// Accented lowercase letters reduce to their plain ASCII counterparts.
    #[test]
    fn test_lowercase() {
        let stripped = remove_diacritics("čďêƒíó");
        assert_eq!(stripped, "cdefio");
    }

    /// An accented letter and a base letter followed by a combining mark give the same result.
    #[test]
    fn test_real_diacritics() {
        let accented = remove_diacritics("é");
        assert_eq!(accented, "e");

        let with_combining_mark = remove_diacritics("e\u{300}");
        assert_eq!(with_combining_mark, "e");
    }
}