1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
//! Used for removing diacritics from a string.
//!
//! # Examples
//!
//! ```
//! let new_string = diacritics::remove_diacritics("TÅRÖÄÆØ");
//! assert_eq!("TAROAAO", new_string);
//! ```
//!

/// Strips diacritics and other unusual letter forms from `string` and returns the
/// resulting, more standardized text as a new `String`.
///
/// The input is processed one `char` at a time; each character is either replaced,
/// dropped, or copied over unchanged.
///
/// For a full list of transformations, you can view the [source code](https://github.com/YesSeri/diacritics/blob/main/src/lib.rs).
///
/// # Examples
///
/// ```
/// let new_string = diacritics::remove_diacritics("TÅRÖÄÆØ");
/// assert_eq!("TAROAAO", new_string);
/// ```
pub fn remove_diacritics(string: &str) -> String {
    // Reserve the input's byte length up front so the buffer starts out pre-sized.
    let mut cleaned = String::with_capacity(string.len());
    for letter in string.chars() {
        escape_diacritic(&mut cleaned, letter);
    }
    cleaned
}
/// Appends the replacement for a single character to `acc`.
///
/// * A character listed in one of the letter arms is replaced by the ASCII letter, or short
///   ASCII letter sequence (e.g. `"OE"`, `"tz"`), that the arm pushes.
/// * A character in one of the combining-mark ranges `U+0300..=U+036F`, `U+1AB0..=U+1AFF`
///   or `U+1DC0..=U+1DFF` is dropped: nothing is pushed.
/// * Any other character is pushed unchanged.
fn escape_diacritic(acc: &mut String, current: char) {
    match current {
        // ---- Uppercase letters and their multi-letter expansions ----
        'A' | 'Ⓐ' | 'A' | 'À' | 'Á' | 'Â' | 'Ầ' | 'Ấ' | 'Ẫ' | 'Ẩ' | 'Ã' | 'Ā' | 'Ă' | 'Ằ'
        | 'Ắ' | 'Ẵ' | 'Ẳ' | 'Ȧ' | 'Ǡ' | 'Ä' | 'Ǟ' | 'Ả' | 'Å' | 'Ǻ' | 'Ǎ' | 'Ȁ' | 'Ȃ' | 'Ạ'
        | 'Ậ' | 'Ặ' | 'Ḁ' | 'Ą' | 'Ⱥ' | 'Ɐ' => acc.push('A'),
        'Ꜳ' => acc.push_str("AA"),
        // The AE letters collapse to a single 'A' (not "AE"); the tests in this file
        // expect "TÅRÖÄÆØ" to become "TAROAAO".
        'Æ' | 'Ǽ' | 'Ǣ' => acc.push('A'),
        'Ꜵ' => acc.push_str("AO"),
        'Ꜷ' => acc.push_str("AU"),
        'Ꜹ' | 'Ꜻ' => acc.push_str("AV"),
        'Ꜽ' => acc.push_str("AY"),
        'B' | 'Ⓑ' | 'B' | 'Ḃ' | 'Ḅ' | 'Ḇ' | 'Ƀ' | 'Ƃ' | 'Ɓ' => acc.push('B'),
        'C' | 'Ⓒ' | 'C' | 'Ć' | 'Ĉ' | 'Ċ' | 'Č' | 'Ç' | 'Ḉ' | 'Ƈ' | 'Ȼ' | 'Ꜿ' => {
            acc.push('C')
        }
        'D' | 'Ⓓ' | 'D' | 'Ḋ' | 'Ď' | 'Ḍ' | 'Ḑ' | 'Ḓ' | 'Ḏ' | 'Đ' | 'Ƌ' | 'Ɗ' | 'Ɖ' | 'Ꝺ' => {
            acc.push('D')
        }
        'DZ' | 'DŽ' => acc.push_str("DZ"),
        'Dz' | 'Dž' => acc.push_str("Dz"),
        'E' | 'Ⓔ' | 'E' | 'È' | 'É' | 'Ê' | 'Ề' | 'Ế' | 'Ễ' | 'Ể' | 'Ẽ' | 'Ē' | 'Ḕ' | 'Ḗ'
        | 'Ĕ' | 'Ė' | 'Ë' | 'Ẻ' | 'Ě' | 'Ȅ' | 'Ȇ' | 'Ẹ' | 'Ệ' | 'Ȩ' | 'Ḝ' | 'Ę' | 'Ḙ' | 'Ḛ'
        | 'Ɛ' | 'Ǝ' => acc.push('E'),
        'F' | 'Ⓕ' | 'F' | 'Ḟ' | 'Ƒ' | 'Ꝼ' => acc.push('F'),
        'G' | 'Ⓖ' | 'G' | 'Ǵ' | 'Ĝ' | 'Ḡ' | 'Ğ' | 'Ġ' | 'Ǧ' | 'Ģ' | 'Ǥ' | 'Ɠ' | 'Ꞡ' | 'Ᵹ'
        | 'Ꝿ' => acc.push('G'),
        'H' | 'Ⓗ' | 'H' | 'Ĥ' | 'Ḣ' | 'Ḧ' | 'Ȟ' | 'Ḥ' | 'Ḩ' | 'Ḫ' | 'Ħ' | 'Ⱨ' | 'Ⱶ' | 'Ɥ' => {
            acc.push('H')
        }
        'I' | 'Ⓘ' | 'I' | 'Ì' | 'Í' | 'Î' | 'Ĩ' | 'Ī' | 'Ĭ' | 'İ' | 'Ï' | 'Ḯ' | 'Ỉ' | 'Ǐ'
        | 'Ȉ' | 'Ȋ' | 'Ị' | 'Į' | 'Ḭ' | 'Ɨ' => acc.push('I'),
        'J' | 'Ⓙ' | 'J' | 'Ĵ' | 'Ɉ' => acc.push('J'),
        'K' | 'Ⓚ' | 'K' | 'Ḱ' | 'Ǩ' | 'Ḳ' | 'Ķ' | 'Ḵ' | 'Ƙ' | 'Ⱪ' | 'Ꝁ' | 'Ꝃ' | 'Ꝅ' | 'Ꞣ' => {
            acc.push('K')
        }
        'L' | 'Ⓛ' | 'L' | 'Ŀ' | 'Ĺ' | 'Ľ' | 'Ḷ' | 'Ḹ' | 'Ļ' | 'Ḽ' | 'Ḻ' | 'Ł' | 'Ƚ' | 'Ɫ'
        | 'Ⱡ' | 'Ꝉ' | 'Ꝇ' | 'Ꞁ' => acc.push('L'),
        'LJ' => acc.push_str("LJ"),
        'Lj' => acc.push_str("Lj"),
        'M' | 'Ⓜ' | 'M' | 'Ḿ' | 'Ṁ' | 'Ṃ' | 'Ɱ' | 'Ɯ' => acc.push('M'),
        'N' | 'Ⓝ' | 'N' | 'Ǹ' | 'Ń' | 'Ñ' | 'Ṅ' | 'Ň' | 'Ṇ' | 'Ņ' | 'Ṋ' | 'Ṉ' | 'Ƞ' | 'Ɲ'
        | 'Ꞑ' | 'Ꞥ' => acc.push('N'),
        'NJ' => acc.push_str("NJ"),
        'Nj' => acc.push_str("Nj"),
        'O' | 'Ⓞ' | 'O' | 'Ò' | 'Ó' | 'Ô' | 'Ồ' | 'Ố' | 'Ỗ' | 'Ổ' | 'Õ' | 'Ṍ' | 'Ȭ' | 'Ṏ'
        | 'Ō' | 'Ṑ' | 'Ṓ' | 'Ŏ' | 'Ȯ' | 'Ȱ' | 'Ö' | 'Ȫ' | 'Ỏ' | 'Ő' | 'Ǒ' | 'Ȍ' | 'Ȏ' | 'Ơ'
        | 'Ờ' | 'Ớ' | 'Ỡ' | 'Ở' | 'Ợ' | 'Ọ' | 'Ộ' | 'Ǫ' | 'Ǭ' | 'Ø' | 'Ǿ' | 'Ɔ' | 'Ɵ' | 'Ꝋ'
        | 'Ꝍ' => acc.push('O'),
        'Ƣ' => acc.push_str("OI"),
        'Ꝏ' => acc.push_str("OO"),
        'Ȣ' => acc.push_str("OU"),
        // NOTE(review): U+008C / U+009C are C1 control code points; presumably listed because
        // bytes 0x8C / 0x9C encode Œ / œ in Windows-1252 and may show up in mis-decoded
        // text — TODO confirm.
        '\u{008C}' | 'Œ' => acc.push_str("OE"),
        '\u{009C}' | 'œ' => acc.push_str("oe"),
        'P' | 'Ⓟ' | 'P' | 'Ṕ' | 'Ṗ' | 'Ƥ' | 'Ᵽ' | 'Ꝑ' | 'Ꝓ' | 'Ꝕ' => acc.push('P'),
        'Q' | 'Ⓠ' | 'Q' | 'Ꝗ' | 'Ꝙ' | 'Ɋ' => acc.push('Q'),
        'R' | 'Ⓡ' | 'R' | 'Ŕ' | 'Ṙ' | 'Ř' | 'Ȑ' | 'Ȓ' | 'Ṛ' | 'Ṝ' | 'Ŗ' | 'Ṟ' | 'Ɍ' | 'Ɽ'
        | 'Ꝛ' | 'Ꞧ' | 'Ꞃ' => acc.push('R'),
        'S' | 'Ⓢ' | 'S' | 'ẞ' | 'Ś' | 'Ṥ' | 'Ŝ' | 'Ṡ' | 'Š' | 'Ṧ' | 'Ṣ' | 'Ṩ' | 'Ș' | 'Ş'
        | 'Ȿ' | 'Ꞩ' | 'Ꞅ' => acc.push('S'),
        'T' | 'Ⓣ' | 'T' | 'Ṫ' | 'Ť' | 'Ṭ' | 'Ț' | 'Ţ' | 'Ṱ' | 'Ṯ' | 'Ŧ' | 'Ƭ' | 'Ʈ' | 'Ⱦ'
        | 'Ꞇ' => acc.push('T'),
        'Ꜩ' => acc.push_str("TZ"),
        'U' | 'Ⓤ' | 'U' | 'Ù' | 'Ú' | 'Û' | 'Ũ' | 'Ṹ' | 'Ū' | 'Ṻ' | 'Ŭ' | 'Ü' | 'Ǜ' | 'Ǘ'
        | 'Ǖ' | 'Ǚ' | 'Ủ' | 'Ů' | 'Ű' | 'Ǔ' | 'Ȕ' | 'Ȗ' | 'Ư' | 'Ừ' | 'Ứ' | 'Ữ' | 'Ử' | 'Ự'
        | 'Ụ' | 'Ṳ' | 'Ų' | 'Ṷ' | 'Ṵ' | 'Ʉ' => acc.push('U'),
        'V' | 'Ⓥ' | 'V' | 'Ṽ' | 'Ṿ' | 'Ʋ' | 'Ꝟ' | 'Ʌ' => acc.push('V'),
        'Ꝡ' => acc.push_str("VY"),
        'W' | 'Ⓦ' | 'W' | 'Ẁ' | 'Ẃ' | 'Ŵ' | 'Ẇ' | 'Ẅ' | 'Ẉ' | 'Ⱳ' => acc.push('W'),
        'X' | 'Ⓧ' | 'X' | 'Ẋ' | 'Ẍ' => acc.push('X'),
        'Y' | 'Ⓨ' | 'Y' | 'Ỳ' | 'Ý' | 'Ŷ' | 'Ỹ' | 'Ȳ' | 'Ẏ' | 'Ÿ' | 'Ỷ' | 'Ỵ' | 'Ƴ' | 'Ɏ'
        | 'Ỿ' => acc.push('Y'),
        'Z' | 'Ⓩ' | 'Z' | 'Ź' | 'Ẑ' | 'Ż' | 'Ž' | 'Ẓ' | 'Ẕ' | 'Ƶ' | 'Ȥ' | 'Ɀ' | 'Ⱬ' | 'Ꝣ' => {
            acc.push('Z')
        }
        // ---- Lowercase letters and their multi-letter expansions ----
        'a' | 'ⓐ' | 'a' | 'ẚ' | 'à' | 'á' | 'â' | 'ầ' | 'ấ' | 'ẫ' | 'ẩ' | 'ã' | 'ā' | 'ă'
        | 'ằ' | 'ắ' | 'ẵ' | 'ẳ' | 'ȧ' | 'ǡ' | 'ä' | 'ǟ' | 'ả' | 'å' | 'ǻ' | 'ǎ' | 'ȁ' | 'ȃ'
        | 'ạ' | 'ậ' | 'ặ' | 'ḁ' | 'ą' | 'ⱥ' | 'ɐ' => acc.push('a'),
        'ꜳ' => acc.push_str("aa"),
        // As with the uppercase forms, the ae letters collapse to a single 'a'.
        'æ' | 'ǽ' | 'ǣ' => acc.push('a'),
        'ꜵ' => acc.push_str("ao"),
        'ꜷ' => acc.push_str("au"),
        'ꜹ' | 'ꜻ' => acc.push_str("av"),
        'ꜽ' => acc.push_str("ay"),
        // Note: thorn ('þ') is folded into 'b' by this arm.
        'b' | 'ⓑ' | 'b' | 'ḃ' | 'ḅ' | 'ḇ' | 'ƀ' | 'ƃ' | 'ɓ' | 'þ' => acc.push('b'),
        'c' | 'ⓒ' | 'c' | 'ć' | 'ĉ' | 'ċ' | 'č' | 'ç' | 'ḉ' | 'ƈ' | 'ȼ' | 'ꜿ' | 'ↄ' => {
            acc.push('c')
        }
        'd' | 'ⓓ' | 'd' | 'ḋ' | 'ď' | 'ḍ' | 'ḑ' | 'ḓ' | 'ḏ' | 'đ' | 'ƌ' | 'ɖ' | 'ɗ' | 'ꝺ' => {
            acc.push('d')
        }
        'dz' | 'dž' => acc.push_str("dz"),
        'e' | 'ⓔ' | 'e' | 'è' | 'é' | 'ê' | 'ề' | 'ế' | 'ễ' | 'ể' | 'ẽ' | 'ē' | 'ḕ' | 'ḗ'
        | 'ĕ' | 'ė' | 'ë' | 'ẻ' | 'ě' | 'ȅ' | 'ȇ' | 'ẹ' | 'ệ' | 'ȩ' | 'ḝ' | 'ę' | 'ḙ' | 'ḛ'
        | 'ɇ' | 'ɛ' | 'ǝ' => acc.push('e'),
        'f' | 'ⓕ' | 'f' | 'ḟ' | 'ƒ' | 'ꝼ' => acc.push('f'),
        'g' | 'ⓖ' | 'g' | 'ǵ' | 'ĝ' | 'ḡ' | 'ğ' | 'ġ' | 'ǧ' | 'ģ' | 'ǥ' | 'ɠ' | 'ꞡ' | 'ᵹ'
        | 'ꝿ' => acc.push('g'),
        'h' | 'ⓗ' | 'h' | 'ĥ' | 'ḣ' | 'ḧ' | 'ȟ' | 'ḥ' | 'ḩ' | 'ḫ' | 'ẖ' | 'ħ' | 'ⱨ' | 'ⱶ'
        | 'ɥ' => acc.push('h'),
        'ƕ' => acc.push_str("hv"),
        'i' | 'ⓘ' | 'i' | 'ì' | 'í' | 'î' | 'ĩ' | 'ī' | 'ĭ' | 'ï' | 'ḯ' | 'ỉ' | 'ǐ' | 'ȉ'
        | 'ȋ' | 'ị' | 'į' | 'ḭ' | 'ɨ' | 'ı' => acc.push('i'),
        'j' | 'ⓙ' | 'j' | 'ĵ' | 'ǰ' | 'ɉ' => acc.push('j'),
        'k' | 'ⓚ' | 'k' | 'ḱ' | 'ǩ' | 'ḳ' | 'ķ' | 'ḵ' | 'ƙ' | 'ⱪ' | 'ꝁ' | 'ꝃ' | 'ꝅ' | 'ꞣ' => {
            acc.push('k')
        }
        'l' | 'ⓛ' | 'l' | 'ŀ' | 'ĺ' | 'ľ' | 'ḷ' | 'ḹ' | 'ļ' | 'ḽ' | 'ḻ' | 'ſ' | 'ł' | 'ƚ'
        | 'ɫ' | 'ⱡ' | 'ꝉ' | 'ꞁ' | 'ꝇ' => acc.push('l'),
        'lj' => acc.push_str("lj"),
        'm' | 'ⓜ' | 'm' | 'ḿ' | 'ṁ' | 'ṃ' | 'ɱ' | 'ɯ' => acc.push('m'),
        'n' | 'ⓝ' | 'n' | 'ǹ' | 'ń' | 'ñ' | 'ṅ' | 'ň' | 'ṇ' | 'ņ' | 'ṋ' | 'ṉ' | 'ƞ' | 'ɲ'
        | 'ʼn' | 'ꞑ' | 'ꞥ' => acc.push('n'),
        'nj' => acc.push_str("nj"),
        'o' | 'ⓞ' | 'o' | 'ò' | 'ó' | 'ô' | 'ồ' | 'ố' | 'ỗ' | 'ổ' | 'õ' | 'ṍ' | 'ȭ' | 'ṏ'
        | 'ō' | 'ṑ' | 'ṓ' | 'ŏ' | 'ȯ' | 'ȱ' | 'ö' | 'ȫ' | 'ỏ' | 'ő' | 'ǒ' | 'ȍ' | 'ȏ' | 'ơ'
        | 'ờ' | 'ớ' | 'ỡ' | 'ở' | 'ợ' | 'ọ' | 'ộ' | 'ǫ' | 'ǭ' | 'ø' | 'ǿ' | 'ɔ' | 'ꝋ' | 'ꝍ'
        | 'ɵ' => acc.push('o'),
        'ƣ' => acc.push_str("oi"),
        'ȣ' => acc.push_str("ou"),
        'ꝏ' => acc.push_str("oo"),
        'p' | 'ⓟ' | 'p' | 'ṕ' | 'ṗ' | 'ƥ' | 'ᵽ' | 'ꝑ' | 'ꝓ' | 'ꝕ' => acc.push('p'),
        'q' | 'ⓠ' | 'q' | 'ɋ' | 'ꝗ' | 'ꝙ' => acc.push('q'),
        'r' | 'ⓡ' | 'r' | 'ŕ' | 'ṙ' | 'ř' | 'ȑ' | 'ȓ' | 'ṛ' | 'ṝ' | 'ŗ' | 'ṟ' | 'ɍ' | 'ɽ'
        | 'ꝛ' | 'ꞧ' | 'ꞃ' => acc.push('r'),
        's' | 'ⓢ' | 's' | 'ß' | 'ś' | 'ṥ' | 'ŝ' | 'ṡ' | 'š' | 'ṧ' | 'ṣ' | 'ṩ' | 'ș' | 'ş'
        | 'ȿ' | 'ꞩ' | 'ꞅ' | 'ẛ' => acc.push('s'),
        't' | 'ⓣ' | 't' | 'ṫ' | 'ẗ' | 'ť' | 'ṭ' | 'ț' | 'ţ' | 'ṱ' | 'ṯ' | 'ŧ' | 'ƭ' | 'ʈ'
        | 'ⱦ' | 'ꞇ' => acc.push('t'),
        'ꜩ' => acc.push_str("tz"),
        'u' | 'ⓤ' | 'u' | 'ù' | 'ú' | 'û' | 'ũ' | 'ṹ' | 'ū' | 'ṻ' | 'ŭ' | 'ü' | 'ǜ' | 'ǘ'
        | 'ǖ' | 'ǚ' | 'ủ' | 'ů' | 'ű' | 'ǔ' | 'ȕ' | 'ȗ' | 'ư' | 'ừ' | 'ứ' | 'ữ' | 'ử' | 'ự'
        | 'ụ' | 'ṳ' | 'ų' | 'ṷ' | 'ṵ' | 'ʉ' => acc.push('u'),
        'v' | 'ⓥ' | 'v' | 'ṽ' | 'ṿ' | 'ʋ' | 'ꝟ' | 'ʌ' => acc.push('v'),
        'ꝡ' => acc.push_str("vy"),
        'w' | 'ⓦ' | 'w' | 'ẁ' | 'ẃ' | 'ŵ' | 'ẇ' | 'ẅ' | 'ẘ' | 'ẉ' | 'ⱳ' => {
            acc.push('w')
        }
        'x' | 'ⓧ' | 'x' | 'ẋ' | 'ẍ' => acc.push('x'),
        'y' | 'ⓨ' | 'y' | 'ỳ' | 'ý' | 'ŷ' | 'ỹ' | 'ȳ' | 'ẏ' | 'ÿ' | 'ỷ' | 'ẙ' | 'ỵ' | 'ƴ'
        | 'ɏ' | 'ỿ' => acc.push('y'),
        'z' | 'ⓩ' | 'z' | 'ź' | 'ẑ' | 'ż' | 'ž' | 'ẓ' | 'ẕ' | 'ƶ' | 'ȥ' | 'ɀ' | 'ⱬ' | 'ꝣ' => {
            acc.push('z')
        }
        // Combining marks (the Combining Diacritical Marks block plus its Extended and
        // Supplement blocks) are dropped, so a decomposed letter keeps only its base character.
        '\u{0300}'..='\u{036F}' | '\u{1AB0}'..='\u{1AFF}' | '\u{1DC0}'..='\u{1DFF}' => {}
        // Anything not listed above passes through unchanged.
        _ => acc.push(current),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Uppercase letters carrying diacritics come back as plain capitals.
    #[test]
    fn test_uppercase() {
        let cleaned = remove_diacritics("TÅRÖÄÆØ");
        assert_eq!(cleaned, "TAROAAO");
    }

    /// Lowercase letters carrying diacritics come back as plain lowercase letters.
    #[test]
    fn test_lowercase() {
        let cleaned = remove_diacritics("čďêƒíó");
        assert_eq!(cleaned, "cdefio");
    }

    /// Accented letters collapse to the bare base letter.
    #[test]
    fn test_real_diacritics() {
        // Per the original author's note, the first literal is meant to be a base letter
        // followed by a combining mark rather than a single precomposed character.
        let from_literal = remove_diacritics("é");
        assert_eq!(from_literal, "e");

        // Same idea with the combining mark (U+0300) spelled out as an escape.
        let from_escape = remove_diacritics("e\u{300}");
        assert_eq!(from_escape, "e");
    }
}