1use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
39
/// Lower-case x-system digraphs used as search patterns by `x_system_to_utf8`.
///
/// That function builds its matcher with ASCII case-insensitivity enabled, so
/// only the lower-case spelling of each digraph needs to be listed here.
const FROM_X_CI: &[&str] = &[
    "cx",
    "gx",
    "hx",
    "jx",
    "sx",
    "ux",
];
44
/// Accented Esperanto letters, the six lower-case forms followed by the six
/// upper-case forms.
///
/// Used as the search patterns by `utf8_to_x_system` and `utf8_to_h_system`.
const FROM_UTF8: &[&str] = &[
    // Lower case.
    "ĉ", "ĝ", "ĥ",
    "ĵ", "ŝ", "ŭ",
    // Upper case.
    "Ĉ", "Ĝ", "Ĥ",
    "Ĵ", "Ŝ", "Ŭ",
];
52
/// Search patterns for `h_system_to_utf8`.
///
/// That function matches these ASCII-case-insensitively with leftmost-longest
/// semantics. It only has replacements for the two-letter entries at the end
/// of the list; every longer entry is copied to the output unchanged, which
/// protects the `ch`/`gh`/`hh`/`jh`/`sh`/`au` sequence inside it from being
/// converted.
const FROM_H_CI: &[&str] = &[
    // Word stems containing a consonant followed by `h` that must be left
    // alone (presumably because the two letters are separate sounds there).
    "komenchor", "kuracherb", "potenchav", "prononchelp",
    "senchav", "pruchelp", "drogherb", "flughaven",
    "longhar", "lesvigholstini", "vanghar", "gajhumor",
    "amashisteri", "tobushaltej", "bushaltej", "ashund",
    "dishak", "disharmoni", "dishelig", "dishirtig",
    "fikshejm", "grashav", "grashepata", "invershav",
    "kashal", "misharmoni", "mishelp", "mishumor",
    "neinvershav", "plushor", "sekshontem", "seshektar",
    "seshor", "sukceshav",

    // Word stems containing `au` that must be left alone.
    "blankaurs", "doganauni", "ropauni", "grandaursin",
    "imaginaraunu", "kakauj", "malgrandaursin", "matricaunu",
    "naur", "praul", "saudaarabuj", "tiaul",
    "traurb", "unuaul",

    // The h-system digraphs themselves.
    "ch", "gh", "hh", "jh", "sh",

    // `au`, whose `u` is converted to `ŭ`.
    "au",
];
81
82pub fn utf8_to_x_system(s: &str) -> String {
84 let ac = AhoCorasick::new(FROM_UTF8);
85 let mut result = String::new();
86 ac.replace_all_with(s, &mut result, |m, found, dst| {
87 let leading_capital = match dst.chars().rev().next() {
88 Some(c) if c.is_uppercase() => false,
89 Some(_) => true,
90 None => true,
91 };
92 let (_, tail) = s.split_at(m.end());
93 let capital_follows = match tail.chars().next() {
94 Some(c) if c.is_uppercase() => true,
95 Some(_) => false,
96 None => false,
97 };
98 dst.push_str(match found {
99 "ĉ" => "cx",
100 "ĝ" => "gx",
101 "ĥ" => "hx",
102 "ĵ" => "jx",
103 "ŝ" => "sx",
104 "ŭ" => "ux",
105 other => match (other, leading_capital && !capital_follows) {
106 ("Ĉ", false) => "CX",
107 ("Ĝ", false) => "GX",
108 ("Ĥ", false) => "HX",
109 ("Ĵ", false) => "JX",
110 ("Ŝ", false) => "SX",
111 ("Ŭ", false) => "UX",
112 ("Ĉ", true) => "Cx",
113 ("Ĝ", true) => "Gx",
114 ("Ĥ", true) => "Hx",
115 ("Ĵ", true) => "Jx",
116 ("Ŝ", true) => "Sx",
117 ("Ŭ", true) => "Ux",
118 _ => other,
119 }
120 });
121 true
122 });
123 result
124}
125
126pub fn utf8_to_h_system(s: &str) -> String {
128 let ac = AhoCorasick::new(FROM_UTF8);
129 let mut result = String::new();
130 ac.replace_all_with(s, &mut result, |m, found, dst| {
131 let leading_capital = match dst.chars().rev().next() {
132 Some(c) if c.is_uppercase() => false,
133 Some(_) => true,
134 None => true,
135 };
136 let (_, tail) = s.split_at(m.end());
137 let capital_follows = match tail.chars().next() {
138 Some(c) if c.is_uppercase() => true,
139 Some(_) => false,
140 None => false,
141 };
142 dst.push_str(match found {
143 "ĉ" => "ch",
144 "ĝ" => "gh",
145 "ĥ" => "hh",
146 "ĵ" => "jh",
147 "ŝ" => "sh",
148 "ŭ" => "u",
149 other => match (other, leading_capital && !capital_follows) {
150 ("Ĉ", false) => "CH",
151 ("Ĝ", false) => "GH",
152 ("Ĥ", false) => "HH",
153 ("Ĵ", false) => "JH",
154 ("Ŝ", false) => "SH",
155 ("Ŭ", false) => "U",
156 ("Ĉ", true) => "Ch",
157 ("Ĝ", true) => "Gh",
158 ("Ĥ", true) => "Hh",
159 ("Ĵ", true) => "Jh",
160 ("Ŝ", true) => "Sh",
161 ("Ŭ", true) => "U",
162 _ => other,
163 }
164 });
165 true
166 });
167 result
168}
169
170pub fn x_system_to_utf8(s: &str) -> String {
172 let ac = AhoCorasickBuilder::new()
173 .ascii_case_insensitive(true)
174 .build(FROM_X_CI);
175 let mut result = String::new();
176 ac.replace_all_with(s, &mut result, |_, found, dst| {
177 dst.push_str(match found {
178 "cx" => "ĉ",
179 "gx" => "ĝ",
180 "hx" => "ĥ",
181 "jx" => "ĵ",
182 "sx" => "ŝ",
183 "ux" => "ŭ",
184 "CX" | "Cx" | "cX" => "Ĉ",
185 "GX" | "Gx" | "gX" => "Ĝ",
186 "HX" | "Hx" | "hX" => "Ĥ",
187 "JX" | "Jx" | "jX" => "Ĵ",
188 "SX" | "Sx" | "sX" => "Ŝ",
189 "UX" | "Ux" | "uX" => "Ŭ",
190 _ => found,
191 });
192 true
193 });
194 result
195}
196
197pub fn h_system_to_utf8(s: &str) -> String {
199 let ac = AhoCorasickBuilder::new()
200 .ascii_case_insensitive(true)
201 .match_kind(MatchKind::LeftmostLongest)
202 .build(FROM_H_CI);
203 let mut result = String::new();
204 ac.replace_all_with(s, &mut result, |_, found, dst| {
205 dst.push_str(match found {
206 "ch" => "ĉ",
207 "gh" => "ĝ",
208 "hh" => "ĥ",
209 "jh" => "ĵ",
210 "sh" => "ŝ",
211 "au" => "aŭ",
212 "CH" | "Ch" | "cH" => "Ĉ",
213 "GH" | "Gh" | "gH" => "Ĝ",
214 "HH" | "Hh" | "hH" => "Ĥ",
215 "JH" | "Jh" | "jH" => "Ĵ",
216 "SH" | "Sh" | "sH" => "Ŝ",
217 "AU" => "AŬ",
218 "Au" => "Aŭ",
219 "aU" => "aŬ",
220 _ => found,
224 });
225 true
226 });
227 result
228}
229
#[cfg(test)]
mod tests {
    use super::*;

    /// English text with no accented letters and none of the x-system or
    /// h-system patterns; every converter must return it unchanged.
    const PANGRAM: &str = "The quick brown fox jumps over the lazy dog. And my axe.";

    /// The same sample text in each of the three spellings.
    const ACCENTED: &str = "eĥoŝanĝo ĉiuĵaŭde EĤOŜANĜO ĈIUĴAŬDE";
    const X_SPELLING: &str = "ehxosxangxo cxiujxauxde EHXOSXANGXO CXIUJXAUXDE";
    const H_SPELLING: &str = "ehhoshangho chiujhaude EHHOSHANGHO CHIUJHAUDE";

    /// Capitalised word, lone capitals, and capitals followed by a capital.
    const CAPITALS: &str = "Ĉiuj estas belaj. Ĥ Ŝ Ĝ Ĉ Ĵ Ŭ ĤO ŜO ĜO ĈO ĴO ŬO";

    // ---- x-system -> UTF-8 ----

    #[test]
    fn test_x_system_to_utf8_noop() {
        assert_eq!(PANGRAM, x_system_to_utf8(PANGRAM));
    }

    #[test]
    fn test_x_system_to_utf8_echo_change() {
        assert_eq!(x_system_to_utf8(X_SPELLING), ACCENTED);
    }

    #[test]
    fn test_x_system_to_utf8_mixed_case() {
        let converted = x_system_to_utf8("eHxoSxanGxo CxiuJxaUxde ehXosXangXo cXiujXauXde");
        assert_eq!(converted, "eĤoŜanĜo ĈiuĴaŬde eĤoŜanĜo ĈiuĴaŬde");
    }

    // ---- UTF-8 -> x-system ----

    #[test]
    fn test_utf8_to_x_system_noop() {
        assert_eq!(PANGRAM, utf8_to_x_system(PANGRAM));
    }

    #[test]
    fn test_utf8_to_x_system_echo_change() {
        assert_eq!(utf8_to_x_system(ACCENTED), X_SPELLING);
    }

    // ---- UTF-8 -> h-system ----

    #[test]
    fn test_utf8_to_h_system_noop() {
        assert_eq!(PANGRAM, utf8_to_h_system(PANGRAM));
    }

    #[test]
    fn test_utf8_to_h_system_echo_change() {
        assert_eq!(utf8_to_h_system(ACCENTED), H_SPELLING);
    }

    // ---- h-system -> UTF-8 ----

    #[test]
    fn test_h_system_to_utf8_noop() {
        assert_eq!(PANGRAM, h_system_to_utf8(PANGRAM));
    }

    #[test]
    fn test_h_system_to_utf8_echo_change() {
        assert_eq!(h_system_to_utf8(H_SPELLING), ACCENTED);
    }

    #[test]
    fn test_h_system_to_utf8_mixed_case() {
        let converted = h_system_to_utf8("eHhoShanGho ChiuJhAUde ehHosHangHo cHiujHaUde");
        assert_eq!(converted, "eĤoŜanĜo ĈiuĴAŬde eĤoŜanĜo ĈiuĴaŬde");
    }

    /// `senchav` is on the pass-through list, so its `ch` must survive.
    #[test]
    fn test_h_system_ambiguous_h() {
        let converted = h_system_to_utf8("Chiuj estas senchavaj ideoj.");
        assert_eq!(converted, "Ĉiuj estas senchavaj ideoj.");
    }

    /// `naur` is on the pass-through list, so its `au` must survive.
    #[test]
    fn test_h_system_ambiguous_u() {
        let converted = h_system_to_utf8("Hierau mi vizitis Nauron.");
        assert_eq!(converted, "Hieraŭ mi vizitis Nauron.");
    }

    // ---- capitalisation of the generated digraphs ----

    #[test]
    fn test_leading_capital_x_system() {
        assert_eq!(
            utf8_to_x_system(CAPITALS),
            "Cxiuj estas belaj. Hx Sx Gx Cx Jx Ux HXO SXO GXO CXO JXO UXO"
        );
    }

    #[test]
    fn test_leading_capital_h_system() {
        assert_eq!(
            utf8_to_h_system(CAPITALS),
            "Chiuj estas belaj. Hh Sh Gh Ch Jh U HHO SHO GHO CHO JHO UO"
        );
    }
}