esperanto_text/
lib.rs

1/*!
2Convert Esperanto text between UTF-8, x-system and h-system transliterations.
3
4When correctly printed, Esperanto text has various diacritics that can be
5properly represented in UTF-8. Those who are limited to ASCII or are unable
6to type these characters often resort to the "h-system" or "x-system". In
7these, a suffix is added to those letters which should have a diacritic.
8
9This crate provides convenience functions for converting a string from one
10transliteration to another. For the x-system this can be done with complete
11accuracy as there is no ambiguity. For the h-system, a small vocabulary list
12is used to avoid changing the meaning of real words.
13
14A binary called `eotext` is included to use these functions from a CLI.
15
16# Example: UTF-8 to x-system
17
18```
19let input = "eĥoŝanĝo ĉiuĵaŭde";
20assert_eq!(
21    esperanto_text::utf8_to_x_system(input),
22    "ehxosxangxo cxiujxauxde".to_owned(),
23);
24```
25
26# Example: h-system to UTF-8
27
28```
29let input = "Chiuj estas senchavaj kaj taugaj ideoj.";
30assert_eq!(
31    esperanto_text::h_system_to_utf8(input),
32    "Ĉiuj estas senchavaj kaj taŭgaj ideoj.".to_owned(),
33);
34```
35
36*/
37
38use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
39
/// Lowercase x-system digraphs searched for in x-system input.
///
/// Only the lowercase spelling is listed: `x_system_to_utf8` builds its
/// automaton from this table with ASCII case-insensitivity switched on.
const FROM_X_CI: &[&str] = &[
    "cx", // ĉ
    "gx", // ĝ
    "hx", // ĥ
    "jx", // ĵ
    "sx", // ŝ
    "ux", // ŭ
];
44
/// Accented Esperanto letters searched for in UTF-8 input.
///
/// Both cases are spelled out explicitly because AhoCorasick's
/// case-insensitive mode is ASCII-only and would not pair them up.
const FROM_UTF8: &[&str] = &[
    // Lowercase forms
    "ĉ", "ĝ", "ĥ", "ĵ", "ŝ", "ŭ",
    // Uppercase forms
    "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ",
];
52
/// Patterns searched for in h-system input (matched case-insensitively).
///
/// The table mixes two kinds of entries:
///
/// * the actual h-system digraphs (and "au"), which get transliterated, and
/// * longer word fragments in which an "h" or "au" is a genuine part of the
///   word. `h_system_to_utf8` copies these through unchanged instead of
///   blindly turning "something+h" into an accented letter.
///
/// Where a comment such as `(ŝ)` precedes a fragment, it names the accented
/// letter(s) the full word starts with; that part is not included in the
/// fragment, so it is still open to transliteration.
const FROM_H_CI: &[&str] = &[
    // --- Fragments whose "h" must be left alone ---
    "komenchor", "kuracherb", "potenchav",
    "prononchelp", "senchav",
    /* (ŝ) */ "pruchelp",
    "drogherb", "flughaven", "longhar",
    /* (ŝ) */ "lesvigholstini",
    "vanghar", "gajhumor", "amashisteri",
    /* (aŭ) */ "tobushaltej",
    "bushaltej",
    /* (ĉ) */ "ashund",
    "dishak", "disharmoni", "dishelig", "dishirtig",
    "fikshejm", "grashav", "grashepata", "invershav",
    "kashal", "misharmoni", "mishelp", "mishumor",
    "neinvershav", "plushor", "sekshontem", "seshektar",
    "seshor", "sukceshav",

    // --- Fragments whose "au" (no breve) must be left alone ---
    "blankaurs", "doganauni",
    /* (eŭ) */ "ropauni",
    "grandaursin", "imaginaraunu", "kakauj",
    "malgrandaursin", "matricaunu",
    "naur", "praul", "saudaarabuj",
    "tiaul", "traurb", "unuaul",

    // --- Ordinary digraphs that are transliterated ---
    "ch", "gh", "hh", "jh", "sh",

    // In most situations this is meant to become "aŭ"
    "au",
];
81
82/// Convert UTF-8 "ĵaŭdo" to x-system "jxauxdo"
83pub fn utf8_to_x_system(s: &str) -> String {
84    let ac = AhoCorasick::new(FROM_UTF8);
85    let mut result = String::new();
86    ac.replace_all_with(s, &mut result, |m, found, dst| {
87        let leading_capital = match dst.chars().rev().next() {
88            Some(c) if c.is_uppercase() => false,
89            Some(_) => true,
90            None => true,
91        };
92        let (_, tail) = s.split_at(m.end());
93        let capital_follows = match tail.chars().next() {
94            Some(c) if c.is_uppercase() => true,
95            Some(_) => false,
96            None => false,
97        };
98        dst.push_str(match found {
99            "ĉ" => "cx",
100            "ĝ" => "gx",
101            "ĥ" => "hx",
102            "ĵ" => "jx",
103            "ŝ" => "sx",
104            "ŭ" => "ux",
105            other => match (other, leading_capital && !capital_follows) {
106                ("Ĉ", false) => "CX",
107                ("Ĝ", false) => "GX",
108                ("Ĥ", false) => "HX",
109                ("Ĵ", false) => "JX",
110                ("Ŝ", false) => "SX",
111                ("Ŭ", false) => "UX",
112                ("Ĉ", true) => "Cx",
113                ("Ĝ", true) => "Gx",
114                ("Ĥ", true) => "Hx",
115                ("Ĵ", true) => "Jx",
116                ("Ŝ", true) => "Sx",
117                ("Ŭ", true) => "Ux",
118                _ => other,
119            }
120        });
121        true
122    });
123    result
124}
125
126/// Convert UTF-8 "ĵaŭdo" to h-system "jhaudo"
127pub fn utf8_to_h_system(s: &str) -> String {
128    let ac = AhoCorasick::new(FROM_UTF8);
129    let mut result = String::new();
130    ac.replace_all_with(s, &mut result, |m, found, dst| {
131        let leading_capital = match dst.chars().rev().next() {
132            Some(c) if c.is_uppercase() => false,
133            Some(_) => true,
134            None => true,
135        };
136        let (_, tail) = s.split_at(m.end());
137        let capital_follows = match tail.chars().next() {
138            Some(c) if c.is_uppercase() => true,
139            Some(_) => false,
140            None => false,
141        };
142        dst.push_str(match found {
143            "ĉ" => "ch",
144            "ĝ" => "gh",
145            "ĥ" => "hh",
146            "ĵ" => "jh",
147            "ŝ" => "sh",
148            "ŭ" => "u",
149            other => match (other, leading_capital && !capital_follows) {
150                ("Ĉ", false) => "CH",
151                ("Ĝ", false) => "GH",
152                ("Ĥ", false) => "HH",
153                ("Ĵ", false) => "JH",
154                ("Ŝ", false) => "SH",
155                ("Ŭ", false) => "U",
156                ("Ĉ", true) => "Ch",
157                ("Ĝ", true) => "Gh",
158                ("Ĥ", true) => "Hh",
159                ("Ĵ", true) => "Jh",
160                ("Ŝ", true) => "Sh",
161                ("Ŭ", true) => "U",
162                _ => other,
163            }
164        });
165        true
166    });
167    result
168}
169
170/// Convert x-system "jxauxdo" to UTF-8 "ĵaŭdo"
171pub fn x_system_to_utf8(s: &str) -> String {
172    let ac = AhoCorasickBuilder::new()
173        .ascii_case_insensitive(true)
174        .build(FROM_X_CI);
175    let mut result = String::new();
176    ac.replace_all_with(s, &mut result, |_, found, dst| {
177        dst.push_str(match found {
178            "cx" => "ĉ",
179            "gx" => "ĝ",
180            "hx" => "ĥ",
181            "jx" => "ĵ",
182            "sx" => "ŝ",
183            "ux" => "ŭ",
184            "CX" | "Cx" | "cX" => "Ĉ",
185            "GX" | "Gx" | "gX" => "Ĝ",
186            "HX" | "Hx" | "hX" => "Ĥ",
187            "JX" | "Jx" | "jX" => "Ĵ",
188            "SX" | "Sx" | "sX" => "Ŝ",
189            "UX" | "Ux" | "uX" => "Ŭ",
190            _ => found,
191        });
192        true
193    });
194    result
195}
196
197/// Convert h-system "jhaudo" to UTF-8 "ĵaŭdo"
198pub fn h_system_to_utf8(s: &str) -> String {
199    let ac = AhoCorasickBuilder::new()
200        .ascii_case_insensitive(true)
201        .match_kind(MatchKind::LeftmostLongest)
202        .build(FROM_H_CI);
203    let mut result = String::new();
204    ac.replace_all_with(s, &mut result, |_, found, dst| {
205        dst.push_str(match found {
206            "ch" => "ĉ",
207            "gh" => "ĝ",
208            "hh" => "ĥ",
209            "jh" => "ĵ",
210            "sh" => "ŝ",
211            "au" => "aŭ",
212            "CH" | "Ch" | "cH" => "Ĉ",
213            "GH" | "Gh" | "gH" => "Ĝ",
214            "HH" | "Hh" | "hH" => "Ĥ",
215            "JH" | "Jh" | "jH" => "Ĵ",
216            "SH" | "Sh" | "sH" => "Ŝ",
217            "AU" => "AŬ",
218            "Au" => "Aŭ",
219            "aU" => "aŬ",
220            // all the word fragments go through with existing casing
221            // and without messing up the legitimate usage of "h"
222            // or the legitimate usage of "au"
223            _ => found,
224        });
225        true
226    });
227    result
228}
229
#[cfg(test)]
mod tests {
    use super::*;

    /// English sentence that every conversion is expected to return unchanged.
    const ENGLISH: &str = "The quick brown fox jumps over the lazy dog. And my axe.";

    /// Run `convert` on `input` and compare the result with `expected`.
    fn check(convert: fn(&str) -> String, input: &str, expected: &str) {
        assert_eq!(convert(input), expected);
    }

    // ---- x-system -> UTF-8 ----

    #[test]
    fn test_x_system_to_utf8_noop() {
        check(x_system_to_utf8, ENGLISH, ENGLISH);
    }

    #[test]
    fn test_x_system_to_utf8_echo_change() {
        check(
            x_system_to_utf8,
            "ehxosxangxo cxiujxauxde EHXOSXANGXO CXIUJXAUXDE",
            "eĥoŝanĝo ĉiuĵaŭde EĤOŜANĜO ĈIUĴAŬDE",
        );
    }

    #[test]
    fn test_x_system_to_utf8_mixed_case() {
        check(
            x_system_to_utf8,
            "eHxoSxanGxo CxiuJxaUxde ehXosXangXo cXiujXauXde",
            "eĤoŜanĜo ĈiuĴaŬde eĤoŜanĜo ĈiuĴaŬde",
        );
    }

    // ---- UTF-8 -> x-system ----

    #[test]
    fn test_utf8_to_x_system_noop() {
        check(utf8_to_x_system, ENGLISH, ENGLISH);
    }

    #[test]
    fn test_utf8_to_x_system_echo_change() {
        check(
            utf8_to_x_system,
            "eĥoŝanĝo ĉiuĵaŭde EĤOŜANĜO ĈIUĴAŬDE",
            "ehxosxangxo cxiujxauxde EHXOSXANGXO CXIUJXAUXDE",
        );
    }

    // ---- UTF-8 -> h-system ----

    #[test]
    fn test_utf8_to_h_system_noop() {
        check(utf8_to_h_system, ENGLISH, ENGLISH);
    }

    #[test]
    fn test_utf8_to_h_system_echo_change() {
        check(
            utf8_to_h_system,
            "eĥoŝanĝo ĉiuĵaŭde EĤOŜANĜO ĈIUĴAŬDE",
            "ehhoshangho chiujhaude EHHOSHANGHO CHIUJHAUDE",
        );
    }

    // ---- h-system -> UTF-8 ----

    #[test]
    fn test_h_system_to_utf8_noop() {
        check(h_system_to_utf8, ENGLISH, ENGLISH);
    }

    #[test]
    fn test_h_system_to_utf8_echo_change() {
        check(
            h_system_to_utf8,
            "ehhoshangho chiujhaude EHHOSHANGHO CHIUJHAUDE",
            "eĥoŝanĝo ĉiuĵaŭde EĤOŜANĜO ĈIUĴAŬDE",
        );
    }

    #[test]
    fn test_h_system_to_utf8_mixed_case() {
        check(
            h_system_to_utf8,
            "eHhoShanGho ChiuJhAUde ehHosHangHo cHiujHaUde",
            "eĤoŜanĜo ĈiuĴAŬde eĤoŜanĜo ĈiuĴaŬde",
        );
    }

    /// "senchavaj" contains a real "ch" that must not become "ĉ".
    #[test]
    fn test_h_system_ambiguous_h() {
        check(
            h_system_to_utf8,
            "Chiuj estas senchavaj ideoj.",
            "Ĉiuj estas senchavaj ideoj.",
        );
    }

    /// "Nauron" contains a real "au" that must not become "aŭ".
    #[test]
    fn test_h_system_ambiguous_u() {
        check(
            h_system_to_utf8,
            "Hierau mi vizitis Nauron.",
            "Hieraŭ mi vizitis Nauron.",
        );
    }

    // ---- Capitalisation of the generated digraphs ----

    #[test]
    fn test_leading_capital_x_system() {
        check(
            utf8_to_x_system,
            "Ĉiuj estas belaj. Ĥ Ŝ Ĝ Ĉ Ĵ Ŭ ĤO ŜO ĜO ĈO ĴO ŬO",
            "Cxiuj estas belaj. Hx Sx Gx Cx Jx Ux HXO SXO GXO CXO JXO UXO",
        );
    }

    #[test]
    fn test_leading_capital_h_system() {
        check(
            utf8_to_h_system,
            "Ĉiuj estas belaj. Ĥ Ŝ Ĝ Ĉ Ĵ Ŭ ĤO ŜO ĜO ĈO ĴO ŬO",
            "Chiuj estas belaj. Hh Sh Gh Ch Jh U HHO SHO GHO CHO JHO UO",
        );
    }
}