japanese/
converter.rs

1//! Provides functions for converting between the different character sets in the Japanese language.
2//!
3//! ### Leniency
4//!
5//! Conversion functions assume a lenient approach in which the same `char` is returned if conversion can't be done.
6//!
7//! As an example, `convert_katakana_to_hiragana` expects a katakana character. If a non-katakana `char` is provided then the same `char` will be returned.
8//!
//! In conversion functions that work with strings, any offending chars are left untouched (i.e. they are included in the output string as is).
10
11use maplit::hashmap;
12use once_cell::sync::Lazy;
13use std::collections::HashMap;
14
15use crate::{charset, Vowel};
16
17struct TwoWayMap {
18    normal: HashMap<Vowel, char>,
19    reversed: HashMap<char, Vowel>,
20}
21
22impl TwoWayMap {
23    pub fn new(normal: HashMap<Vowel, char>) -> TwoWayMap {
24        let reversed: HashMap<_, _> = normal.iter().map(|(k, v)| (*v, *k)).collect();
25
26        TwoWayMap { normal, reversed }
27    }
28}
29
/// Distance in code points between a hiragana `char` and the katakana `char` at the same
/// position of its block (`'ア' - 'あ'`, i.e. `0x60`).
const HIRAGANA_KATAKANA_DIFF: u32 = ('ア' as u32) - ('あ' as u32);
31
32static VOWEL_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
33    TwoWayMap::new(hashmap! {
34        Vowel::A => 'あ',
35        Vowel::I => 'い',
36        Vowel::U => 'う',
37        Vowel::E => 'え',
38        Vowel::O => 'お',
39    })
40});
41static VOWEL_SMALL_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
42    TwoWayMap::new(hashmap! {
43        Vowel::A => 'ぁ',
44        Vowel::I => 'ぃ',
45        Vowel::U => 'ぅ',
46        Vowel::E => 'ぇ',
47        Vowel::O => 'ぉ',
48    })
49});
50static K_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
51    TwoWayMap::new(hashmap! {
52        Vowel::A => 'か',
53        Vowel::I => 'き',
54        Vowel::U => 'く',
55        Vowel::E => 'け',
56        Vowel::O => 'こ',
57    })
58});
59static G_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
60    TwoWayMap::new(hashmap! {
61        Vowel::A => 'が',
62        Vowel::I => 'ぎ',
63        Vowel::U => 'ぐ',
64        Vowel::E => 'げ',
65        Vowel::O => 'ご',
66    })
67});
68static S_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
69    TwoWayMap::new(hashmap! {
70        Vowel::A => 'さ',
71        Vowel::I => 'し',
72        Vowel::U => 'す',
73        Vowel::E => 'せ',
74        Vowel::O => 'そ',
75    })
76});
77static Z_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
78    TwoWayMap::new(hashmap! {
79        Vowel::A => 'ざ',
80        Vowel::I => 'じ',
81        Vowel::U => 'ず',
82        Vowel::E => 'ぜ',
83        Vowel::O => 'ぞ',
84    })
85});
86static T_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
87    TwoWayMap::new(hashmap! {
88        Vowel::A => 'た',
89        Vowel::I => 'ち',
90        Vowel::U => 'つ',
91        Vowel::E => 'て',
92        Vowel::O => 'と',
93    })
94});
95static D_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
96    TwoWayMap::new(hashmap! {
97        Vowel::A => 'だ',
98        Vowel::I => 'ぢ',
99        Vowel::U => 'づ',
100        Vowel::E => 'で',
101        Vowel::O => 'ど',
102    })
103});
104static N_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
105    TwoWayMap::new(hashmap! {
106        Vowel::A => 'な',
107        Vowel::I => 'に',
108        Vowel::U => 'ぬ',
109        Vowel::E => 'ね',
110        Vowel::O => 'の',
111    })
112});
113static H_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
114    TwoWayMap::new(hashmap! {
115        Vowel::A => 'は',
116        Vowel::I => 'ひ',
117        Vowel::U => 'ふ',
118        Vowel::E => 'へ',
119        Vowel::O => 'ほ',
120    })
121});
122static B_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
123    TwoWayMap::new(hashmap! {
124        Vowel::A => 'ば',
125        Vowel::I => 'び',
126        Vowel::U => 'ぶ',
127        Vowel::E => 'べ',
128        Vowel::O => 'ぼ',
129    })
130});
131static P_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
132    TwoWayMap::new(hashmap! {
133        Vowel::A => 'ぱ',
134        Vowel::I => 'ぴ',
135        Vowel::U => 'ぷ',
136        Vowel::E => 'ぺ',
137        Vowel::O => 'ぽ',
138    })
139});
140static M_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
141    TwoWayMap::new(hashmap! {
142        Vowel::A => 'ま',
143        Vowel::I => 'み',
144        Vowel::U => 'む',
145        Vowel::E => 'め',
146        Vowel::O => 'も',
147    })
148});
149static R_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
150    TwoWayMap::new(hashmap! {
151        Vowel::A => 'ら',
152        Vowel::I => 'り',
153        Vowel::U => 'る',
154        Vowel::E => 'れ',
155        Vowel::O => 'ろ',
156    })
157});
158static Y_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
159    TwoWayMap::new(hashmap! {
160        Vowel::A => 'や',
161        Vowel::U => 'ゆ',
162        Vowel::O => 'よ',
163    })
164});
165static Y_SMALL_MAP: Lazy<TwoWayMap> = Lazy::new(|| {
166    TwoWayMap::new(hashmap! {
167        Vowel::A => 'ゃ',
168        Vowel::U => 'ゅ',
169        Vowel::O => 'ょ',
170    })
171});
172static MAPS: Lazy<Vec<&'static TwoWayMap>> = Lazy::new(|| {
173    vec![
174        &VOWEL_MAP,
175        &VOWEL_SMALL_MAP,
176        &K_MAP,
177        &G_MAP,
178        &S_MAP,
179        &Z_MAP,
180        &T_MAP,
181        &D_MAP,
182        &N_MAP,
183        &H_MAP,
184        &B_MAP,
185        &P_MAP,
186        &M_MAP,
187        &R_MAP,
188        &Y_MAP,
189        &Y_SMALL_MAP,
190    ]
191});
192
193fn get_map_for_hiragana(hiragana: char) -> Option<&'static TwoWayMap> {
194    for map in MAPS.iter() {
195        if map.reversed.contains_key(&hiragana) {
196            return Some(map);
197        }
198    }
199
200    None
201}
202
203/// Gets the `Vowel` of the given hiragana `char`.
204pub fn get_vowel_for_hiragana(hiragana: char) -> Option<Vowel> {
205    let map = get_map_for_hiragana(hiragana)?;
206    map.reversed.get(&hiragana).copied()
207}
208
209/// Converts a hiragana `char` to another [Vowel] according to how agglutination works in stems.
210///
211/// This basically means we have to add special handling of わ.
212///
213/// One example is when the char is a vowel itself and we want to convert it to `Vowel::A`.
214/// In this case わ will be returned.
215///
216/// In case there's anything wrong in the process, the same provided `char` will be returned.
217pub fn convert_to_vowel_in_stem(hiragana: char, to_vowel: Vowel) -> char {
218    if !charset::is_hiragana(hiragana) {
219        return hiragana;
220    }
221
222    let is_special_wa = hiragana == 'わ';
223
224    // Special handling for わ in stems.
225    let map_option = if is_special_wa {
226        Some(&*VOWEL_MAP)
227    } else {
228        get_map_for_hiragana(hiragana)
229    };
230    let map = match map_option {
231        Some(v) => v,
232        None => return hiragana,
233    };
234
235    // If the chosen map is the vowel map and we want to change it to A we'll just return わ.
236    if std::ptr::eq(map, &*VOWEL_MAP) && to_vowel == Vowel::A {
237        return 'わ';
238    }
239
240    *map.normal.get(&to_vowel).unwrap()
241}
242
243/// Converts the given katakana `char` to hiragana.
244pub fn convert_katakana_to_hiragana(katakana: char) -> char {
245    if !charset::is_katakana(katakana) {
246        return katakana;
247    }
248    char::from_u32(katakana as u32 - HIRAGANA_KATAKANA_DIFF).unwrap_or(katakana)
249}
250
251/// Converts the given hiragana `char` to katakana.
252pub fn convert_hiragana_to_katakana(hiragana: char) -> char {
253    if !charset::is_hiragana(hiragana) {
254        return hiragana;
255    }
256    char::from_u32(hiragana as u32 + HIRAGANA_KATAKANA_DIFF).unwrap_or(hiragana)
257}
258
259/// Gets the prolonged hiragana `char` that's used when the preceding character has the given `Vowel`.
260fn get_prolonged_hiragana_for_vowel(vowel: Vowel) -> char {
261    match vowel {
262        Vowel::A => 'あ',
263        Vowel::I => 'い',
264        Vowel::U => 'う',
265        Vowel::E => 'い',
266        Vowel::O => 'う',
267    }
268}
269
270/// Converts the given katakana string to hiragana.
271///
272/// This takes care of 'ー' used in prolonged voices. i.e キョービ -> きょうび
273pub fn convert_katakana_to_hiragana_string(katakana: &str) -> String {
274    let mut hiragana_string = String::with_capacity(katakana.len());
275
276    let chars: Vec<_> = katakana.chars().collect();
277    for (i, ch) in chars.iter().copied().enumerate() {
278        let hiragana_ch = if charset::is_hiragana(ch) || !charset::is_katakana(ch) {
279            ch
280        } else if ch == 'ー' {
281            if i == 0 {
282                // Nothing before it, no need to convert.
283                ch
284            } else {
285                let previous_ch = chars[i - 1];
286
287                // There's special handling for ワ and ヲ.
288                match previous_ch {
289                    'ワ' => 'あ',
290                    'ヮ' => 'ぁ',
291                    'ヲ' => 'お',
292                    _ => {
293                        // Standard case.
294                        let vowel =
295                            get_vowel_for_hiragana(convert_katakana_to_hiragana(previous_ch));
296                        if let Some(vowel) = vowel {
297                            // A valid converted hiragana that we the vowel of.
298                            get_prolonged_hiragana_for_vowel(vowel)
299                        } else {
300                            // We don't know how to do this, return the same character.
301                            ch
302                        }
303                    }
304                }
305            }
306        } else {
307            convert_katakana_to_hiragana(ch)
308        };
309        hiragana_string.push(hiragana_ch);
310    }
311
312    hiragana_string
313}
314
315/// Converts the given hiragana string to katakana.
316///
317/// This **does not** change long voices into 'ー'. i.e きょうび -> キョウビ
318pub fn convert_hiragana_to_katakana_string(hiragana: &str) -> String {
319    let mut katakana_string = String::with_capacity(hiragana.len());
320
321    let chars: Vec<_> = hiragana.chars().collect();
322    for ch in chars.iter().copied() {
323        let katakana_ch = if charset::is_katakana(ch) || !charset::is_hiragana(ch) {
324            ch
325        } else {
326            convert_hiragana_to_katakana(ch)
327        };
328        katakana_string.push(katakana_ch);
329    }
330
331    katakana_string
332}
333
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::*;

    // Stem conversion, including the わ special cases and the lenient fallbacks
    // (non-hiragana input and hiragana that belongs to no known row).
    #[rstest]
    #[case('わ', Vowel::I, 'い')]
    #[case('い', Vowel::A, 'わ')]
    #[case('き', Vowel::A, 'か')]
    #[case('し', Vowel::E, 'せ')]
    #[case('a', Vowel::A, 'a')]
    #[case('ん', Vowel::A, 'ん')]
    fn convert_vowel_in_stem_test(
        #[case] hiragana: char,
        #[case] to_vowel: Vowel,
        #[case] expected: char,
    ) {
        assert_eq!(expected, convert_to_vowel_in_stem(hiragana, to_vowel));
    }

    #[rstest]
    fn convert_katakana_to_hiragana_converts_valid_char() {
        assert_eq!('も', convert_katakana_to_hiragana('モ'));
    }

    #[rstest]
    fn convert_katakana_to_hiragana_returns_same_char_if_invalid() {
        assert_eq!('a', convert_katakana_to_hiragana('a'));
    }

    #[rstest]
    fn convert_hiragana_to_katakana_converts_valid_char() {
        assert_eq!('モ', convert_hiragana_to_katakana('も'));
    }

    #[rstest]
    fn convert_hiragana_to_katakana_returns_same_char_if_invalid() {
        assert_eq!('a', convert_hiragana_to_katakana('a'));
    }

    #[rstest]
    #[case("モン", "もん")]
    #[case("キヨウビ", "きようび")]
    #[case("キョウビ", "きょうび")]
    #[case("キヨービ", "きようび")]
    #[case("キョービ", "きょうび")]
    #[case("キープ", "きいぷ")]
    #[case("チームワーク", "ちいむわあく")]
    #[case("ヲー", "をお")]
    #[case("ー", "ー")]
    fn convert_katakana_to_hiragana_string_test(#[case] katakana: &str, #[case] expected: &str) {
        assert_eq!(expected, convert_katakana_to_hiragana_string(katakana));
    }

    #[rstest]
    #[case("チームワーク")]
    #[case("ヲー")]
    #[case("ー")]
    fn convert_katakana_to_hiragana_string_does_not_panic(#[case] katakana: &str) {
        convert_katakana_to_hiragana_string(katakana);
    }

    // The input here is hiragana (possibly mixed with katakana), the expectation is katakana.
    #[rstest]
    #[case("もん", "モン")]
    #[case("ひトリ", "ヒトリ")]
    #[case("きようび", "キヨウビ")]
    fn convert_hiragana_to_katakana_string_test(#[case] hiragana: &str, #[case] expected: &str) {
        assert_eq!(expected, convert_hiragana_to_katakana_string(hiragana));
    }
}