jpreprocess/
normalize_text.rs

1use phf::{phf_map, phf_set, Map, Set};
2
3/// Normalize input text
4pub fn normalize_text_for_naist_jdic(input_text: &str) -> String {
5    let (mut s, c) = input_text
6        .chars()
7        .map(|c| {
8            if let Some(replacement) = HALFWIDTH.get(&c) {
9                *replacement
10            } else if '\u{0020}' < c && c < '\u{007f}' {
11                char::from_u32((c as u32) + 0xfee0).unwrap()
12            } else {
13                c
14            }
15        })
16        .fold(
17            (String::with_capacity(input_text.len()), None),
18            |(mut acc, prev), curr| {
19                let semivoiced = SEMIVOICED_SOUND_MARK.contains(&curr);
20                let voiced = VOICED_SOUND_MARK.contains(&curr);
21
22                let combined = if semivoiced {
23                    prev.and_then(|p| SEMIVOICED.get(&p))
24                } else if voiced {
25                    prev.and_then(|p| VOICED.get(&p))
26                } else {
27                    None
28                };
29
30                if let Some(combined) = combined {
31                    acc.push(*combined);
32                } else if let Some(prev_char) = prev {
33                    acc.push(prev_char);
34                }
35
36                if semivoiced || voiced {
37                    (acc, None)
38                } else {
39                    (acc, Some(curr))
40                }
41            },
42        );
43
44    if let Some(c) = c {
45        s.push(c);
46    }
47    s
48}
49
50const HALFWIDTH: Map<char, char> = phf_map! {
51    // Symbols
52    ' ' => '\u{3000}', //   U+3000 Ideographic Space
53    '\u{a5}' => '\u{FFE5}', // ¥ U+FFE5 Fullwidth Yen Sign
54    '\\' => '\u{FFE5}', // ¥ U+FFE5 Fullwidth Yen Sign
55    '-' => '\u{2212}', // − U+2212 MINUS SIGN
56    '~' => '\u{301C}', // 〜 U+301C WAVE DASH
57    '`' => '\u{2018}', // ‘ U+2018 LEFT SINGLE QUOTATION MARK
58    '\"' => '\u{201D}', // ” U+201D RIGHT DOUBLE QUOTATION MARK
59    '\'' => '\u{2019}', // ’ U+2019 RIGHT SINGLE QUOTATION MARK
60    // Halfwidth japanese symbols
61    '\u{FF61}' => '\u{3002}', // 。 U+3002 Ideographic Full Stop
62    '\u{FF62}' => '\u{300C}', // 「 U+300C Left Corner Bracket Ideographic Full Stop
63    '\u{FF63}' => '\u{300D}', // 」 U+300D Right Corner Bracket
64    '\u{FF64}' => '\u{3001}', // 、 U+3001 Ideographic Comma
65    '\u{FF65}' => '\u{30FB}', // ・ U+30FB Katakana Middle Dot
66    // Katakana
67    'ヲ' => 'ヲ',
68    'ァ' => 'ァ',
69    'ィ' => 'ィ',
70    'ゥ' => 'ゥ',
71    'ェ' => 'ェ',
72    'ォ' => 'ォ',
73    'ャ' => 'ャ',
74    'ュ' => 'ュ',
75    'ョ' => 'ョ',
76    'ッ' => 'ッ',
77    'ー' => 'ー',
78    'ア' => 'ア',
79    'イ' => 'イ',
80    'ウ' => 'ウ',
81    'エ' => 'エ',
82    'オ' => 'オ',
83    'カ' => 'カ',
84    'キ' => 'キ',
85    'ク' => 'ク',
86    'ケ' => 'ケ',
87    'コ' => 'コ',
88    'サ' => 'サ',
89    'シ' => 'シ',
90    'ス' => 'ス',
91    'セ' => 'セ',
92    'ソ' => 'ソ',
93    'タ' => 'タ',
94    'チ' => 'チ',
95    'ツ' => 'ツ',
96    'テ' => 'テ',
97    'ト' => 'ト',
98    'ナ' => 'ナ',
99    'ニ' => 'ニ',
100    'ヌ' => 'ヌ',
101    'ネ' => 'ネ',
102    'ノ' => 'ノ',
103    'ハ' => 'ハ',
104    'ヒ' => 'ヒ',
105    'フ' => 'フ',
106    'ヘ' => 'ヘ',
107    'ホ' => 'ホ',
108    'マ' => 'マ',
109    'ミ' => 'ミ',
110    'ム' => 'ム',
111    'メ' => 'メ',
112    'モ' => 'モ',
113    'ヤ' => 'ヤ',
114    'ユ' => 'ユ',
115    'ヨ' => 'ヨ',
116    'ラ' => 'ラ',
117    'リ' => 'リ',
118    'ル' => 'ル',
119    'レ' => 'レ',
120    'ロ' => 'ロ',
121    'ワ' => 'ワ',
122    'ン' => 'ン',
123};
124
125const SEMIVOICED_SOUND_MARK: Set<char> = phf_set! {
126    '\u{309A}', // U+309A Combining Katakana-Hiragana Semi-Voiced Sound Mark
127    '\u{309C}', // U+309C Katakana-Hiragana Semi-Voiced Sound Mark
128    '\u{FF9F}', // U+FF9F Halfwidth Katakana Semi-Voiced Sound Mark
129};
130const SEMIVOICED: Map<char, char> = phf_map! {
131    'ハ' => 'パ',
132    'ヒ' => 'ピ',
133    'フ' => 'プ',
134    'ヘ' => 'ペ',
135    'ホ' => 'ポ',
136    'は' => 'ぱ',
137    'ひ' => 'ぴ',
138    'ふ' => 'ぷ',
139    'へ' => 'ぺ',
140    'ほ' => 'ぽ',
141};
142
143const VOICED_SOUND_MARK: Set<char> = phf_set! {
144    '\u{3099}', // U+3099 Combining Katakana-Hiragana Voiced Sound Mark
145    '\u{309B}', // U+309B Katakana-Hiragana Voiced Sound Mark
146    '\u{FF9E}', // U+FF9E Halfwidth Katakana Voiced Sound Mark
147};
148const VOICED: Map<char, char> = phf_map! {
149    'カ' => 'ガ',
150    'キ' => 'ギ',
151    'ク' => 'グ',
152    'ケ' => 'ゲ',
153    'コ' => 'ゴ',
154    'サ' => 'ザ',
155    'シ' => 'ジ',
156    'ス' => 'ズ',
157    'セ' => 'ゼ',
158    'ソ' => 'ゾ',
159    'タ' => 'ダ',
160    'チ' => 'ヂ',
161    'ツ' => 'ヅ',
162    'テ' => 'デ',
163    'ト' => 'ド',
164    'ハ' => 'バ',
165    'ヒ' => 'ビ',
166    'フ' => 'ブ',
167    'ヘ' => 'ベ',
168    'ホ' => 'ボ',
169    'ウ' => 'ヴ',
170    'ワ' => 'ヷ',
171    'ヰ' => 'ヸ',
172    'ヱ' => 'ヹ',
173    'ヲ' => 'ヺ',
174    'ヽ' => 'ヾ',
175    'か' => 'が',
176    'き' => 'ぎ',
177    'く' => 'ぐ',
178    'け' => 'げ',
179    'こ' => 'ご',
180    'さ' => 'ざ',
181    'し' => 'じ',
182    'す' => 'ず',
183    'せ' => 'ぜ',
184    'そ' => 'ぞ',
185    'た' => 'だ',
186    'ち' => 'ぢ',
187    'つ' => 'づ',
188    'て' => 'で',
189    'と' => 'ど',
190    'は' => 'ば',
191    'ひ' => 'び',
192    'ふ' => 'ぶ',
193    'へ' => 'べ',
194    'ほ' => 'ぼ',
195    'う' => 'ゔ',
196};
197
#[cfg(test)]
mod tests {
    use crate::normalize_text_for_naist_jdic;

    // Halfwidth and fullwidth code points are spelled as `\u{..}` escapes (or
    // built from code-point ranges) throughout, so the inputs and expectations
    // cannot be altered by tooling that folds character width.

    /// Collects the inclusive code-point range `first..=last` into a `String`.
    fn span(first: char, last: char) -> String {
        (first..=last).collect()
    }

    /// Builds a string in which every character of `bases` is followed by `mark`.
    fn marked(bases: impl Iterator<Item = char>, mark: char) -> String {
        bases.flat_map(|base| [base, mark]).collect()
    }

    #[test]
    fn ascii() {
        // Space becomes U+3000; `"`, `'` and `-` have dedicated replacements;
        // the rest is shifted into the Fullwidth Forms block.
        assert_eq!(
            normalize_text_for_naist_jdic(" !\"#$%&'()*+,-./"),
            "\u{3000}\u{FF01}\u{201D}\u{FF03}\u{FF04}\u{FF05}\u{FF06}\u{2019}\
             \u{FF08}\u{FF09}\u{FF0A}\u{FF0B}\u{FF0C}\u{2212}\u{FF0E}\u{FF0F}"
        );
        assert_eq!(
            normalize_text_for_naist_jdic("0123456789"),
            span('\u{FF10}', '\u{FF19}')
        );
        assert_eq!(
            normalize_text_for_naist_jdic(":;<=>?@"),
            span('\u{FF1A}', '\u{FF20}')
        );
        assert_eq!(
            normalize_text_for_naist_jdic("ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
            span('\u{FF21}', '\u{FF3A}')
        );
        // Backslash maps to the fullwidth yen sign, backtick to U+2018.
        assert_eq!(
            normalize_text_for_naist_jdic("[\\]^_`"),
            "\u{FF3B}\u{FFE5}\u{FF3D}\u{FF3E}\u{FF3F}\u{2018}"
        );
        assert_eq!(
            normalize_text_for_naist_jdic("abcdefghijklmnopqrstuvwxyz"),
            span('\u{FF41}', '\u{FF5A}')
        );
        // Tilde maps to the wave dash.
        assert_eq!(
            normalize_text_for_naist_jdic("{|}~"),
            "\u{FF5B}\u{FF5C}\u{FF5D}\u{301C}"
        );
    }

    #[test]
    fn kana() {
        // Halfwidth u, ka..to and ha..ho, each followed by the halfwidth voiced
        // sound mark, then ha..ho followed by the halfwidth semi-voiced mark.
        let voiced = marked(
            std::iter::once('\u{FF73}')
                .chain('\u{FF76}'..='\u{FF84}')
                .chain('\u{FF8A}'..='\u{FF8E}'),
            '\u{FF9E}',
        );
        let semivoiced = marked('\u{FF8A}'..='\u{FF8E}', '\u{FF9F}');
        assert_eq!(
            normalize_text_for_naist_jdic(&(voiced + &semivoiced)),
            "ヴガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポ"
        );

        // Halfwidth punctuation U+FF61..=U+FF65.
        assert_eq!(
            normalize_text_for_naist_jdic(&span('\u{FF61}', '\u{FF65}')),
            "\u{3002}\u{300C}\u{300D}\u{3001}\u{30FB}"
        );

        // The whole halfwidth katakana range U+FF66..=U+FF9D.
        assert_eq!(
            normalize_text_for_naist_jdic(&span('\u{FF66}', '\u{FF9D}')),
            "ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン"
        );
    }

    #[test]
    fn diacritical() {
        // Sound marks with nothing to attach to are dropped.
        assert_eq!(normalize_text_for_naist_jdic("\u{FF9E}\u{FF9F}"), "");
        assert_eq!(normalize_text_for_naist_jdic("\u{3099}\u{309A}"), "");
        // Sound marks after a kana that has no voiced form are dropped.
        assert_eq!(normalize_text_for_naist_jdic("あ\u{309B}"), "あ");
        assert_eq!(normalize_text_for_naist_jdic("あ\u{309C}"), "あ");
        // Sound marks after a compatible kana are merged into it.
        assert_eq!(normalize_text_for_naist_jdic("は\u{309B}"), "ば");
        assert_eq!(normalize_text_for_naist_jdic("は\u{309C}"), "ぱ");
    }
}