1use phf::{phf_map, phf_set, Map, Set};
2
3pub fn normalize_text_for_naist_jdic(input_text: &str) -> String {
5 let (mut s, c) = input_text
6 .chars()
7 .map(|c| {
8 if let Some(replacement) = HALFWIDTH.get(&c) {
9 *replacement
10 } else if '\u{0020}' < c && c < '\u{007f}' {
11 char::from_u32((c as u32) + 0xfee0).unwrap()
12 } else {
13 c
14 }
15 })
16 .fold(
17 (String::with_capacity(input_text.len()), None),
18 |(mut acc, prev), curr| {
19 let semivoiced = SEMIVOICED_SOUND_MARK.contains(&curr);
20 let voiced = VOICED_SOUND_MARK.contains(&curr);
21
22 let combined = if semivoiced {
23 prev.and_then(|p| SEMIVOICED.get(&p))
24 } else if voiced {
25 prev.and_then(|p| VOICED.get(&p))
26 } else {
27 None
28 };
29
30 if let Some(combined) = combined {
31 acc.push(*combined);
32 } else if let Some(prev_char) = prev {
33 acc.push(prev_char);
34 }
35
36 if semivoiced || voiced {
37 (acc, None)
38 } else {
39 (acc, Some(curr))
40 }
41 },
42 );
43
44 if let Some(c) = c {
45 s.push(c);
46 }
47 s
48}
49
50const HALFWIDTH: Map<char, char> = phf_map! {
51 ' ' => '\u{3000}', '\u{a5}' => '\u{FFE5}', '\\' => '\u{FFE5}', '-' => '\u{2212}', '~' => '\u{301C}', '`' => '\u{2018}', '\"' => '\u{201D}', '\'' => '\u{2019}', '\u{FF61}' => '\u{3002}', '\u{FF62}' => '\u{300C}', '\u{FF63}' => '\u{300D}', '\u{FF64}' => '\u{3001}', '\u{FF65}' => '\u{30FB}', 'ヲ' => 'ヲ',
68 'ァ' => 'ァ',
69 'ィ' => 'ィ',
70 'ゥ' => 'ゥ',
71 'ェ' => 'ェ',
72 'ォ' => 'ォ',
73 'ャ' => 'ャ',
74 'ュ' => 'ュ',
75 'ョ' => 'ョ',
76 'ッ' => 'ッ',
77 'ー' => 'ー',
78 'ア' => 'ア',
79 'イ' => 'イ',
80 'ウ' => 'ウ',
81 'エ' => 'エ',
82 'オ' => 'オ',
83 'カ' => 'カ',
84 'キ' => 'キ',
85 'ク' => 'ク',
86 'ケ' => 'ケ',
87 'コ' => 'コ',
88 'サ' => 'サ',
89 'シ' => 'シ',
90 'ス' => 'ス',
91 'セ' => 'セ',
92 'ソ' => 'ソ',
93 'タ' => 'タ',
94 'チ' => 'チ',
95 'ツ' => 'ツ',
96 'テ' => 'テ',
97 'ト' => 'ト',
98 'ナ' => 'ナ',
99 'ニ' => 'ニ',
100 'ヌ' => 'ヌ',
101 'ネ' => 'ネ',
102 'ノ' => 'ノ',
103 'ハ' => 'ハ',
104 'ヒ' => 'ヒ',
105 'フ' => 'フ',
106 'ヘ' => 'ヘ',
107 'ホ' => 'ホ',
108 'マ' => 'マ',
109 'ミ' => 'ミ',
110 'ム' => 'ム',
111 'メ' => 'メ',
112 'モ' => 'モ',
113 'ヤ' => 'ヤ',
114 'ユ' => 'ユ',
115 'ヨ' => 'ヨ',
116 'ラ' => 'ラ',
117 'リ' => 'リ',
118 'ル' => 'ル',
119 'レ' => 'レ',
120 'ロ' => 'ロ',
121 'ワ' => 'ワ',
122 'ン' => 'ン',
123};
124
125const SEMIVOICED_SOUND_MARK: Set<char> = phf_set! {
126 '\u{309A}', '\u{309C}', '\u{FF9F}', };
130const SEMIVOICED: Map<char, char> = phf_map! {
131 'ハ' => 'パ',
132 'ヒ' => 'ピ',
133 'フ' => 'プ',
134 'ヘ' => 'ペ',
135 'ホ' => 'ポ',
136 'は' => 'ぱ',
137 'ひ' => 'ぴ',
138 'ふ' => 'ぷ',
139 'へ' => 'ぺ',
140 'ほ' => 'ぽ',
141};
142
143const VOICED_SOUND_MARK: Set<char> = phf_set! {
144 '\u{3099}', '\u{309B}', '\u{FF9E}', };
148const VOICED: Map<char, char> = phf_map! {
149 'カ' => 'ガ',
150 'キ' => 'ギ',
151 'ク' => 'グ',
152 'ケ' => 'ゲ',
153 'コ' => 'ゴ',
154 'サ' => 'ザ',
155 'シ' => 'ジ',
156 'ス' => 'ズ',
157 'セ' => 'ゼ',
158 'ソ' => 'ゾ',
159 'タ' => 'ダ',
160 'チ' => 'ヂ',
161 'ツ' => 'ヅ',
162 'テ' => 'デ',
163 'ト' => 'ド',
164 'ハ' => 'バ',
165 'ヒ' => 'ビ',
166 'フ' => 'ブ',
167 'ヘ' => 'ベ',
168 'ホ' => 'ボ',
169 'ウ' => 'ヴ',
170 'ワ' => 'ヷ',
171 'ヰ' => 'ヸ',
172 'ヱ' => 'ヹ',
173 'ヲ' => 'ヺ',
174 'ヽ' => 'ヾ',
175 'か' => 'が',
176 'き' => 'ぎ',
177 'く' => 'ぐ',
178 'け' => 'げ',
179 'こ' => 'ご',
180 'さ' => 'ざ',
181 'し' => 'じ',
182 'す' => 'ず',
183 'せ' => 'ぜ',
184 'そ' => 'ぞ',
185 'た' => 'だ',
186 'ち' => 'ぢ',
187 'つ' => 'づ',
188 'て' => 'で',
189 'と' => 'ど',
190 'は' => 'ば',
191 'ひ' => 'び',
192 'ふ' => 'ぶ',
193 'へ' => 'べ',
194 'ほ' => 'ぼ',
195 'う' => 'ゔ',
196};
197
#[cfg(test)]
mod tests {
    use super::normalize_text_for_naist_jdic;

    /// Builds a string from an inclusive range of code points.
    ///
    /// Expected values are spelled by code point rather than by glyph because ASCII and
    /// fullwidth forms look alike and are silently swapped by width-normalizing tools.
    fn code_points(first: char, last: char) -> String {
        (first..=last).collect()
    }

    #[test]
    fn ascii() {
        // Space, '"', '\'' and '-' have dedicated replacements; every other character
        // here is shifted into the fullwidth forms block.
        assert_eq!(
            normalize_text_for_naist_jdic(" !\"#$%&'()*+,-./"),
            concat!(
                "\u{3000}\u{FF01}\u{201D}\u{FF03}\u{FF04}\u{FF05}\u{FF06}\u{2019}",
                "\u{FF08}\u{FF09}\u{FF0A}\u{FF0B}\u{FF0C}\u{2212}\u{FF0E}\u{FF0F}"
            )
        );
        assert_eq!(
            normalize_text_for_naist_jdic("0123456789"),
            code_points('\u{FF10}', '\u{FF19}')
        );
        assert_eq!(
            normalize_text_for_naist_jdic(":;<=>?@"),
            code_points('\u{FF1A}', '\u{FF20}')
        );
        assert_eq!(
            normalize_text_for_naist_jdic("ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
            code_points('\u{FF21}', '\u{FF3A}')
        );
        // Backslash becomes the fullwidth yen sign and '`' a left single quotation mark.
        assert_eq!(
            normalize_text_for_naist_jdic("[\\]^_`"),
            "\u{FF3B}\u{FFE5}\u{FF3D}\u{FF3E}\u{FF3F}\u{2018}"
        );
        assert_eq!(
            normalize_text_for_naist_jdic("abcdefghijklmnopqrstuvwxyz"),
            code_points('\u{FF41}', '\u{FF5A}')
        );
        // '~' becomes the wave dash rather than the fullwidth tilde.
        assert_eq!(
            normalize_text_for_naist_jdic("{|}~"),
            "\u{FF5B}\u{FF5C}\u{FF5D}\u{301C}"
        );
        // The Latin-1 yen sign maps to the same fullwidth yen sign as backslash.
        assert_eq!(normalize_text_for_naist_jdic("\u{a5}"), "\u{FFE5}");
    }

    #[test]
    fn kana() {
        // Halfwidth CJK punctuation is replaced by its fullwidth counterpart.
        assert_eq!(
            normalize_text_for_naist_jdic("\u{FF61}\u{FF62}\u{FF63}\u{FF64}\u{FF65}"),
            "\u{3002}\u{300C}\u{300D}\u{3001}\u{30FB}"
        );
        // Text that is already fullwidth passes through unchanged.
        assert_eq!(
            normalize_text_for_naist_jdic("ヴガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポ"),
            "ヴガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポ"
        );
        assert_eq!(normalize_text_for_naist_jdic("。「」、・"), "。「」、・");
        assert_eq!(
            normalize_text_for_naist_jdic("ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン"),
            "ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン"
        );
    }

    #[test]
    fn diacritical() {
        // Sound marks with nothing to attach to are dropped, whichever form they take.
        assert_eq!(normalize_text_for_naist_jdic("\u{3099}\u{309A}"), "");
        assert_eq!(normalize_text_for_naist_jdic("\u{309B}\u{309C}"), "");
        assert_eq!(normalize_text_for_naist_jdic("\u{FF9E}\u{FF9F}"), "");

        // A mark after a character with no voiced / semi-voiced form is dropped.
        assert_eq!(normalize_text_for_naist_jdic("あ\u{309B}"), "あ");
        assert_eq!(normalize_text_for_naist_jdic("あ\u{309C}"), "あ");

        // A mark after a character with a table entry is merged into it.
        assert_eq!(normalize_text_for_naist_jdic("は\u{309B}"), "ば");
        assert_eq!(normalize_text_for_naist_jdic("は\u{309C}"), "ぱ");
        assert_eq!(normalize_text_for_naist_jdic("は\u{3099}"), "ば");
        assert_eq!(normalize_text_for_naist_jdic("は\u{309A}"), "ぱ");
        assert_eq!(normalize_text_for_naist_jdic("カ\u{FF9E}"), "ガ");
        assert_eq!(normalize_text_for_naist_jdic("ハ\u{FF9F}"), "パ");

        // Only the first mark after a character can merge; a second one is dropped.
        assert_eq!(normalize_text_for_naist_jdic("は\u{309B}\u{309B}"), "ば");
    }
}