1
2#[macro_use] extern crate lazy_static;
27extern crate regex;
28
29use std::char;
30use std::collections::HashMap;
31use regex::Regex;
32
/// Combining voiced sound mark, U+3099.
const CH_VOICED_COMBI: char = '\u{3099}';
/// Combining semi-voiced sound mark, U+309A.
const CH_SEMIVOICED_COMBI: char = '\u{309A}';
/// Full-width (spacing) voiced sound mark, U+309B.
const CH_VOICED_FULL: char = '\u{309B}';
/// Full-width (spacing) semi-voiced sound mark, U+309C.
const CH_SEMIVOICED_FULL: char = '\u{309C}';
/// Half-width voiced sound mark, U+FF9E.
const CH_VOICED_HALF: char = '\u{FF9E}';
/// Half-width semi-voiced sound mark, U+FF9F.
const CH_SEMIVOICED_HALF: char = '\u{FF9F}';
/// ASCII space, U+0020; also used as the end-of-input sentinel by the pairwise scans.
const CH_SPACE: char = ' ';
48
/// Combining voiced sound mark (U+3099) as a string.
const VOICED_COMBI: &str = "\u{3099}";
/// Combining semi-voiced sound mark (U+309A) as a string.
const SEMIVOICED_COMBI: &str = "\u{309A}";
/// ASCII space followed by the combining voiced sound mark.
const VOICED_WITH_SPACE: &str = "\u{20}\u{3099}";
/// ASCII space followed by the combining semi-voiced sound mark.
const SEMIVOICED_WITH_SPACE: &str = "\u{20}\u{309A}";
53
/// Pattern matching any voiced sound mark: the combining form (optionally
/// preceded by one space), the full-width form, or the half-width form.
const RE_VOICED_MARKS: &str = r"(?:\x20??\x{3099}|\x{309B}|\x{FF9E})";
/// Pattern matching any semi-voiced sound mark: the combining form (optionally
/// preceded by one space), the full-width form, or the half-width form.
const RE_SEMIVOICED_MARKS: &str = r"(?:\x20??\x{309A}|\x{309C}|\x{FF9F})";
58
59lazy_static! {
60 static ref SEMIVOICED_HALVES: HashMap<char,char> = [
61 ('\u{FF8A}', '\u{30D1}'), ('\u{FF8B}', '\u{30D4}'), ('\u{FF8C}', '\u{30D7}'), ('\u{FF8D}', '\u{30DA}'), ('\u{FF8E}', '\u{30DD}'), ].iter().copied().collect();
67
68 static ref VOICED_HALVES: HashMap<char,char> = [
69 ('\u{FF66}', '\u{30FA}'), ('\u{FF73}', '\u{30F4}'), ('\u{FF76}', '\u{30AC}'), ('\u{FF77}', '\u{30AE}'), ('\u{FF78}', '\u{30B0}'), ('\u{FF79}', '\u{30B2}'), ('\u{FF7A}', '\u{30B4}'), ('\u{FF7B}', '\u{30B6}'), ('\u{FF7C}', '\u{30B8}'), ('\u{FF7D}', '\u{30BA}'), ('\u{FF7E}', '\u{30BC}'), ('\u{FF7F}', '\u{30BE}'), ('\u{FF80}', '\u{30C0}'), ('\u{FF81}', '\u{30C2}'), ('\u{FF82}', '\u{30C5}'), ('\u{FF83}', '\u{30C7}'), ('\u{FF84}', '\u{30C9}'), ('\u{FF8A}', '\u{30D0}'), ('\u{FF8B}', '\u{30D3}'), ('\u{FF8C}', '\u{30D6}'), ('\u{FF8D}', '\u{30D9}'), ('\u{FF8E}', '\u{30DC}'), ('\u{FF9C}', '\u{30F7}'), ].iter().copied().collect();
93
94 static ref SEMIVOICES: HashMap<char,char> = [
95 ('\u{30CF}', '\u{30D1}'), ('\u{30D2}', '\u{30D4}'), ('\u{30D5}', '\u{30D7}'), ('\u{30D8}', '\u{30DA}'), ('\u{30DB}', '\u{30DD}'), ('\u{306F}', '\u{3071}'), ('\u{3072}', '\u{3074}'), ('\u{3075}', '\u{3077}'), ('\u{3078}', '\u{307A}'), ('\u{307B}', '\u{307D}'), ].iter().copied().collect();
106
107 static ref VOICES: HashMap<char,char> = [
108 ('\u{30A6}', '\u{30F4}'), ('\u{30AB}', '\u{30AC}'), ('\u{30AD}', '\u{30AE}'), ('\u{30AF}', '\u{30B0}'), ('\u{30B1}', '\u{30B2}'), ('\u{30B3}', '\u{30B4}'), ('\u{30B5}', '\u{30B6}'), ('\u{30B7}', '\u{30B8}'), ('\u{30B9}', '\u{30BA}'), ('\u{30BB}', '\u{30BC}'), ('\u{30BD}', '\u{30BE}'), ('\u{30BF}', '\u{30C0}'), ('\u{30C1}', '\u{30C2}'), ('\u{30C4}', '\u{30C5}'), ('\u{30C6}', '\u{30C7}'), ('\u{30C8}', '\u{30C9}'), ('\u{30CF}', '\u{30D0}'), ('\u{30D2}', '\u{30D3}'), ('\u{30D5}', '\u{30D6}'), ('\u{30D8}', '\u{30D9}'), ('\u{30DB}', '\u{30DC}'), ('\u{30EF}', '\u{30F7}'), ('\u{30F0}', '\u{30F8}'), ('\u{30F1}', '\u{30F9}'), ('\u{30F2}', '\u{30FA}'), ('\u{3046}', '\u{3094}'), ('\u{304B}', '\u{304C}'), ('\u{304D}', '\u{304E}'), ('\u{304F}', '\u{3050}'), ('\u{3051}', '\u{3052}'), ('\u{3053}', '\u{3054}'), ('\u{3055}', '\u{3056}'), ('\u{3057}', '\u{3058}'), ('\u{3059}', '\u{305A}'), ('\u{305B}', '\u{305C}'), ('\u{305D}', '\u{305E}'), ('\u{305F}', '\u{3060}'), ('\u{3061}', '\u{3062}'), ('\u{3064}', '\u{3065}'), ('\u{3066}', '\u{3067}'), ('\u{3068}', '\u{3069}'), ('\u{306F}', '\u{3070}'), ('\u{3072}', '\u{3073}'), ('\u{3075}', '\u{3076}'), ('\u{3078}', '\u{3079}'), ('\u{307B}', '\u{307C}'), ('\u{309D}', '\u{309E}'), ].iter().copied().collect();
156
157 static ref HALVES: HashMap<char,char> = [
158 ('\u{FF61}', '\u{3002}'), ('\u{FF62}', '\u{300C}'), ('\u{FF63}', '\u{300D}'), ('\u{FF64}', '\u{3001}'), ('\u{FF65}', '\u{30FB}'), ('\u{FF66}', '\u{30F2}'), ('\u{FF67}', '\u{30A1}'), ('\u{FF68}', '\u{30A3}'), ('\u{FF69}', '\u{30A5}'), ('\u{FF6A}', '\u{30A7}'), ('\u{FF6B}', '\u{30A9}'), ('\u{FF6C}', '\u{30E3}'), ('\u{FF6D}', '\u{30E5}'), ('\u{FF6E}', '\u{30E7}'), ('\u{FF6F}', '\u{30C3}'), ('\u{FF70}', '\u{30FC}'), ('\u{FF71}', '\u{30A2}'), ('\u{FF72}', '\u{30A4}'), ('\u{FF73}', '\u{30A6}'), ('\u{FF74}', '\u{30A8}'), ('\u{FF75}', '\u{30AA}'), ('\u{FF76}', '\u{30AB}'), ('\u{FF77}', '\u{30AD}'), ('\u{FF78}', '\u{30AF}'), ('\u{FF79}', '\u{30B1}'), ('\u{FF7A}', '\u{30B3}'), ('\u{FF7B}', '\u{30B5}'), ('\u{FF7C}', '\u{30B7}'), ('\u{FF7D}', '\u{30B9}'), ('\u{FF7E}', '\u{30BB}'), ('\u{FF7F}', '\u{30BD}'), ('\u{FF80}', '\u{30BF}'), ('\u{FF81}', '\u{30C1}'), ('\u{FF82}', '\u{30C4}'), ('\u{FF83}', '\u{30C6}'), ('\u{FF84}', '\u{30C8}'), ('\u{FF85}', '\u{30CA}'), ('\u{FF86}', '\u{30CB}'), ('\u{FF87}', '\u{30CC}'), ('\u{FF88}', '\u{30CD}'), ('\u{FF89}', '\u{30CE}'), ('\u{FF8A}', '\u{30CF}'), ('\u{FF8B}', '\u{30D2}'), ('\u{FF8C}', '\u{30D5}'), ('\u{FF8D}', '\u{30D8}'), ('\u{FF8E}', '\u{30DB}'), ('\u{FF8F}', '\u{30DE}'), ('\u{FF90}', '\u{30DF}'), ('\u{FF91}', '\u{30E0}'), ('\u{FF92}', '\u{30E1}'), ('\u{FF93}', '\u{30E2}'), ('\u{FF94}', '\u{30E4}'), ('\u{FF95}', '\u{30E6}'), ('\u{FF96}', '\u{30E8}'), ('\u{FF97}', '\u{30E9}'), ('\u{FF98}', '\u{30EA}'), ('\u{FF99}', '\u{30EB}'), ('\u{FF9A}', '\u{30EC}'), ('\u{FF9B}', '\u{30ED}'), ('\u{FF9C}', '\u{30EF}'), ('\u{FF9D}', '\u{30F3}'), ('\u{FF9E}', '\u{3099}'), ('\u{FF9F}', '\u{309A}'), ].iter().copied().collect();
224}
225
/// Rewrites `src` character by character: a character whose code point
/// satisfies `judge` is replaced by the character at `convert(code point)`;
/// every other character is copied unchanged.
///
/// # Panics
///
/// Panics if `convert` returns a value that is not a valid Unicode scalar
/// value for a character accepted by `judge`.
fn shift_code<F, G>(judge: F, convert: G, src: &str) -> String
where
    F: Fn(u32) -> bool,
    G: Fn(u32) -> u32,
{
    let mut shifted = String::with_capacity(src.len());
    for ch in src.chars() {
        let code = ch as u32;
        if judge(code) {
            shifted.push(char::from_u32(convert(code)).unwrap());
        } else {
            shifted.push(ch);
        }
    }
    shifted
}
235
236pub fn wide2ascii(s: &str) -> String {
242 shift_code(|x| 0xff00 < x && x < 0xff5f, |x| x - 0xfee0, s)
243}
244
245pub fn ascii2wide(s: &str) -> String {
251 shift_code(|x| 0x0020 < x && x < 0x007f, |x| x + 0xfee0, s)
252}
253
254pub fn hira2kata(s: &str) -> String {
260 shift_code(|x| 0x3041 <= x && x <= 0x3096, |x| x + 0x0060, s)
261}
262
263pub fn kata2hira(s: &str) -> String {
269 shift_code(|x| 0x30A1 <= x && x <= 0x30F6, |x| x - 0x0060, s)
270}
271
// Helper for the pairwise scans in `half2kana` and `combine`.
//
// When `$is_mark($mark)` holds and `$table` has an entry for `$base`, the
// composed character is appended to `$out` and the *enclosing closure*
// returns `None`, which consumes both characters of the pair. Otherwise the
// macro does nothing and control falls through to the code that follows it.
macro_rules! push_content {
    ($is_mark:expr, $table:expr, $out:expr, $base:expr, $mark:expr) => {
        if $is_mark($mark) {
            if let Some(&composed) = $table.get(&$base) {
                $out.push(composed);
                return None;
            }
        }
    };
}
282
283pub fn half2full(s: &str) -> String {
292 s.chars().map(|c| consult(&HALVES, &c)).collect()
293}
294
295pub fn half2kana(s: &str) -> String {
301 let mut line = String::with_capacity(s.len());
302 format!("{} ", s).chars().fold(None, |prev, b| {
303 if let Some(a) = prev {
304 push_content!(|b| b == CH_VOICED_HALF,
305 VOICED_HALVES, line, a, b);
306 push_content!(|b| b == CH_SEMIVOICED_HALF,
307 SEMIVOICED_HALVES, line, a, b);
308 if a == CH_VOICED_HALF ||
309 a == CH_SEMIVOICED_HALF { line.push(CH_SPACE); }
310 line.push(consult(&HALVES, &a));
311 }
312 Some(b)
313 } );
314
315 line
316}
317
318pub fn combine(s: &str) -> String {
324 let ss = despace(s);
325 let mut line = String::with_capacity(ss.len());
326 format!("{} ", ss).chars().fold(None, |prev, b| {
327 if let Some(a) = prev {
328 push_content!(|b| b == CH_VOICED_HALF ||
329 b == CH_VOICED_FULL ||
330 b == CH_VOICED_COMBI,
331 VOICES, line, a, b);
332 push_content!(|b| b == CH_SEMIVOICED_HALF ||
333 b == CH_SEMIVOICED_FULL ||
334 b == CH_SEMIVOICED_COMBI,
335 SEMIVOICES, line, a, b);
336 line.push(a);
337 }
338 Some(b)
339 } );
340
341 enspace(&line)
342}
343
/// Looks `c` up in `table`, returning the mapped character, or `c` itself
/// when the table has no entry for it.
fn consult(table: &HashMap<char, char>, c: &char) -> char {
    table.get(c).copied().unwrap_or(*c)
}
350
351fn despace(s: &str) -> String {
352 let s_ = &s.replace(VOICED_WITH_SPACE, VOICED_COMBI);
353 s_.replace(SEMIVOICED_WITH_SPACE, SEMIVOICED_COMBI)
354}
355
356fn enspace(s: &str) -> String {
357 let s_ = &s.replace(VOICED_COMBI, VOICED_WITH_SPACE);
358 s_.replace(SEMIVOICED_COMBI, SEMIVOICED_WITH_SPACE)
359}
360
361fn replace_marks(vmark: &str, svmark: &str, src: &str) -> String {
362 lazy_static! {
363 static ref RE1: Regex = Regex::new(RE_VOICED_MARKS).unwrap();
364 static ref RE2: Regex = Regex::new(RE_SEMIVOICED_MARKS).unwrap();
365 }
366 let s_ = RE1.replace_all(src, vmark);
367 RE2.replace_all(&s_, svmark)
368}
369
370pub fn vsmark2half(s: &str) -> String {
376 replace_marks(&CH_VOICED_HALF.to_string(),
377 &CH_SEMIVOICED_HALF.to_string(), s)
378}
379
380pub fn vsmark2full(s: &str) -> String {
386 replace_marks(&CH_VOICED_FULL.to_string(),
387 &CH_SEMIVOICED_FULL.to_string(), s)
388}
389
390pub fn vsmark2combi(s: &str) -> String {
396 replace_marks(&VOICED_WITH_SPACE, &SEMIVOICED_WITH_SPACE, s)
397}
398
/// Replaces every ideographic (full-width) space U+3000 in `s` with an ASCII
/// space U+0020.
pub fn nowidespace(s: &str) -> String {
    s.chars()
        .map(|c| if c == '\u{3000}' { '\u{20}' } else { c })
        .collect()
}
401
/// Replaces every ASCII space U+0020 in `s` with an ideographic (full-width)
/// space U+3000.
pub fn space2wide(s: &str) -> String {
    s.chars()
        .map(|c| if c == '\u{20}' { '\u{3000}' } else { c })
        .collect()
}
404
/// Replaces every full-width yen sign U+FFE5 in `s` with the yen sign U+00A5.
pub fn nowideyen(s: &str) -> String {
    s.chars()
        .map(|c| if c == '\u{ffe5}' { '\u{a5}' } else { c })
        .collect()
}
407
/// Replaces every yen sign U+00A5 in `s` with the full-width yen sign U+FFE5.
pub fn yen2wide(s: &str) -> String {
    s.chars()
        .map(|c| if c == '\u{a5}' { '\u{ffe5}' } else { c })
        .collect()
}
410
411
#[cfg(test)]
mod tests {
    use super::*;

    // Every non-ASCII character below is written as a `\u{...}` escape so the
    // half-width / full-width distinction under test cannot be lost to
    // width or composition normalization of this source file.

    /// ASCII <-> full-width ASCII and hiragana <-> katakana shifts.
    #[test]
    fn pub_fn_t1() {
        // Full-width spelling of "!rust-0;".
        let wide = "\u{FF01}\u{FF52}\u{FF55}\u{FF53}\u{FF54}\u{FF0D}\u{FF10}\u{FF1B}";
        assert_eq!("!rust-0;", wide2ascii(wide));
        assert_eq!(wide, ascii2wide("!rust-0;"));

        // Hiragana KA NA <-> katakana KA NA.
        assert_eq!("\u{30AB}\u{30CA}", hira2kata("\u{304B}\u{306A}"));
        assert_eq!("\u{304B}\u{306A}", kata2hira("\u{30AB}\u{30CA}"));
    }

    /// Space and yen sign width conversions.
    #[test]
    fn pub_fn_t2() {
        assert_eq!("\u{20}", nowidespace("\u{3000}"));
        assert_eq!("\u{3000}", space2wide("\u{20}"));
        assert_eq!("\u{A5}", nowideyen("\u{FFE5}"));
        assert_eq!("\u{FFE5}", yen2wide("\u{A5}"));
    }

    /// Half-width katakana conversion and sound mark composition.
    #[test]
    fn kana_t1() {
        assert_eq!(Some(&'\u{30A2}'), HALVES.get(&'\u{FF71}'));

        // Half-width KA, half-width voiced sound mark, half-width NA.
        let half = "\u{FF76}\u{FF9E}\u{FF85}";
        // `half2full` maps character by character: the mark stays separate,
        // as the combining mark U+3099.
        assert_eq!("\u{30AB}\u{3099}\u{30CA}", half2full(half));
        // `half2kana` merges the letter and the mark into precomposed GA.
        assert_eq!("\u{30AC}\u{30CA}", half2kana(half));

        // Hiragana KA + full-width voiced sound mark + NA -> GA NA.
        assert_eq!("\u{304C}\u{306A}", combine("\u{304B}\u{309B}\u{306A}"));
    }
}