1#[macro_use] extern crate lazy_static;
2#[macro_use] extern crate maplit;
3
4use std::char;
5use std::collections::HashMap;
6use std::hash::Hash;
7
8include!("constants.rs");
9
/// Looks up the weight stored for feature `s` in the table `d`.
///
/// Returns the stored value when `s` is a key of `d`, and `0` otherwise, so
/// features without an entry contribute nothing to a running sum.
fn get_score<T: Eq + Hash>(d: &HashMap<T, i32>, s: &T) -> i32 {
    match d.get(s) {
        Some(&weight) => weight,
        None => 0,
    }
}
13
/// Maps a character to a one-letter character-class tag.
///
/// The arms are tried top to bottom, so the kanji numerals listed first win
/// over the broader ideograph range that also contains them.
///
/// * `'M'` - kanji numerals (U+4E00 一 through U+5146 兆, as listed below)
/// * `'H'` - code points U+4E00..=U+9FA0 plus U+3005, U+3006, U+30F5, U+30F6
/// * `'I'` - hiragana U+3041..=U+3093
/// * `'K'` - katakana U+30A1..=U+30F4, U+30FC, and half-width U+FF70..=U+FF9E
/// * `'A'` - ASCII and full-width Latin letters
/// * `'N'` - ASCII and full-width decimal digits
/// * `'O'` - everything else
fn get_ctype(c: char) -> char {
    match c as u32 {
        // 一 二 三 四 五 六 七 八 九 十
        0x4E00 | 0x4E8C | 0x4E09 | 0x56DB | 0x4E94 | 0x516D | 0x4E03 | 0x516B | 0x4E5D | 0x5341 => 'M',
        // 百 千 万 億 兆
        0x767E | 0x5343 | 0x4E07 | 0x5104 | 0x5146 => 'M',
        0x4E00..=0x9FA0 | 0x3005 | 0x3006 | 0x30F5 | 0x30F6 => 'H',
        0x3041..=0x3093 => 'I',
        0x30A1..=0x30F4 | 0x30FC | 0xFF71..=0xFF9D | 0xFF9E | 0xFF70 => 'K',
        0x61..=0x7A | 0x41..=0x5A | 0xFF41..=0xFF5A | 0xFF21..=0xFF3A => 'A',
        // ASCII digits end at 0x39 ('9'); 0x3A is ':' and must not be
        // classified as numeric.
        0x30..=0x39 | 0xFF10..=0xFF19 => 'N',
        _ => 'O',
    }
}
26
27pub fn tokenize(s: &str) -> Vec<String> {
28 if s.is_empty() {
29 return Vec::new();
30 }
31
32 let mut result = Vec::with_capacity(s.chars().count());
33
34 let segments =
35 vec!(*B3, *B2, *B1).into_iter()
36 .chain(s.chars())
37 .chain(vec!(*E1, *E2, *E3).into_iter())
38 .collect::<Vec<_>>();
39
40 let ctypes =
41 vec!('O'; 3).into_iter()
42 .chain(s.chars().map(get_ctype))
43 .chain(vec!('O'; 3).into_iter())
44 .collect::<Vec<_>>();
45
46 let mut word = segments[3].to_string();
47 let mut p = vec!('U'; 3);
48
49 for index in 4 .. segments.len() - 3 {
50 let mut score = BIAS;
51 let w = &segments[index - 3 .. index + 3];
52 let c = &ctypes[index - 3 .. index + 3];
53
54 score = score + get_score(&*UP1, &p[0]);
55 score = score + get_score(&*UP2, &p[1]);
56 score = score + get_score(&*UP3, &p[2]);
57 score = score + get_score(&*BP1, &(p[0], p[1]));
58 score = score + get_score(&*BP2, &(p[1], p[2]));
59 score = score + get_score(&*UW1, &w[0]);
60 score = score + get_score(&*UW2, &w[1]);
61 score = score + get_score(&*UW3, &w[2]);
62 score = score + get_score(&*UW4, &w[3]);
63 score = score + get_score(&*UW5, &w[4]);
64 score = score + get_score(&*UW6, &w[5]);
65 score = score + get_score(&*BW1, &(w[1], w[2]));
66 score = score + get_score(&*BW2, &(w[2], w[3]));
67 score = score + get_score(&*BW3, &(w[3], w[4]));
68 score = score + get_score(&*TW1, &(w[0], w[1], w[2]));
69 score = score + get_score(&*TW2, &(w[1], w[2], w[3]));
70 score = score + get_score(&*TW3, &(w[2], w[3], w[4]));
71 score = score + get_score(&*TW4, &(w[3], w[4], w[5]));
72 score = score + get_score(&*UC1, &c[0]);
73 score = score + get_score(&*UC2, &c[1]);
74 score = score + get_score(&*UC3, &c[2]);
75 score = score + get_score(&*UC4, &c[3]);
76 score = score + get_score(&*UC5, &c[4]);
77 score = score + get_score(&*UC6, &c[5]);
78 score = score + get_score(&*BC1, &(c[1], c[2]));
79 score = score + get_score(&*BC2, &(c[2], c[3]));
80 score = score + get_score(&*BC3, &(c[3], c[4]));
81 score = score + get_score(&*TC1, &(c[0], c[1], c[2]));
82 score = score + get_score(&*TC2, &(c[1], c[2], c[3]));
83 score = score + get_score(&*TC3, &(c[2], c[3], c[4]));
84 score = score + get_score(&*TC4, &(c[3], c[4], c[5]));
85 score = score + get_score(&*UQ1, &(p[0], c[0]));
86 score = score + get_score(&*UQ2, &(p[1], c[1]));
87 score = score + get_score(&*UQ3, &(p[2], c[2]));
88 score = score + get_score(&*BQ1, &(p[1], c[1], c[2]));
89 score = score + get_score(&*BQ2, &(p[1], c[2], c[3]));
90 score = score + get_score(&*BQ3, &(p[2], c[1], c[2]));
91 score = score + get_score(&*BQ4, &(p[2], c[2], c[3]));
92 score = score + get_score(&*TQ1, &(p[1], c[0], c[1], c[2]));
93 score = score + get_score(&*TQ2, &(p[1], c[1], c[2], c[3]));
94 score = score + get_score(&*TQ3, &(p[2], c[0], c[1], c[2]));
95 score = score + get_score(&*TQ4, &(p[2], c[1], c[2], c[3]));
96
97 p.remove(0);
98 p.push(if score < 0 { 'O' } else { 'B' });
99
100 if 0 < score {
101 result.push(word.clone());
102 word.clear();
103 }
104 word.push(segments[index]);
105 }
106
107 result.push(word.clone());
108 result
109}