tinysegmenter/
lib.rs

1#[macro_use] extern crate lazy_static;
2#[macro_use] extern crate maplit;
3
4use std::char;
5use std::collections::HashMap;
6use std::hash::Hash;
7
8include!("constants.rs");
9
/// Looks up the weight stored for the feature `s` in the table `d`.
///
/// Returns the stored value when `s` is a key of `d`, and `0` when it is not,
/// so absent features contribute nothing to a running sum.
fn get_score<T: Eq + Hash>(d: &HashMap<T, i32>, s: &T) -> i32 {
  match d.get(s) {
    Some(&weight) => weight,
    None => 0,
  }
}
13
/// Classifies `c` into a single-letter character class.
///
/// The classes, tested in this order (the first match wins), are:
///
/// * `'M'` - the fifteen kanji numerals (one to ten, hundred, thousand,
///   ten-thousand, hundred-million, trillion)
/// * `'H'` - code points U+4E00 through U+9FA0, plus U+3005, U+3006, U+30F5
///   and U+30F6
/// * `'I'` - hiragana, U+3041 through U+3093
/// * `'K'` - katakana: U+30A1 through U+30F4, the long-vowel mark U+30FC, and
///   the half-width forms U+FF70 through U+FF9E
/// * `'A'` - ASCII and full-width Latin letters
/// * `'N'` - ASCII and full-width decimal digits
/// * `'O'` - anything else
fn get_ctype(c: char) -> char {
  match c as u32 {
    // The numerals lie inside the U+4E00..=U+9FA0 range handled by the next
    // arm, so they must be matched first.
    0x4E00 | 0x4E8C | 0x4E09 | 0x56DB | 0x4E94 | 0x516D | 0x4E03 | 0x516B | 0x4E5D | 0x5341
    | 0x767E | 0x5343 | 0x4E07 | 0x5104 | 0x5146 => 'M',
    0x4E00..=0x9FA0 | 0x3005 | 0x3006 | 0x30F5 | 0x30F6 => 'H',
    0x3041..=0x3093 => 'I',
    0x30A1..=0x30F4 | 0x30FC | 0xFF71..=0xFF9D | 0xFF9E | 0xFF70 => 'K',
    // a-z, A-Z and their full-width counterparts.
    0x61..=0x7A | 0x41..=0x5A | 0xFF41..=0xFF5A | 0xFF21..=0xFF3A => 'A',
    // '0'-'9' only: the upper bound is 0x39, so ':' (0x3A) is not a digit.
    0x30..=0x39 | 0xFF10..=0xFF19 => 'N',
    _ => 'O',
  }
}
26
27pub fn tokenize(s: &str) -> Vec<String> {
28  if s.is_empty() {
29    return Vec::new();
30  }
31
32  let mut result = Vec::with_capacity(s.chars().count());
33
34  let segments =
35    vec!(*B3, *B2, *B1).into_iter()
36    .chain(s.chars())
37    .chain(vec!(*E1, *E2, *E3).into_iter())
38    .collect::<Vec<_>>();
39
40  let ctypes =
41    vec!('O'; 3).into_iter()
42    .chain(s.chars().map(get_ctype))
43    .chain(vec!('O'; 3).into_iter())
44    .collect::<Vec<_>>();
45
46  let mut word = segments[3].to_string();
47  let mut p = vec!('U'; 3);
48
49  for index in 4 .. segments.len() - 3 {
50    let mut score = BIAS;
51    let w = &segments[index - 3 .. index + 3];
52    let c = &ctypes[index - 3 .. index + 3];
53
54    score = score + get_score(&*UP1, &p[0]);
55    score = score + get_score(&*UP2, &p[1]);
56    score = score + get_score(&*UP3, &p[2]);
57    score = score + get_score(&*BP1, &(p[0], p[1]));
58    score = score + get_score(&*BP2, &(p[1], p[2]));
59    score = score + get_score(&*UW1, &w[0]);
60    score = score + get_score(&*UW2, &w[1]);
61    score = score + get_score(&*UW3, &w[2]);
62    score = score + get_score(&*UW4, &w[3]);
63    score = score + get_score(&*UW5, &w[4]);
64    score = score + get_score(&*UW6, &w[5]);
65    score = score + get_score(&*BW1, &(w[1], w[2]));
66    score = score + get_score(&*BW2, &(w[2], w[3]));
67    score = score + get_score(&*BW3, &(w[3], w[4]));
68    score = score + get_score(&*TW1, &(w[0], w[1], w[2]));
69    score = score + get_score(&*TW2, &(w[1], w[2], w[3]));
70    score = score + get_score(&*TW3, &(w[2], w[3], w[4]));
71    score = score + get_score(&*TW4, &(w[3], w[4], w[5]));
72    score = score + get_score(&*UC1, &c[0]);
73    score = score + get_score(&*UC2, &c[1]);
74    score = score + get_score(&*UC3, &c[2]);
75    score = score + get_score(&*UC4, &c[3]);
76    score = score + get_score(&*UC5, &c[4]);
77    score = score + get_score(&*UC6, &c[5]);
78    score = score + get_score(&*BC1, &(c[1], c[2]));
79    score = score + get_score(&*BC2, &(c[2], c[3]));
80    score = score + get_score(&*BC3, &(c[3], c[4]));
81    score = score + get_score(&*TC1, &(c[0], c[1], c[2]));
82    score = score + get_score(&*TC2, &(c[1], c[2], c[3]));
83    score = score + get_score(&*TC3, &(c[2], c[3], c[4]));
84    score = score + get_score(&*TC4, &(c[3], c[4], c[5]));
85    score = score + get_score(&*UQ1, &(p[0], c[0]));
86    score = score + get_score(&*UQ2, &(p[1], c[1]));
87    score = score + get_score(&*UQ3, &(p[2], c[2]));
88    score = score + get_score(&*BQ1, &(p[1], c[1], c[2]));
89    score = score + get_score(&*BQ2, &(p[1], c[2], c[3]));
90    score = score + get_score(&*BQ3, &(p[2], c[1], c[2]));
91    score = score + get_score(&*BQ4, &(p[2], c[2], c[3]));
92    score = score + get_score(&*TQ1, &(p[1], c[0], c[1], c[2]));
93    score = score + get_score(&*TQ2, &(p[1], c[1], c[2], c[3]));
94    score = score + get_score(&*TQ3, &(p[2], c[0], c[1], c[2]));
95    score = score + get_score(&*TQ4, &(p[2], c[1], c[2], c[3]));
96
97    p.remove(0);
98    p.push(if score < 0 { 'O' } else { 'B' });
99
100    if 0 < score {
101      result.push(word.clone());
102      word.clear();
103    }
104    word.push(segments[index]);
105  }
106
107  result.push(word.clone());
108  result
109}