1use crate::model::Model;
2#[cfg(feature = "ja")]
3use crate::model_ja;
4#[cfg(feature = "ja_knbc")]
5use crate::model_ja_knbc;
6#[cfg(feature = "th")]
7use crate::model_th;
8#[cfg(feature = "zh_hans")]
9use crate::model_zh_hans;
10#[cfg(feature = "zh_hant")]
11use crate::model_zh_hant;
12
13pub struct Parser {
14 pub model: Model,
15}
16
17impl Parser {
18 pub fn new(model: Model) -> Parser {
19 Self { model }
20 }
21
22 #[cfg(feature = "ja")]
23 pub fn japanese_parser() -> Parser {
24 Parser {
25 model: model_ja::new(),
26 }
27 }
28 #[cfg(feature = "ja_knbc")]
29 pub fn japanese_knbc_parser() -> Parser {
30 Parser {
31 model: model_ja_knbc::new(),
32 }
33 }
34 #[cfg(feature = "zh_hans")]
35 pub fn simplified_chinese_parser() -> Parser {
36 Parser {
37 model: model_zh_hans::new(),
38 }
39 }
40 #[cfg(feature = "zh_hant")]
41 pub fn traditional_chinese_parser() -> Parser {
42 Parser {
43 model: model_zh_hant::new(),
44 }
45 }
46 #[cfg(feature = "th")]
47 pub fn thai_parser() -> Parser {
48 Parser {
49 model: model_th::new(),
50 }
51 }
52
53 pub fn parse<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
54 if sentence.is_empty() {
55 return vec![];
56 }
57 let total_score = -(self.model.total_score() / 2);
58 let ci: Vec<usize> = sentence.char_indices().map(|(i, _)| i).collect();
59 let len = ci.len();
60 let mut start = ci[0];
61 let mut chunks = vec![];
62
63 for i in 1..len {
64 let mut score = total_score;
65 if i > 2 {
66 score += self.get_score_uw1(&sentence[ci[i - 3]..ci[i - 2]]);
67 }
68 if i > 1 {
69 score += self.get_score_uw2(&sentence[ci[i - 2]..ci[i - 1]]);
70 }
71 score += self.get_score_uw3(&sentence[ci[i - 1]..ci[i]]);
72
73 if i == len - 1 {
74 score += self.get_score_uw4(&sentence[ci[i]..]);
75 } else {
76 score += self.get_score_uw4(&sentence[ci[i]..ci[i + 1]]);
77 }
78 if i < len - 1 {
79 if i + 1 >= len - 1 {
80 score += self.get_score_uw5(&sentence[ci[i + 1]..]);
81 } else {
82 score += self.get_score_uw5(&sentence[ci[i + 1]..ci[i + 2]]);
83 }
84 }
85 if i < len - 2 {
86 if i + 2 >= len - 1 {
87 score += self.get_score_uw6(&sentence[ci[i + 2]..]);
88 } else {
89 score += self.get_score_uw6(&sentence[ci[i + 2]..ci[i + 3]]);
90 }
91 }
92
93 if i > 1 {
94 score += self.get_score_bw1(&sentence[ci[i - 2]..ci[i]]);
95 }
96 if i >= len - 1 {
97 score += self.get_score_bw2(&sentence[ci[i - 1]..]);
98 } else {
99 score += self.get_score_bw2(&sentence[ci[i - 1]..ci[i + 1]]);
100 }
101 if i < len - 1 {
102 if i >= len - 2 {
103 score += self.get_score_bw3(&sentence[ci[i]..]);
104 } else {
105 score += self.get_score_bw3(&sentence[ci[i]..ci[i + 2]]);
106 }
107 }
108
109 if i > 2 {
110 score += self.get_score_tw1(&sentence[ci[i - 3]..ci[i]]);
111 }
112 if i > 1 && i < len - 1 {
113 score += self.get_score_tw2(&sentence[ci[i - 2]..ci[i + 1]]);
114 }
115 if i < len - 2 {
116 if i + 2 >= len - 1 {
117 score += self.get_score_tw3(&sentence[ci[i - 1]..]);
118 } else {
119 score += self.get_score_tw3(&sentence[ci[i - 1]..ci[i + 2]]);
120 }
121 }
122 if i < len - 3 {
123 if i + 3 >= len - 1 {
124 score += self.get_score_tw4(&sentence[ci[i]..]);
125 } else {
126 score += self.get_score_tw4(&sentence[ci[i]..ci[i + 3]]);
127 }
128 }
129 if score > 0 {
130 chunks.push(&sentence[start..ci[i]]);
131 start = ci[i];
132 }
133 }
134 if start <= ci[len - 1] {
135 chunks.push(&sentence[start..]);
136 }
137 chunks
138 }
139
140 fn get_score_uw1(&self, s: &str) -> i32 {
141 *self.model.uw1.get(s).unwrap_or(&0) as i32
142 }
143 fn get_score_uw2(&self, s: &str) -> i32 {
144 *self.model.uw2.get(s).unwrap_or(&0) as i32
145 }
146 fn get_score_uw3(&self, s: &str) -> i32 {
147 *self.model.uw3.get(s).unwrap_or(&0) as i32
148 }
149 fn get_score_uw4(&self, s: &str) -> i32 {
150 *self.model.uw4.get(s).unwrap_or(&0) as i32
151 }
152 fn get_score_uw5(&self, s: &str) -> i32 {
153 *self.model.uw5.get(s).unwrap_or(&0) as i32
154 }
155 fn get_score_uw6(&self, s: &str) -> i32 {
156 *self.model.uw6.get(s).unwrap_or(&0) as i32
157 }
158 fn get_score_bw1(&self, s: &str) -> i32 {
159 *self.model.bw1.get(s).unwrap_or(&0) as i32
160 }
161 fn get_score_bw2(&self, s: &str) -> i32 {
162 *self.model.bw2.get(s).unwrap_or(&0) as i32
163 }
164 fn get_score_bw3(&self, s: &str) -> i32 {
165 *self.model.bw3.get(s).unwrap_or(&0) as i32
166 }
167 fn get_score_tw1(&self, s: &str) -> i32 {
168 *self.model.tw1.get(s).unwrap_or(&0) as i32
169 }
170 fn get_score_tw2(&self, s: &str) -> i32 {
171 *self.model.tw2.get(s).unwrap_or(&0) as i32
172 }
173 fn get_score_tw3(&self, s: &str) -> i32 {
174 *self.model.tw3.get(s).unwrap_or(&0) as i32
175 }
176 fn get_score_tw4(&self, s: &str) -> i32 {
177 *self.model.tw4.get(s).unwrap_or(&0) as i32
178 }
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Each fixture marks the expected chunk boundaries with `▁`; the markers are removed
    /// to build the input and the parser must reproduce the same split.
    ///
    /// Gated on `ja` because `Parser::japanese_parser` only exists with that feature.
    #[cfg(feature = "ja")]
    #[test]
    fn test_parse() {
        let td = vec![
            "今日は▁とても▁良い▁天気です。",
            "abcdefg の▁使命は、▁世界中の▁情報を▁整理し、▁世界中の▁人が▁アクセスできて▁使えるように▁する▁ことです。",
            "これ以上▁利用する▁場合は▁教えてください。",
            "食器は▁そのまま▁入れて▁大丈夫です。",
            "ダウンロード▁ありがとう▁ございます。",
            "ご利用▁ありがとう▁ございました。",
            "要点を▁まとめる▁必要が▁ある。",
            "目指すのは▁あらゆる▁人に▁便利な▁ソフトウェア",
            "商品が▁まもなく▁到着します。",
            "プロジェクトが▁ようやく▁日の▁目を▁見る。",
            "明け方に▁ようやく▁目覚めると、",
            "明け方▁ようやく▁目覚めると、",
            "これは▁たまたま▁見つけた▁宝物",
            "歩いていて▁たまたま▁目に▁入った▁光景",
            "あなたの▁意図した▁とおりに▁情報を▁伝える。",
            "あの▁イーハトーヴォの▁すきとおった▁風、▁夏でも▁底に▁冷たさを▁もつ▁青い▁そら、▁うつくしい▁森で▁飾られた▁モリーオ市、▁郊外の▁ぎらぎら▁ひかる▁草の▁波。",
            "購入された▁お客様のみ▁入れます。",
            "購入された▁お客様のみ▁入場できます。",
            "パワーのみ▁有効だ",
            "小さな▁つぶや▁空気中の▁ちり",
            "光が▁どんどん▁空▁いっぱいに▁広がる",
            "太陽の▁位置が▁ちがうから",
            "太陽が▁しずむころに▁帰る",
            "多すぎると▁うまく▁いかない",
            "世界の▁子どもの▁命や▁権利",
            "「ふだん▁どおり」を▁保つ",
            "おもちゃや▁遊びに▁使える",
            "コントロールできない▁ほど▁感情移入してしまう",
            "いつも▁甘えがちに▁なる",
            "存在が▁浮かび▁上がった。",
            "雀の▁宿",
        ];
        let parser = Parser::japanese_parser();
        for fixture in td {
            let expected: Vec<_> = fixture.split("▁").collect();
            let input = fixture.replace("▁", "");
            assert_eq!(parser.parse(&input), expected);
        }
    }

    /// Gated on `zh_hans` because the constructor only exists with that feature.
    #[cfg(feature = "zh_hans")]
    #[test]
    fn test_parser_zh_hans() {
        let parser_zh_hans = Parser::simplified_chinese_parser();
        let chunks = parser_zh_hans.parse("今天是晴天。");
        assert_eq!(chunks, vec!["今天", "是", "晴天。"]);
    }

    /// Gated on `zh_hant` because the constructor only exists with that feature.
    #[cfg(feature = "zh_hant")]
    #[test]
    fn test_parser_zh_hant() {
        let parser_zh_hant = Parser::traditional_chinese_parser();
        let chunks = parser_zh_hant.parse("今天是晴天。");
        assert_eq!(chunks, vec!["今天", "是", "晴天。"]);
    }

    /// Gated on `th` because the constructor only exists with that feature.
    #[cfg(feature = "th")]
    #[test]
    fn test_parser_th() {
        let parser_th = Parser::thai_parser();
        let chunks = parser_th.parse("วันนี้อากาศดี");
        assert_eq!(chunks, vec!["วัน", "นี้", "อากาศ", "ดี"]);
    }

    /// A model whose tables are all empty and whose total score is 0 never yields a
    /// positive boundary score, so any input comes back as a single chunk.
    #[test]
    fn test_custom_model() {
        use crate::model::ScoreMap;
        static F: ScoreMap = ::phf::Map {
            key: 0,
            disps: &[],
            entries: &[],
        };
        let model = Model {
            total_score: 0,
            uw1: &F,
            uw2: &F,
            uw3: &F,
            uw4: &F,
            uw5: &F,
            uw6: &F,
            bw1: &F,
            bw2: &F,
            bw3: &F,
            tw1: &F,
            tw2: &F,
            tw3: &F,
            tw4: &F,
        };
        let parser = Parser::new(model);
        let chunks = parser.parse("今日は天気です。");
        assert_eq!(chunks, vec!["今日は天気です。"]);

        // Degenerate inputs: nothing to split.
        assert!(parser.parse("").is_empty());
        assert_eq!(parser.parse("今"), vec!["今"]);
    }
}