use std::str::CharIndices;

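/// Tokenizer that emits one token per run of alphanumeric characters (plus
/// `#` and `+`) and one token per CJK character.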
#[derive(Clone)]
pub struct Tokenizer;

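/// Token stream produced by [`Tokenizer`], optionally constrained by a list
/// of byte ranges to skip.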
pub struct TokenStream<'a> {
    /// Half-open `[start, end)` byte ranges (in full-document offsets) whose
    /// characters are skipped while tokenizing.
    skip_list: Option<Vec<(usize, usize)>>,
    /// Index of the first skip range that has not yet been passed.
    skip_iter: usize,
    chars: CharIndices<'a>,
    token: tantivy::tokenizer::Token,
    /// A CJK character read while another token was being built; it is emitted
    /// as its own token on the next call to `advance`.
    stacked_char: Option<(char, usize)>,

    /// Added to every char index so reported offsets refer to the full
    /// document when tokenizing a substring.
    base_offset: usize,
}

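/// Appends `c` to the token being built, recording `offset_from` on the first
/// character and extending `offset_to` past `c`.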
#[inline]
pub fn accept_char(token: &mut tantivy::tokenizer::Token, c: char, offset: usize) {
    if token.offset_from == usize::MAX {
        token.offset_from = offset;
    }
    token.offset_to = offset + c.len_utf8();
    token.text.push(c);
}

impl<'a> TokenStream<'a> {
    pub fn new(text: &'a str) -> TokenStream<'a> {
        TokenStream {
            skip_list: None,
            skip_iter: 0,
            chars: text.char_indices(),
            token: tantivy::tokenizer::Token::default(),
            stacked_char: None,
            base_offset: 0,
        }
    }

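    /// Creates a stream for a slice of a larger document: `offset` is added to
    /// every reported byte offset, `position` seeds the position counter, and
    /// `skip_list` holds half-open `[start, end)` byte ranges (in full-document
    /// offsets) whose characters are ignored.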
    pub fn new_with_offset_and_position(text: &'a str, offset: usize, position: usize, skip_list: Option<Vec<(usize, usize)>>) -> TokenStream<'a> {
        let token = tantivy::tokenizer::Token {
            position,
            ..Default::default()
        };
        TokenStream {
            skip_list,
            skip_iter: 0,
            chars: text.char_indices(),
            token,
            stacked_char: None,
            base_offset: offset,
        }
    }

    pub fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    pub fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

impl tantivy::tokenizer::Tokenizer for Tokenizer {
    type TokenStream<'a> = TokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> TokenStream<'a> {
        TokenStream::new(text)
    }
}

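/// Returns true if `c` lies in the CJK Unified Ideographs block or in
/// Extensions A through D.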
#[inline]
fn is_cjk(c: &char) -> bool {
    (0x4e00 <= *c as u32 && *c as u32 <= 0x9FFF)
        || (0x3400 <= *c as u32 && *c as u32 <= 0x4DBF)
        || (0x20000 <= *c as u32 && *c as u32 <= 0x2A6DF)
        || (0x2A700 <= *c as u32 && *c as u32 <= 0x2B73F)
        || (0x2B740 <= *c as u32 && *c as u32 <= 0x2B81F)
}

impl<'a> tantivy::tokenizer::TokenStream for TokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);
        self.token.offset_from = usize::MAX;

        // A CJK character read while the previous token was still being built
        // becomes its own token now.
        if let Some((stacked_char, stacked_offset)) = self.stacked_char.take() {
            accept_char(&mut self.token, stacked_char, self.base_offset + stacked_offset);
            if is_cjk(&stacked_char) {
                return true;
            }
        }

        for (offset, c) in &mut self.chars {
            let real_offset = self.base_offset + offset;
            if let Some(skip_list) = &self.skip_list {
                // Drop skip ranges that end at or before the current offset, then
                // ignore the character if it falls inside the next range.
                while self.skip_iter < skip_list.len() && skip_list[self.skip_iter].1 <= real_offset {
                    self.skip_iter += 1;
                }
                if self.skip_iter < skip_list.len() && skip_list[self.skip_iter].0 <= real_offset && real_offset < skip_list[self.skip_iter].1 {
                    continue;
                }
            }

            if is_cjk(&c) {
                // CJK characters are emitted one per token; if a token is already
                // in progress, finish it and keep this character for the next call.
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                accept_char(&mut self.token, c, real_offset);
                return true;
            } else if c.is_alphanumeric() || c == '#' || c == '+' {
                accept_char(&mut self.token, c, real_offset);
                continue;
            } else if !self.token.text.is_empty() {
                // Any other character ends the token currently being built.
                break;
            }
        }
        !self.token.text.is_empty()
    }

    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

#[cfg(test)]
pub mod tests {
    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenizerManager};

    use super::Tokenizer;

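    /// Runs `tokenizer` over `text` and asserts that it produces exactly the
    /// tokens in `response`.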
    pub fn assert_tokenization(tokenizer: &mut TextAnalyzer, text: &str, response: &[Token]) {
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer.token_stream(text).process(&mut add_token);
        }
        assert_eq!(tokens, response);
    }

    #[test]
    fn test_en_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(40)).filter(LowerCaser).build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();

        assert_tokenization(
            &mut tokenizer,
            "Hello, world!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 5,
                    position: 0,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 12,
                    position: 1,
                    text: "world".to_string(),
                    position_length: 1,
                },
            ],
        );
    }

    #[test]
    fn test_zh_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(40)).filter(LowerCaser).build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(
            &mut tokenizer,
            "在查hello, worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 11,
                    position: 2,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 17,
                    position: 3,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 17,
                    offset_to: 20,
                    position: 4,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 20,
                    offset_to: 21,
                    position: 5,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 24,
                    position: 6,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "在查土d动",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 9,
                    position: 2,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 9,
                    offset_to: 10,
                    position: 3,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 13,
                    position: 4,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
283 "Veri 在查hello, c查m p查 查lex worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "veri".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 5,
                    offset_to: 8,
                    position: 1,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 11,
                    position: 2,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 11,
                    offset_to: 16,
                    position: 3,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 18,
                    offset_to: 19,
                    position: 4,
                    text: "c".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 19,
                    offset_to: 22,
                    position: 5,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 22,
                    offset_to: 23,
                    position: 6,
                    text: "m".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 24,
                    offset_to: 25,
                    position: 7,
                    text: "p".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 25,
                    offset_to: 28,
                    position: 8,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 29,
                    offset_to: 32,
                    position: 9,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 32,
                    offset_to: 35,
                    position: 10,
                    text: "lex".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 37,
                    offset_to: 41,
                    position: 11,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 41,
                    offset_to: 44,
                    position: 12,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 44,
                    offset_to: 45,
                    position: 13,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 45,
                    offset_to: 48,
                    position: 14,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(&mut tokenizer, "。", &[]);
    }
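
    // A minimal sketch (not part of the original suite) of how the skip_list
    // argument of `TokenStream::new_with_offset_and_position` behaves: a
    // character inside a half-open `[start, end)` byte range is dropped
    // without terminating the token being built, so the runs on either side
    // of the skipped space merge into one token.
    #[test]
    fn test_skip_list_sketch() {
        use tantivy::tokenizer::TokenStream as _;

        // Skip the space at byte 5; `usize::MAX` seeds the position counter so
        // the first emitted token has position 0, mirroring `TokenStream::new`.
        let mut stream = super::TokenStream::new_with_offset_and_position("hello world", 0, usize::MAX, Some(vec![(5, 6)]));
        assert!(stream.advance());
        assert_eq!(stream.token().text, "helloworld");
        assert_eq!(stream.token().offset_from, 0);
        assert_eq!(stream.token().offset_to, 11);
        assert!(!stream.advance());
    }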
}