summa_core/components/tokenizers/tokenizer.rs

use std::str::CharIndices;

/// Tokenizes text by splitting on whitespace and punctuation, keeping `#` and
/// `+` inside tokens and emitting every CJK ideograph as a separate token.
#[derive(Clone)]
pub struct Tokenizer;

pub struct TokenStream<'a> {
    /// Sorted half-open `[from, to)` byte ranges excluded from tokenization.
    skip_list: Option<Vec<(usize, usize)>>,
    /// Index of the first skip range not yet fully passed.
    skip_iter: usize,
    chars: CharIndices<'a>,
    token: tantivy::tokenizer::Token,
    /// CJK character encountered while a token was still being accumulated;
    /// it is emitted as its own token on the next `advance`.
    stacked_char: Option<(char, usize)>,

    /// Byte offset of the stream's text within the original input; added to
    /// every emitted token offset.
    base_offset: usize,
}

/// Appends `c` to the current token, recording the token's start offset on the
/// first accepted character and extending its end offset on every one.
#[inline]
pub fn accept_char(token: &mut tantivy::tokenizer::Token, c: char, offset: usize) {
    if token.offset_from == usize::MAX {
        token.offset_from = offset;
    }
    token.offset_to = offset + c.len_utf8();
    token.text.push(c);
}

impl<'a> TokenStream<'a> {
    pub fn new(text: &'a str) -> TokenStream<'a> {
        TokenStream {
            skip_list: None,
            skip_iter: 0,
            chars: text.char_indices(),
            token: tantivy::tokenizer::Token::default(),
            stacked_char: None,
            base_offset: 0,
        }
    }

    /// Creates a stream whose emitted offsets are shifted by `offset` and whose
    /// positions continue from `position`, optionally skipping the given byte
    /// ranges of the original text.
    pub fn new_with_offset_and_position(text: &'a str, offset: usize, position: usize, skip_list: Option<Vec<(usize, usize)>>) -> TokenStream<'a> {
        let token = tantivy::tokenizer::Token {
            position,
            ..Default::default()
        };
        TokenStream {
            skip_list,
            skip_iter: 0,
            chars: text.char_indices(),
            token,
            stacked_char: None,
            base_offset: offset,
        }
    }

    pub fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    pub fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

impl tantivy::tokenizer::Tokenizer for Tokenizer {
    type TokenStream<'a> = TokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> TokenStream<'a> {
        TokenStream::new(text)
    }
}

/// Returns true for characters in the CJK Unified Ideographs blocks
/// (the base block U+4E00–U+9FFF and extensions A through D).
#[inline]
fn is_cjk(c: &char) -> bool {
    let c = *c as u32;
    (0x4E00..=0x9FFF).contains(&c)
        || (0x3400..=0x4DBF).contains(&c)
        || (0x20000..=0x2A6DF).contains(&c)
        || (0x2A700..=0x2B73F).contains(&c)
        || (0x2B740..=0x2B81F).contains(&c)
}

impl<'a> tantivy::tokenizer::TokenStream for TokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);
        self.token.offset_from = usize::MAX;

        // A CJK character stashed by the previous call is emitted as its own token.
        if let Some((stacked_char, stacked_offset)) = self.stacked_char.take() {
            accept_char(&mut self.token, stacked_char, self.base_offset + stacked_offset);
            if is_cjk(&stacked_char) {
                return true;
            }
        }

        for (offset, c) in &mut self.chars {
            let real_offset = self.base_offset + offset;
            if let Some(skip_list) = &self.skip_list {
                // Advance past skip ranges that end at or before the current
                // offset, then drop the character if it falls inside the
                // current skip range.
                while self.skip_iter < skip_list.len() && skip_list[self.skip_iter].1 <= real_offset {
                    self.skip_iter += 1;
                }
                if self.skip_iter < skip_list.len() && skip_list[self.skip_iter].0 <= real_offset && real_offset < skip_list[self.skip_iter].1 {
                    continue;
                }
            }

            if is_cjk(&c) {
                // A CJK ideograph always forms a single-character token; if a
                // token is already being accumulated, emit it first and stash
                // the ideograph for the next call.
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                accept_char(&mut self.token, c, real_offset);
                return true;
            } else if c.is_alphanumeric() || c == '#' || c == '+' {
                accept_char(&mut self.token, c, real_offset);
                continue;
            } else if !self.token.text.is_empty() {
                // Any other character terminates the current token.
                break;
            }
        }
        !self.token.text.is_empty()
    }

    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

#[cfg(test)]
pub mod tests {
    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenizerManager};

    use super::Tokenizer;

    pub fn assert_tokenization(tokenizer: &mut TextAnalyzer, text: &str, response: &[Token]) {
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer.token_stream(text).process(&mut add_token);
        }
        assert_eq!(tokens, response);
    }

    #[test]
    fn test_en_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(40)).filter(LowerCaser).build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();

        assert_tokenization(
            &mut tokenizer,
            "Hello, world!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 5,
                    position: 0,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 12,
                    position: 1,
                    text: "world".to_string(),
                    position_length: 1,
                },
            ],
        );
    }
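
    // An illustrative test, not from the original suite, for the `#`/`+`
    // branch of `advance`: both characters are kept inside tokens, which
    // preserves terms such as "c++" and "c#". The input string is made up
    // for demonstration.
    #[test]
    fn test_special_symbols_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(40)).filter(LowerCaser).build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(
            &mut tokenizer,
            "C++ and C#",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "c++".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 4,
                    offset_to: 7,
                    position: 1,
                    text: "and".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 10,
                    position: 2,
                    text: "c#".to_string(),
                    position_length: 1,
                },
            ],
        );
    }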
    #[test]
    fn test_zh_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(40)).filter(LowerCaser).build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(
            &mut tokenizer,
            "在查hello, worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 11,
                    position: 2,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 17,
                    position: 3,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 17,
                    offset_to: 20,
                    position: 4,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 20,
                    offset_to: 21,
                    position: 5,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 24,
                    position: 6,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "在查土d动",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 9,
                    position: 2,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 9,
                    offset_to: 10,
                    position: 3,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 13,
                    position: 4,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Veri 在查hello, c查m p查 查lex  worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "veri".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 5,
                    offset_to: 8,
                    position: 1,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 11,
                    position: 2,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 11,
                    offset_to: 16,
                    position: 3,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 18,
                    offset_to: 19,
                    position: 4,
                    text: "c".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 19,
                    offset_to: 22,
                    position: 5,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 22,
                    offset_to: 23,
                    position: 6,
                    text: "m".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 24,
                    offset_to: 25,
                    position: 7,
                    text: "p".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 25,
                    offset_to: 28,
                    position: 8,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 29,
                    offset_to: 32,
                    position: 9,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 32,
                    offset_to: 35,
                    position: 10,
                    text: "lex".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 37,
                    offset_to: 41,
                    position: 11,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 41,
                    offset_to: 44,
                    position: 12,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 44,
                    offset_to: 45,
                    position: 13,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 45,
                    offset_to: 48,
                    position: 14,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(&mut tokenizer, "。", &[]);
    }
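
    // Illustrative tests below, not from the original suite: they sketch how
    // `TokenStream::new_with_offset_and_position` behaves, driving the stream
    // directly through the `tantivy::tokenizer::TokenStream` trait. The input
    // strings and skip ranges are made up for demonstration.

    #[test]
    fn test_skip_list() {
        use tantivy::tokenizer::TokenStream as _;

        use super::TokenStream;

        // The half-open byte range (5, 11) covers " world", so only "hello"
        // and "x" survive tokenization.
        let mut stream = TokenStream::new_with_offset_and_position("hello world x", 0, 0, Some(vec![(5, 11)]));
        let mut texts = vec![];
        while stream.advance() {
            texts.push(stream.token().text.clone());
        }
        assert_eq!(texts, vec!["hello".to_string(), "x".to_string()]);
    }

    #[test]
    fn test_offset_and_position_continuation() {
        use tantivy::tokenizer::TokenStream as _;

        use super::TokenStream;

        // `advance` pre-increments the position, so the first token lands at
        // `position + 1`; `base_offset` shifts every emitted byte offset.
        let mut stream = TokenStream::new_with_offset_and_position("abc", 100, 9, None);
        assert!(stream.advance());
        assert_eq!(stream.token().offset_from, 100);
        assert_eq!(stream.token().offset_to, 103);
        assert_eq!(stream.token().position, 10);
        assert!(!stream.advance());
    }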
}