1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
use stop_word::STOP_WORDS;
use tantivy::tokenizer::{
AsciiFoldingFilter, BoxTokenStream, LowerCaser, RemoveLongFilter, StopWordFilter, TextAnalyzer,
Token, TokenStream, Tokenizer,
};
mod chinese;
mod latin;
mod stop_word;
pub mod utils;
/// Registry name under which the meta tokenizer is installed in the tantivy index.
pub const META_TOKENIZER: &str = "meta_tokenizer";
/// Builds the meta text analyzer: script-aware tokenization followed by
/// long-token removal, lowercasing, ASCII folding, and stop-word removal.
///
/// Fix: `LowerCaser` now runs *before* `StopWordFilter`. Stop-word lists are
/// conventionally lowercase, so filtering before lowercasing let capitalized
/// stop words (e.g. "The") slip through into the index.
pub fn get_tokenizer() -> TextAnalyzer {
    TextAnalyzer::from(MetaTokenizer)
        // Drop pathological tokens longer than 20 bytes before further work.
        .filter(RemoveLongFilter::limit(20))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .filter(StopWordFilter::remove(
            STOP_WORDS.iter().map(|&word| word.to_owned()),
        ))
}
/// Script-dispatching tokenizer: routes text to a Chinese or Latin
/// token stream depending on its content (see the `Tokenizer` impl below).
#[derive(Clone)]
pub struct MetaTokenizer;
/// An eagerly materialized token stream: all tokens are produced up front
/// and replayed one at a time through the `TokenStream` trait.
pub struct MetaTokenStream {
    // Pre-computed tokens to replay.
    tokens: Vec<Token>,
    // Cursor; points one *past* the current token (0 = before the first).
    index: usize,
}
impl TokenStream for MetaTokenStream {
    /// Moves the cursor to the next token; returns `false` once exhausted.
    fn advance(&mut self) -> bool {
        let has_next = self.index < self.tokens.len();
        if has_next {
            self.index += 1;
        }
        has_next
    }

    /// Current token. Valid only after `advance()` has returned `true`,
    /// since the cursor sits one past the token being served.
    fn token(&self) -> &Token {
        let current = self.index - 1;
        &self.tokens[current]
    }

    /// Mutable access to the current token; same validity rules as `token()`.
    fn token_mut(&mut self) -> &mut Token {
        let current = self.index - 1;
        &mut self.tokens[current]
    }
}
impl Tokenizer for MetaTokenizer {
    /// Dispatches on the input's script: empty text yields an empty stream,
    /// Chinese text goes to the Chinese tokenizer, everything else to Latin.
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        if text.is_empty() {
            // Empty input: an already-exhausted stream with no tokens.
            BoxTokenStream::from(MetaTokenStream {
                tokens: Vec::new(),
                index: 0,
            })
        } else if utils::is_chinese(text) {
            chinese::token_stream(text)
        } else {
            latin::token_stream(text)
        }
    }
}