1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
//! Tantivy Meta Tokenizer
//! This is copied and modified from https://github.com/jiegec/tantivy-jieba/blob/master/src/lib.rs

use stop_word::STOP_WORDS;
use tantivy::tokenizer::{
    AsciiFoldingFilter, BoxTokenStream, LowerCaser, RemoveLongFilter, StopWordFilter, TextAnalyzer,
    Token, TokenStream, Tokenizer,
};
mod chinese;
mod latin;
mod stop_word;
pub mod utils;

/// String identifier for the meta tokenizer.
///
/// NOTE(review): presumably the name under which the analyzer returned by
/// [`get_tokenizer`] is registered with an index — confirm against callers.
pub const META_TOKENIZER: &str = "meta_tokenizer";

/// Builds the text analyzer used for meta content.
///
/// The analyzer wraps [`MetaTokenizer`] and then applies, in this order:
///
/// 1. `RemoveLongFilter` with a limit of 20,
/// 2. `AsciiFoldingFilter`,
/// 3. `StopWordFilter` built from every entry of `STOP_WORDS`,
/// 4. `LowerCaser`.
///
/// NOTE(review): stop words are filtered *before* lowercasing, so the match is
/// performed on whatever casing the tokenizer emitted — confirm this ordering
/// is intended (or that the sub-tokenizers already lowercase their output).
pub fn get_tokenizer() -> TextAnalyzer {
    // Each stop word is copied into an owned `String` for the filter.
    let stop_words = STOP_WORDS.iter().map(|&entry| entry.to_owned());

    let drop_long_tokens = RemoveLongFilter::limit(20);
    let drop_stop_words = StopWordFilter::remove(stop_words);

    TextAnalyzer::from(MetaTokenizer)
        .filter(drop_long_tokens)
        .filter(AsciiFoldingFilter)
        .filter(drop_stop_words)
        .filter(LowerCaser)
}

/// Stateless tokenizer entry point.
///
/// Its `Tokenizer` implementation returns an empty stream for empty input and
/// otherwise delegates to `chinese::token_stream` when `utils::is_chinese`
/// reports true for the text, or to `latin::token_stream` when it does not.
#[derive(Clone)]
pub struct MetaTokenizer;

/// Token stream backed by a pre-built vector of tokens.
pub struct MetaTokenStream {
    // All tokens of the stream, in the order they are yielded.
    tokens: Vec<Token>,
    // Number of tokens already stepped over by `advance`; the current token
    // lives at `index - 1`.
    index: usize,
}

/// Cursor-style iteration over the buffered `tokens`.
impl TokenStream for MetaTokenStream {
    /// Moves the cursor to the next token.
    ///
    /// Returns `true` when a token is available afterwards, `false` once every
    /// buffered token has been consumed (the cursor is then left untouched).
    fn advance(&mut self) -> bool {
        let has_next = self.index < self.tokens.len();
        if has_next {
            self.index += 1;
        }
        has_next
    }

    /// Borrows the current token.
    ///
    /// # Panics
    ///
    /// Panics when `index` is 0, i.e. before `advance` has returned `true`.
    fn token(&self) -> &Token {
        // `index` counts consumed tokens, so the current one sits just before it.
        let current = self.index - 1;
        &self.tokens[current]
    }

    /// Mutably borrows the current token.
    ///
    /// # Panics
    ///
    /// Panics when `index` is 0, i.e. before `advance` has returned `true`.
    fn token_mut(&mut self) -> &mut Token {
        let current = self.index - 1;
        &mut self.tokens[current]
    }
}

impl Tokenizer for MetaTokenizer {
    /// Produces the token stream for `text`.
    ///
    /// * Empty input yields an empty [`MetaTokenStream`].
    /// * Input for which `utils::is_chinese` returns true is handed to
    ///   `chinese::token_stream`.
    /// * Everything else is handed to `latin::token_stream`.
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        if text.is_empty() {
            // Nothing to tokenize: return a stream that never advances.
            let empty_stream = MetaTokenStream {
                tokens: Vec::new(),
                index: 0,
            };
            BoxTokenStream::from(empty_stream)
        } else if utils::is_chinese(text) {
            chinese::token_stream(text)
        } else {
            latin::token_stream(text)
        }
    }
}