tantivy_pinyin/lib.rs

use std::str::Chars;

use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

use pinyin::ToPinyin;

// Optional stop-word support, enabled with the `stop_words` feature.
#[cfg(feature = "stop_words")]
pub mod stop_words;

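/// Emits one token per input character, converting each Chinese
/// character to its plain (tone-less) pinyin via the `pinyin` crate.
///
/// A minimal registration sketch (assuming tantivy's `Index` and
/// `TokenizerManager` APIs; the schema and the field name `title` are
/// illustrative only):
///
/// ```no_run
/// use tantivy::schema::{Schema, TextFieldIndexing, TextOptions};
/// use tantivy::Index;
/// use tantivy_pinyin::PinyinTokenizer;
///
/// let mut schema_builder = Schema::builder();
/// let indexing = TextFieldIndexing::default().set_tokenizer("pinyin");
/// let options = TextOptions::default().set_indexing_options(indexing);
/// schema_builder.add_text_field("title", options);
/// let index = Index::create_in_ram(schema_builder.build());
/// index.tokenizers().register("pinyin", PinyinTokenizer);
/// ```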
#[derive(Clone)]
pub struct PinyinTokenizer;

pub struct PinyinTokenStream<'a> {
    /// Remaining characters of the input text.
    chars: Chars<'a>,
    /// Byte offset of the next character in the source text.
    offset: usize,
    /// Scratch token, reused across `advance` calls.
    token: Token,
}

impl Tokenizer for PinyinTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(PinyinTokenStream {
            chars: text.chars(),
            offset: 0,
            token: Token::default(),
        })
    }
}

impl<'a> TokenStream for PinyinTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        // `Token::default()` starts `position` at `usize::MAX`, so the
        // first `wrapping_add(1)` wraps it to 0.
        self.token.position = self.token.position.wrapping_add(1);
        // Emit exactly one token per input character. Offsets are byte
        // offsets into the original text, so a CJK character spans 3
        // bytes in UTF-8.
        if let Some(c) = self.chars.next() {
            let offset_to = self.offset + c.len_utf8();
            self.token.offset_from = self.offset;
            self.token.offset_to = offset_to;
            self.offset = offset_to;
            if let Some(pinyin) = c.to_pinyin() {
                // Plain form: pinyin without tone marks, e.g. "zhi" for 知.
                self.token.text.push_str(pinyin.plain());
            }
            // Characters with no pinyin reading (punctuation, Latin
            // letters, digits) still yield a token with empty text.
            return true;
        }
        false
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

#[cfg(test)]
mod tests {
    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    use crate::{PinyinTokenStream, PinyinTokenizer};

    #[test]
    fn test_pinyin_tokenizer() {
        // One token per character; offsets are byte offsets, and every
        // character here (including the full-width comma) is 3 bytes.
        let tokens = token_stream_helper("大多数知识，不需要我们记住");
        assert_eq!(tokens.len(), 13);

        assert_token(&tokens[0], 0, "da", 0, 3);
        assert_token(&tokens[1], 1, "duo", 3, 6);
        assert_token(&tokens[2], 2, "shu", 6, 9);
        assert_token(&tokens[3], 3, "zhi", 9, 12);
        assert_token(&tokens[4], 4, "shi", 12, 15);

        // The full-width comma has no pinyin reading: the token text is
        // empty, but the offsets still advance.
        assert_token(&tokens[5], 5, "", 15, 18);

        assert_token(&tokens[6], 6, "bu", 18, 21);
    }

    #[test]
    fn test_advance() {
        let text = "知识";
        let mut token_stream = PinyinTokenStream {
            chars: text.chars(),
            offset: 0,
            token: Token::default(),
        };

        assert!(token_stream.advance());
        assert!(token_stream.advance());
        assert!(!token_stream.advance());
    }
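
    // Illustrative extra check: characters without a pinyin reading
    // (here the ASCII letter 'a') still produce a token with empty text,
    // and byte offsets reflect each character's UTF-8 width.
    #[test]
    fn test_non_pinyin_chars_yield_empty_tokens() {
        let tokens = token_stream_helper("a知");
        assert_eq!(tokens.len(), 2);
        // 'a' has no pinyin: empty text, 1-byte offset span.
        assert_token(&tokens[0], 0, "", 0, 1);
        // '知' converts to plain pinyin "zhi", 3-byte offset span.
        assert_token(&tokens[1], 1, "zhi", 1, 4);
    }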

    fn token_stream_helper(text: &str) -> Vec<Token> {
        let mut token_stream = PinyinTokenizer.token_stream(text);
        let mut tokens = vec![];
        while token_stream.advance() {
            tokens.push(token_stream.token().clone());
        }
        tokens
    }

    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(
            token.position, position,
            "expected position {} but got {:?}",
            position, token
        );
        assert_eq!(token.text, text, "expected text {} but got {:?}", text, token);
        assert_eq!(
            token.offset_from, from,
            "expected offset_from {} but got {:?}",
            from, token
        );
        assert_eq!(
            token.offset_to, to,
            "expected offset_to {} but got {:?}",
            to, token
        );
    }
}