use std::str::Chars;

use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

use pinyin::ToPinyin;

#[cfg(feature = "stop_words")]
pub mod stop_words;

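/// A character-level [`Tokenizer`] that converts each Chinese character in
/// the input to its plain (toneless) pinyin spelling, emitting one token
/// per character. Characters without a pinyin reading (punctuation, ASCII,
/// and so on) still produce a token, but with empty text, so token
/// positions stay aligned with character positions in the original text.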
#[derive(Clone)]
pub struct PinyinTokenizer;

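/// The [`TokenStream`] produced by [`PinyinTokenizer`]: walks the input one
/// `char` at a time, tracking byte offsets so that `offset_from` and
/// `offset_to` refer back into the original UTF-8 text.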
pub struct PinyinTokenStream<'a> {
    chars: Chars<'a>,
    offset: usize,
    token: Token,
}

impl Tokenizer for PinyinTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(PinyinTokenStream {
            chars: text.chars(),
            offset: 0,
            token: Token::default(),
        })
    }
}

impl<'a> TokenStream for PinyinTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        // `Token::default()` starts `position` at `usize::MAX`, so the
        // wrapping add yields position 0 for the first token.
        self.token.position = self.token.position.wrapping_add(1);
        if let Some(c) = self.chars.next() {
            // Offsets are byte offsets into the original UTF-8 text.
            let offset_to = self.offset + c.len_utf8();
            self.token.offset_from = self.offset;
            self.token.offset_to = offset_to;
            self.offset = offset_to;
            // Characters without a pinyin reading keep the cleared (empty)
            // token text; the token is still emitted so positions stay
            // aligned with character positions.
            if let Some(pinyin) = c.to_pinyin() {
                self.token.text.push_str(pinyin.plain());
            }
            return true;
        }
        false
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

#[cfg(test)]
mod tests {
    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    use crate::{PinyinTokenStream, PinyinTokenizer};

    #[test]
    fn test_pinyin_tokenizer() {
        let tokens = token_stream_helper("大多数知识,不需要我们记住");
        assert_eq!(tokens.len(), 13);

        assert_token(&tokens[0], 0, "da", 0, 3);
        assert_token(&tokens[1], 1, "duo", 3, 6);
        assert_token(&tokens[2], 2, "shu", 6, 9);
        assert_token(&tokens[3], 3, "zhi", 9, 12);
        assert_token(&tokens[4], 4, "shi", 12, 15);

        // The fullwidth comma has no pinyin reading, so its token text is
        // empty while its position and byte offsets are still recorded.
        assert_token(&tokens[5], 5, "", 15, 18);

        assert_token(&tokens[6], 6, "bu", 18, 21);
    }

    #[test]
    fn test_advance() {
        let text = "知识";
        let mut token_stream = PinyinTokenStream {
            chars: text.chars(),
            offset: 0,
            token: Token::default(),
        };

        assert!(token_stream.advance());
        assert!(token_stream.advance());
        assert!(!token_stream.advance());
    }
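
    // A minimal end-to-end sketch of wiring the tokenizer into tantivy via
    // an in-RAM index. The field name "title" and the tokenizer name
    // "pinyin" are illustrative choices, not part of this crate's API.
    #[test]
    fn test_register_with_index() {
        use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
        use tantivy::Index;

        let indexing = TextFieldIndexing::default()
            .set_tokenizer("pinyin")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let options = TextOptions::default().set_indexing_options(indexing);

        let mut builder = Schema::builder();
        builder.add_text_field("title", options);
        let index = Index::create_in_ram(builder.build());

        // Register under the same name the field's indexing options refer to.
        index.tokenizers().register("pinyin", PinyinTokenizer);
        assert!(index.tokenizers().get("pinyin").is_some());
    }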

    fn token_stream_helper(text: &str) -> Vec<Token> {
        let mut token_stream = PinyinTokenizer.token_stream(text);
        let mut tokens = vec![];
        while token_stream.advance() {
            tokens.push(token_stream.token().clone());
        }
        tokens
    }

    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(
            token.position, position,
            "expected position {} but got {:?}",
            position, token
        );
        assert_eq!(token.text, text, "expected text {} but got {:?}", text, token);
        assert_eq!(
            token.offset_from, from,
            "expected offset_from {} but got {:?}",
            from, token
        );
        assert_eq!(
            token.offset_to, to,
            "expected offset_to {} but got {:?}",
            to, token
        );
    }
}