// izihawa_tantivy_tokenizer_api/lib.rs
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};

13#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
15pub struct Token {
16 pub offset_from: usize,
19 pub offset_to: usize,
23 pub position: usize,
25 pub text: String,
27 pub position_length: usize,
29}
30
31impl Default for Token {
32 fn default() -> Token {
33 Token {
34 offset_from: 0,
35 offset_to: 0,
36 position: usize::MAX,
37 text: String::new(),
38 position_length: 1,
39 }
40 }
41}
42
43impl Token {
44 pub fn reset(&mut self) {
46 self.offset_from = 0;
47 self.offset_to = 0;
48 self.position = usize::MAX;
49 self.text.clear();
50 self.position_length = 1;
51 }
52}
53
54pub trait Tokenizer: 'static + Clone + Send + Sync {
57 type TokenStream<'a>: TokenStream;
59 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
61}
62
63pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
65
66impl TokenStream for BoxTokenStream<'_> {
67 fn advance(&mut self) -> bool {
68 self.0.advance()
69 }
70
71 fn token(&self) -> &Token {
72 self.0.token()
73 }
74
75 fn token_mut(&mut self) -> &mut Token {
76 self.0.token_mut()
77 }
78}
79
80impl<'a> BoxTokenStream<'a> {
81 pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
82 BoxTokenStream(Box::new(token_stream))
83 }
84}
85
86impl<'a> Deref for BoxTokenStream<'a> {
87 type Target = dyn TokenStream + 'a;
88
89 fn deref(&self) -> &Self::Target {
90 &*self.0
91 }
92}
93impl DerefMut for BoxTokenStream<'_> {
94 fn deref_mut(&mut self) -> &mut Self::Target {
95 &mut *self.0
96 }
97}
98
99impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
100 fn advance(&mut self) -> bool {
101 let token_stream: &mut dyn TokenStream = self.borrow_mut();
102 token_stream.advance()
103 }
104
105 fn token<'b>(&'b self) -> &'b Token {
106 let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
107 token_stream.token()
108 }
109
110 fn token_mut<'b>(&'b mut self) -> &'b mut Token {
111 let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
112 token_stream.token_mut()
113 }
114}
115
116pub trait TokenStream {
120 fn advance(&mut self) -> bool;
124
125 fn token(&self) -> &Token;
127
128 fn token_mut(&mut self) -> &mut Token;
130
131 fn next(&mut self) -> Option<&Token> {
135 if self.advance() {
136 Some(self.token())
137 } else {
138 None
139 }
140 }
141
142 fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
145 while self.advance() {
146 sink(self.token());
147 }
148 }
149}
150
151pub trait TokenFilter: 'static + Send + Sync {
153 type Tokenizer<T: Tokenizer>: Tokenizer;
156 fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
158}
159
#[cfg(test)]
mod test {
    use super::*;

    /// A cloned token must carry the same value in every field.
    #[test]
    fn clone() {
        let t1 = Token {
            position: 1,
            offset_from: 2,
            offset_to: 3,
            text: "abc".to_string(),
            // Deliberately not the default (1), so a clone that fell back to
            // the default value would be caught.
            position_length: 2,
        };
        let t2 = t1.clone();

        assert_eq!(t1.position, t2.position);
        assert_eq!(t1.offset_from, t2.offset_from);
        assert_eq!(t1.offset_to, t2.offset_to);
        assert_eq!(t1.text, t2.text);
        assert_eq!(t1.position_length, t2.position_length);
        // Whole-struct comparison also covers any field added later.
        assert_eq!(t1, t2);
    }

    /// `reset` must bring a modified token back to the `Default` state.
    #[test]
    fn reset_restores_default_state() {
        let mut token = Token {
            position: 7,
            offset_from: 2,
            offset_to: 5,
            text: "abc".to_string(),
            position_length: 3,
        };
        token.reset();

        assert_eq!(token, Token::default());
    }
}