summavy_tokenizer_api/
lib.rs1use std::borrow::{Borrow, BorrowMut};
9use std::ops::{Deref, DerefMut};
10
11use serde::{Deserialize, Serialize};
12
13#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
15pub struct Token {
16 pub offset_from: usize,
19 pub offset_to: usize,
23 pub position: usize,
25 pub text: String,
27 pub position_length: usize,
29}
30
impl Default for Token {
    /// Builds a blank token: both offsets are `0`, `position` is `usize::MAX`,
    /// `position_length` is `1`, and `text` is an empty string whose buffer is
    /// pre-allocated with room for 200 bytes.
    fn default() -> Self {
        Self {
            offset_from: 0,
            offset_to: 0,
            // NOTE(review): `usize::MAX` looks like a "not positioned yet" marker
            // (a wrapping increment would land on 0) — confirm against callers.
            position: usize::MAX,
            // Reserve up front so typical tokens can be written without reallocating.
            text: String::with_capacity(200),
            position_length: 1,
        }
    }
}
42
43pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
50 fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
52}
53
54pub trait TokenizerClone {
55 fn box_clone(&self) -> Box<dyn Tokenizer>;
56}
57
58impl<T: Tokenizer + Clone> TokenizerClone for T {
59 fn box_clone(&self) -> Box<dyn Tokenizer> {
60 Box::new(self.clone())
61 }
62}
63
64pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
68
69impl<'a, T> From<T> for BoxTokenStream<'a>
70where T: TokenStream + 'a
71{
72 fn from(token_stream: T) -> BoxTokenStream<'a> {
73 BoxTokenStream(Box::new(token_stream))
74 }
75}
76
77impl<'a> Deref for BoxTokenStream<'a> {
78 type Target = dyn TokenStream + 'a;
79
80 fn deref(&self) -> &Self::Target {
81 &*self.0
82 }
83}
84impl<'a> DerefMut for BoxTokenStream<'a> {
85 fn deref_mut(&mut self) -> &mut Self::Target {
86 &mut *self.0
87 }
88}
89
90impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
91 fn advance(&mut self) -> bool {
92 let token_stream: &mut dyn TokenStream = self.borrow_mut();
93 token_stream.advance()
94 }
95
96 fn token<'b>(&'b self) -> &'b Token {
97 let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
98 token_stream.token()
99 }
100
101 fn token_mut<'b>(&'b mut self) -> &'b mut Token {
102 let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
103 token_stream.token_mut()
104 }
105}
106
107pub trait TokenStream {
111 fn advance(&mut self) -> bool;
115
116 fn token(&self) -> &Token;
118
119 fn token_mut(&mut self) -> &mut Token;
121
122 fn next(&mut self) -> Option<&Token> {
126 if self.advance() {
127 Some(self.token())
128 } else {
129 None
130 }
131 }
132
133 fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
136 while self.advance() {
137 sink(self.token());
138 }
139 }
140}
141
142pub struct BoxTokenFilter(Box<dyn TokenFilter>);
146
147impl Deref for BoxTokenFilter {
148 type Target = dyn TokenFilter;
149
150 fn deref(&self) -> &dyn TokenFilter {
151 &*self.0
152 }
153}
154
155impl<T: TokenFilter> From<T> for BoxTokenFilter {
156 fn from(tokenizer: T) -> BoxTokenFilter {
157 BoxTokenFilter(Box::new(tokenizer))
158 }
159}
160
161pub trait TokenFilterClone {
162 fn box_clone(&self) -> BoxTokenFilter;
163}
164
165pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
167 fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
169}
170
171impl<T: TokenFilter + Clone> TokenFilterClone for T {
172 fn box_clone(&self) -> BoxTokenFilter {
173 BoxTokenFilter::from(self.clone())
174 }
175}
176
#[cfg(test)]
mod test {
    use super::*;

    /// Cloning a `Token` must reproduce every field.
    #[test]
    fn clone() {
        // `position_length` is deliberately different from the default value (1) so a
        // clone that reset it would be detected.
        let t1 = Token {
            position: 1,
            offset_from: 2,
            offset_to: 3,
            text: "abc".to_string(),
            position_length: 4,
        };
        let t2 = t1.clone();

        assert_eq!(t1.position, t2.position);
        assert_eq!(t1.offset_from, t2.offset_from);
        assert_eq!(t1.offset_to, t2.offset_to);
        assert_eq!(t1.text, t2.text);
        // Previously unchecked: a regression on this field went unnoticed.
        assert_eq!(t1.position_length, t2.position_length);
        // Whole-struct comparison also covers any field added to `Token` later.
        assert_eq!(t1, t2);
    }
}