use super::{Tokenizer, TokenizerType};
use crate::{
    helper::{ByReference, ByValue},
    BaseTokenizer, MutTokenizer,
};
use derive_more::Display;
use hashbrown::HashMap;
use itertools::Itertools;

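/// Splits a string into its individual characters.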
#[derive(Display)]
#[display(fmt = "CharacterTokenizer")]
pub struct CharacterTokenizer;

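/// Splits a string into whitespace-separated string tokens.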
#[derive(Display)]
#[display(fmt = "WhitespaceTokenizer")]
pub struct WhitespaceTokenizer;

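/// Splits a string into whitespace-separated tokens, where each token is
/// itself a sequence of characters.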
#[derive(Display)]
#[display(fmt = "WhitespaceCharSeqTokenizer")]
pub struct WhitespaceCharSeqTokenizer;

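/// A whitespace tokenizer that memoizes its results and hands out references
/// into its internal caches (hence `Return = ByReference`): `cache` holds
/// plain tokenizations and `unique_cache` holds deduplicated ones.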
pub struct CachedWhitespaceTokenizer {
    cache: HashMap<String, Vec<String>>,
    unique_cache: HashMap<String, Vec<String>>,
}
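
impl CachedWhitespaceTokenizer {
    /// Creates a tokenizer with empty caches.
    pub fn new() -> Self {
        Self {
            cache: HashMap::new(),
            unique_cache: HashMap::new(),
        }
    }
}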

impl BaseTokenizer<Vec<char>> for CharacterTokenizer {
    type Return = ByValue;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
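        // Character tokens are flat, so only non-nested tokenizer types apply.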
        !tok_type.has_nested()
    }
}

impl Tokenizer<Vec<char>> for CharacterTokenizer {
    fn tokenize<'t>(&'t self, s: &str) -> Vec<char> {
        s.chars().collect()
    }

    fn tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<char>, Vec<char>) {
        (key.chars().collect(), query.chars().collect())
    }

    fn unique_tokenize<'t>(&'t self, s: &str) -> Vec<char> {
        s.chars().unique().collect()
    }

    fn unique_tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<char>, Vec<char>) {
        (self.unique_tokenize(key), self.unique_tokenize(query))
    }
}

impl BaseTokenizer<Vec<String>> for WhitespaceTokenizer {
    type Return = ByValue;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
        !tok_type.has_nested()
    }
}

impl Tokenizer<Vec<String>> for WhitespaceTokenizer {
    fn tokenize<'t>(&'t self, s: &str) -> Vec<String> {
        s.split_whitespace().map(|s| s.to_owned()).collect()
    }

    fn tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<String>, Vec<String>) {
        (
            key.split_whitespace().map(|s| s.to_owned()).collect(),
            query.split_whitespace().map(|s| s.to_owned()).collect(),
        )
    }

    fn unique_tokenize<'t>(&'t self, s: &str) -> Vec<String> {
        s.split_whitespace()
            .unique()
            .map(|s| s.to_owned())
            .collect()
    }

    fn unique_tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<String>, Vec<String>) {
        let key_tokens: Vec<_> = key
            .split_whitespace()
            .unique()
            .map(|s| s.to_owned())
            .collect();
        let query_tokens: Vec<_> = query
            .split_whitespace()
            .unique()
            .map(|s| s.to_owned())
            .collect();

        (key_tokens, query_tokens)
    }
}

impl BaseTokenizer<Vec<Vec<char>>> for WhitespaceCharSeqTokenizer {
    type Return = ByValue;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
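        // Tokens here are themselves sequences, so the nested tokenizer type
        // must treat its input as an outer sequence.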
        if let Some(nested_tok_type) = tok_type.get_nested() {
            nested_tok_type.is_outer_seq()
        } else {
            false
        }
    }
}

impl Tokenizer<Vec<Vec<char>>> for WhitespaceCharSeqTokenizer {
    fn tokenize<'t>(&'t self, s: &str) -> Vec<Vec<char>> {
        s.split_whitespace().map(|s| s.chars().collect()).collect()
    }

    fn tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<Vec<char>>, Vec<Vec<char>>) {
        (
            key.split_whitespace()
                .map(|s| s.chars().collect())
                .collect(),
            query
                .split_whitespace()
                .map(|s| s.chars().collect())
                .collect(),
        )
    }

    fn unique_tokenize<'t>(&'t self, s: &str) -> Vec<Vec<char>> {
        s.split_whitespace()
            .unique()
            .map(|s| s.chars().collect())
            .collect()
    }

    fn unique_tokenize_pair<'t>(
        &'t self,
        key: &str,
        query: &str,
    ) -> (Vec<Vec<char>>, Vec<Vec<char>>) {
        let key_tokens: Vec<_> = key
            .split_whitespace()
            .unique()
            .map(|s| s.chars().collect())
            .collect();
        let query_tokens: Vec<_> = query
            .split_whitespace()
            .unique()
            .map(|s| s.chars().collect())
            .collect();

        (key_tokens, query_tokens)
    }
}

impl BaseTokenizer<Vec<String>> for CachedWhitespaceTokenizer {
    type Return = ByReference;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
        !tok_type.has_nested()
    }
}

impl MutTokenizer<Vec<String>> for CachedWhitespaceTokenizer {
    fn tokenize<'t>(&'t mut self, s: &str) -> &'t Vec<String> {
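        // Tokenize on the first request and memoize; later calls for the same
        // string return the cached Vec.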
        if !self.cache.contains_key(s) {
            self.cache.insert(
                s.to_owned(),
                s.split_whitespace().map(|s| s.to_owned()).collect(),
            );
        }

        self.cache.get(s).unwrap()
    }

    fn tokenize_pair<'t>(
        &'t mut self,
        key: &str,
        query: &str,
    ) -> (&'t Vec<String>, &'t Vec<String>) {
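        // Fill both cache entries before taking shared references, so the
        // mutable borrows end before the two returned borrows begin.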
        if !self.cache.contains_key(key) {
            self.cache.insert(
                key.to_owned(),
                key.split_whitespace().map(|s| s.to_owned()).collect(),
            );
        }

        if !self.cache.contains_key(query) {
            self.cache.insert(
                query.to_owned(),
                query.split_whitespace().map(|s| s.to_owned()).collect(),
            );
        }

        (self.cache.get(key).unwrap(), self.cache.get(query).unwrap())
    }

    fn unique_tokenize<'t>(&'t mut self, s: &str) -> &'t Vec<String> {
        if !self.unique_cache.contains_key(s) {
            self.unique_cache.insert(
                s.to_owned(),
                s.split_whitespace()
                    .unique()
                    .map(|s| s.to_owned())
                    .collect(),
            );
        }

        self.unique_cache.get(s).unwrap()
    }

    fn unique_tokenize_pair<'t>(
        &'t mut self,
        key: &str,
        query: &str,
    ) -> (&'t Vec<String>, &'t Vec<String>) {
        if !self.unique_cache.contains_key(key) {
            self.unique_cache.insert(
                key.to_owned(),
                key.split_whitespace()
                    .unique()
                    .map(|s| s.to_owned())
                    .collect(),
            );
        }

        if !self.unique_cache.contains_key(query) {
            self.unique_cache.insert(
                query.to_owned(),
                query
                    .split_whitespace()
                    .unique()
                    .map(|s| s.to_owned())
                    .collect(),
            );
        }

        (
            self.unique_cache.get(key).unwrap(),
            self.unique_cache.get(query).unwrap(),
        )
    }
}