// yass/tokenizers.rs

use super::{Tokenizer, TokenizerType};
use crate::{
    helper::{ByReference, ByValue},
    BaseTokenizer, MutTokenizer,
};
use derive_more::Display;
use hashbrown::HashMap;
use itertools::Itertools;

/// Tokenizes a string into its individual characters.
#[derive(Display)]
#[display(fmt = "CharacterTokenizer")]
pub struct CharacterTokenizer;

/// Tokenizes a string into whitespace-separated words.
#[derive(Display)]
#[display(fmt = "WhitespaceTokenizer")]
pub struct WhitespaceTokenizer;

/// Tokenizes a string into whitespace-separated words, each word represented
/// as a sequence of characters.
#[derive(Display)]
#[display(fmt = "WhitespaceCharSeqTokenizer")]
pub struct WhitespaceCharSeqTokenizer;

/// A whitespace tokenizer that memoizes its results and returns tokens by
/// reference; full and unique token lists are cached separately.
pub struct CachedWhitespaceTokenizer {
    cache: HashMap<String, Vec<String>>,
    unique_cache: HashMap<String, Vec<String>>,
}
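
// A minimal convenience constructor; a sketch, assuming the crate does not
// already provide a way to build a CachedWhitespaceTokenizer elsewhere.
impl CachedWhitespaceTokenizer {
    pub fn new() -> Self {
        Self {
            cache: HashMap::new(),
            unique_cache: HashMap::new(),
        }
    }
}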

impl BaseTokenizer<Vec<char>> for CharacterTokenizer {
    type Return = ByValue;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
        // Character tokens are flat, so nested tokenizer types are unsupported.
        !tok_type.has_nested()
    }
}

impl Tokenizer<Vec<char>> for CharacterTokenizer {
    fn tokenize<'t>(&'t self, s: &str) -> Vec<char> {
        s.chars().collect()
    }

    fn tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<char>, Vec<char>) {
        (key.chars().collect(), query.chars().collect())
    }

    fn unique_tokenize<'t>(&'t self, s: &str) -> Vec<char> {
        // `Itertools::unique` keeps the first occurrence of each character.
        s.chars().unique().collect()
    }

    fn unique_tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<char>, Vec<char>) {
        (self.unique_tokenize(key), self.unique_tokenize(query))
    }
}
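
// A sanity-check sketch for CharacterTokenizer; the expected outputs assume
// `Itertools::unique` keeps first occurrences in order, which it does.
#[cfg(test)]
mod character_tokenizer_tests {
    use super::*;

    #[test]
    fn tokenize_splits_into_chars() {
        let tok = CharacterTokenizer;
        assert_eq!(tok.tokenize("aba"), vec!['a', 'b', 'a']);
        assert_eq!(tok.unique_tokenize("aba"), vec!['a', 'b']);
    }
}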

impl BaseTokenizer<Vec<String>> for WhitespaceTokenizer {
    type Return = ByValue;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
        !tok_type.has_nested()
    }
}

impl Tokenizer<Vec<String>> for WhitespaceTokenizer {
    fn tokenize<'t>(&'t self, s: &str) -> Vec<String> {
        s.split_whitespace().map(|s| s.to_owned()).collect()
    }

    fn tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<String>, Vec<String>) {
        (
            key.split_whitespace().map(|s| s.to_owned()).collect(),
            query.split_whitespace().map(|s| s.to_owned()).collect(),
        )
    }

    fn unique_tokenize<'t>(&'t self, s: &str) -> Vec<String> {
        s.split_whitespace()
            .unique()
            .map(|s| s.to_owned())
            .collect()
    }

    fn unique_tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<String>, Vec<String>) {
        let key_tokens: Vec<_> = key
            .split_whitespace()
            .unique()
            .map(|s| s.to_owned())
            .collect();
        let query_tokens: Vec<_> = query
            .split_whitespace()
            .unique()
            .map(|s| s.to_owned())
            .collect();

        (key_tokens, query_tokens)
    }
}
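
// A sanity-check sketch for WhitespaceTokenizer; assumes runs of whitespace
// collapse to a single separator, as `split_whitespace` guarantees.
#[cfg(test)]
mod whitespace_tokenizer_tests {
    use super::*;

    #[test]
    fn tokenize_splits_on_whitespace() {
        let tok = WhitespaceTokenizer;
        assert_eq!(tok.tokenize("foo  bar foo"), vec!["foo", "bar", "foo"]);
        assert_eq!(tok.unique_tokenize("foo  bar foo"), vec!["foo", "bar"]);
    }
}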

impl BaseTokenizer<Vec<Vec<char>>> for WhitespaceCharSeqTokenizer {
    type Return = ByValue;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
        // Tokens are character sequences, so the tokenizer type must have a
        // nested type that is itself an outer sequence.
        if let Some(nested_tok_type) = tok_type.get_nested() {
            nested_tok_type.is_outer_seq()
        } else {
            false
        }
    }
}

impl Tokenizer<Vec<Vec<char>>> for WhitespaceCharSeqTokenizer {
    fn tokenize<'t>(&'t self, s: &str) -> Vec<Vec<char>> {
        s.split_whitespace().map(|s| s.chars().collect()).collect()
    }

    fn tokenize_pair<'t>(&'t self, key: &str, query: &str) -> (Vec<Vec<char>>, Vec<Vec<char>>) {
        (
            key.split_whitespace()
                .map(|s| s.chars().collect())
                .collect(),
            query
                .split_whitespace()
                .map(|s| s.chars().collect())
                .collect(),
        )
    }

    fn unique_tokenize<'t>(&'t self, s: &str) -> Vec<Vec<char>> {
        // Deduplicates whole words before converting them to char sequences.
        s.split_whitespace()
            .unique()
            .map(|s| s.chars().collect())
            .collect()
    }

    fn unique_tokenize_pair<'t>(
        &'t self,
        key: &str,
        query: &str,
    ) -> (Vec<Vec<char>>, Vec<Vec<char>>) {
        let key_tokens: Vec<_> = key
            .split_whitespace()
            .unique()
            .map(|s| s.chars().collect())
            .collect();
        let query_tokens: Vec<_> = query
            .split_whitespace()
            .unique()
            .map(|s| s.chars().collect())
            .collect();

        (key_tokens, query_tokens)
    }
}
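
// A sanity-check sketch for WhitespaceCharSeqTokenizer: words are split on
// whitespace, then each word becomes its sequence of characters.
#[cfg(test)]
mod whitespace_char_seq_tokenizer_tests {
    use super::*;

    #[test]
    fn tokenize_splits_words_into_char_seqs() {
        let tok = WhitespaceCharSeqTokenizer;
        assert_eq!(tok.tokenize("ab cd"), vec![vec!['a', 'b'], vec!['c', 'd']]);
        assert_eq!(tok.unique_tokenize("ab ab"), vec![vec!['a', 'b']]);
    }
}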

impl BaseTokenizer<Vec<String>> for CachedWhitespaceTokenizer {
    // Tokens are served out of the cache, so they are returned by reference.
    type Return = ByReference;

    fn is_compatible(&self, tok_type: &TokenizerType) -> bool {
        !tok_type.has_nested()
    }
}

impl MutTokenizer<Vec<String>> for CachedWhitespaceTokenizer {
    fn tokenize<'t>(&'t mut self, s: &str) -> &'t Vec<String> {
        // Check-then-insert avoids cloning the key string on cache hits.
        if !self.cache.contains_key(s) {
            self.cache.insert(
                s.to_owned(),
                s.split_whitespace().map(|s| s.to_owned()).collect(),
            );
        }

        self.cache.get(s).unwrap()
    }

    fn tokenize_pair<'t>(
        &'t mut self,
        key: &str,
        query: &str,
    ) -> (&'t Vec<String>, &'t Vec<String>) {
        if !self.cache.contains_key(key) {
            self.cache.insert(
                key.to_owned(),
                key.split_whitespace().map(|s| s.to_owned()).collect(),
            );
        }

        if !self.cache.contains_key(query) {
            self.cache.insert(
                query.to_owned(),
                query.split_whitespace().map(|s| s.to_owned()).collect(),
            );
        }

        (self.cache.get(key).unwrap(), self.cache.get(query).unwrap())
    }

    fn unique_tokenize<'t>(&'t mut self, s: &str) -> &'t Vec<String> {
        if !self.unique_cache.contains_key(s) {
            self.unique_cache.insert(
                s.to_owned(),
                s.split_whitespace()
                    .unique()
                    .map(|s| s.to_owned())
                    .collect(),
            );
        }

        self.unique_cache.get(s).unwrap()
    }

    fn unique_tokenize_pair<'t>(
        &'t mut self,
        key: &str,
        query: &str,
    ) -> (&'t Vec<String>, &'t Vec<String>) {
        if !self.unique_cache.contains_key(key) {
            self.unique_cache.insert(
                key.to_owned(),
                key.split_whitespace()
                    .unique()
                    .map(|s| s.to_owned())
                    .collect(),
            );
        }

        if !self.unique_cache.contains_key(query) {
            self.unique_cache.insert(
                query.to_owned(),
                query
                    .split_whitespace()
                    .unique()
                    .map(|s| s.to_owned())
                    .collect(),
            );
        }

        (
            self.unique_cache.get(key).unwrap(),
            self.unique_cache.get(query).unwrap(),
        )
    }
}
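
// A sanity-check sketch for CachedWhitespaceTokenizer; it relies on the
// hypothetical `new` constructor sketched above, and on repeated calls
// being answered from the cache.
#[cfg(test)]
mod cached_whitespace_tokenizer_tests {
    use super::*;

    #[test]
    fn tokenize_caches_results() {
        let mut tok = CachedWhitespaceTokenizer::new();
        assert_eq!(tok.tokenize("a b a"), &vec!["a", "b", "a"]);
        assert_eq!(tok.unique_tokenize("a b a"), &vec!["a", "b"]);
        // The second call for the same input is served from the cache.
        assert_eq!(tok.tokenize("a b a").len(), 3);
    }
}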