use fancy_regex::Regex;
use hashbrown::HashSet;

use crate::traits::Tokenize;

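/// Two-stage tokenizer: configured regexes first carve out "special" spans
/// (whitelisted and blacklisted), then the remaining text is split on
/// whitespace and on the configured symbol characters.
///
/// ```ignore
/// // Illustrative sketch only; adjust the module paths to your crate layout.
/// use crate::tokenizer::Tokenizer;
/// use crate::traits::Tokenize;
///
/// let tokenizer = Tokenizer::new(vec![], vec![], ".".chars().collect());
/// let tokens = tokenizer.tokenize("sys.node");
/// ```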
pub struct Tokenizer {
    special_whites: Vec<Regex>,
    special_blacks: Vec<Regex>,
    symbols: HashSet<char>,
}

impl Tokenize for Tokenizer {
    fn tokenize<'a>(&self, msg: &'a str) -> Vec<Token<'a>> {
        let mut tokens = Vec::new();
        for pre_token in self.pre_tokenize(msg) {
            match pre_token {
                PreToken::SpecialWhite(slice) => {
                    tokens.push(Token::SpecialWhite(slice));
                }
                PreToken::SpecialBlack(slice) => {
                    tokens.push(Token::SpecialBlack(slice));
                }
                PreToken::Unrefined(slice) => {
                    tokens.append(&mut split_token(slice, &self.symbols));
                }
            }
        }
        tokens
    }
}

impl Tokenizer {
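    /// Builds a tokenizer from whitelist and blacklist regexes plus the set
    /// of single-character symbols that the second pass splits on.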
    pub fn new(
        special_whites: Vec<Regex>,
        special_blacks: Vec<Regex>,
        symbols: HashSet<char>,
    ) -> Self {
        Tokenizer {
            special_whites,
            special_blacks,
            symbols,
        }
    }

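    /// Returns a copy of this tokenizer that uses a different symbol set;
    /// the regex lists are cloned unchanged.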
    pub fn new_with_symbols(&self, symbols: HashSet<char>) -> Self {
        Tokenizer {
            special_whites: self.special_whites.clone(),
            special_blacks: self.special_blacks.clone(),
            symbols,
        }
    }

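    /// First pass: splits the message around each special regex, whitelists
    /// before blacklists. Spans claimed by one regex are never re-examined
    /// by a later one; only `Unrefined` text keeps being split.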
    fn pre_tokenize<'a>(&self, msg: &'a str) -> Vec<PreToken<'a>> {
        let mut pre_toks = vec![PreToken::Unrefined(msg)];
        for regex in &self.special_whites {
            let mut new_pre_toks = Vec::new();
            for pre_tok in pre_toks {
                match pre_tok {
                    PreToken::Unrefined(slice) => {
                        new_pre_toks.append(&mut split_special(
                            slice,
                            regex,
                            PreToken::SpecialWhite,
                        ));
                    }
                    // Already-claimed spans pass through untouched.
                    refined => new_pre_toks.push(refined),
                }
            }
            pre_toks = new_pre_toks;
        }

        for regex in &self.special_blacks {
            let mut new_pre_toks = Vec::new();
            for pre_tok in pre_toks {
                match pre_tok {
                    PreToken::Unrefined(slice) => {
                        new_pre_toks.append(&mut split_special(
                            slice,
                            regex,
                            PreToken::SpecialBlack,
                        ));
                    }
                    refined => new_pre_toks.push(refined),
                }
            }
            pre_toks = new_pre_toks;
        }
        pre_toks
    }
}

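/// Splits `msg` around the non-empty matches of `regex`, wrapping each match
/// with `special_type` and leaving the text in between `Unrefined`.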
fn split_special<'a, Special>(
    msg: &'a str,
    regex: &Regex,
    special_type: Special,
) -> Vec<PreToken<'a>>
where
    Special: Fn(&'a str) -> PreToken<'a>,
{
    let mut last_idx = 0;
    let mut pre_tokens = Vec::new();
    for m in regex.find_iter(msg).filter_map(Result::ok) {
        let (start, end) = (m.start(), m.end());
        // Skip empty matches; they would emit zero-width special tokens.
        if start < end {
            if start != last_idx {
                pre_tokens.push(PreToken::Unrefined(&msg[last_idx..start]));
            }
            pre_tokens.push(special_type(m.as_str()));
            last_idx = end;
        }
    }
    if last_idx != msg.len() {
        pre_tokens.push(PreToken::Unrefined(&msg[last_idx..]));
    }
    pre_tokens
}

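/// Second pass: splits unrefined text on whitespace and symbol characters;
/// each separator becomes its own single-character token.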
fn split_token<'a>(msg: &'a str, symbols: &HashSet<char>) -> Vec<Token<'a>> {
    let mut start_idx = 0;
    let mut toks = Vec::new();
    // Slice separators by their UTF-8 width so a multi-byte whitespace or
    // symbol character does not split mid-character and panic.
    while let Some((end_idx, sep)) = msg[start_idx..]
        .char_indices()
        .find(|&(_, c)| c.is_whitespace() || symbols.contains(&c))
        .map(|(idx, c)| (idx + start_idx, c))
    {
        if start_idx < end_idx {
            toks.push(Token::with(&msg[start_idx..end_idx], symbols));
        }
        let sep_end = end_idx + sep.len_utf8();
        toks.push(Token::with(&msg[end_idx..sep_end], symbols));
        start_idx = sep_end;
    }
    if start_idx < msg.len() {
        toks.push(Token::with(&msg[start_idx..], symbols));
    }
    toks
}

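/// A classified slice of the original message; every variant borrows from
/// the input string.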
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Hash, Copy)]
pub enum Token<'a> {
    Alphabetic(&'a str),
    Numeric(&'a str),
    Symbolic(&'a str),
    Whitespace(&'a str),
    Impure(&'a str),
    SpecialWhite(&'a str),
    SpecialBlack(&'a str),
}

impl<'a> Token<'a> {
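    /// Classifies `slice`: all-alphabetic, all-numeric, a lone whitespace or
    /// symbol character, or `Impure` for anything mixed.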
    pub fn with(slice: &'a str, symbols: &HashSet<char>) -> Token<'a> {
        if slice.chars().all(char::is_alphabetic) {
            Token::Alphabetic(slice)
        } else if slice.chars().all(char::is_numeric) {
            Token::Numeric(slice)
        } else if slice.chars().count() == 1 {
            // Count chars rather than bytes so multi-byte separators still
            // classify as whitespace or symbols.
            if slice.chars().all(char::is_whitespace) {
                Token::Whitespace(slice)
            } else if slice.chars().all(|c| symbols.contains(&c)) {
                Token::Symbolic(slice)
            } else {
                Token::Impure(slice)
            }
        } else {
            Token::Impure(slice)
        }
    }

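    /// Returns the underlying string slice regardless of variant.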
    pub fn as_str(&self) -> &'a str {
        match self {
            Token::Alphabetic(slice) => slice,
            Token::Numeric(slice) => slice,
            Token::Symbolic(slice) => slice,
            Token::Whitespace(slice) => slice,
            Token::Impure(slice) => slice,
            Token::SpecialWhite(slice) => slice,
            Token::SpecialBlack(slice) => slice,
        }
    }
}

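/// Intermediate state for `pre_tokenize`: `Unrefined` text may still be
/// split; special spans are final.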
#[derive(Debug, PartialEq, Eq)]
enum PreToken<'a> {
    SpecialWhite(&'a str),
    SpecialBlack(&'a str),
    Unrefined(&'a str),
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizer_pre_tokenize() {
        let tokenizer = Tokenizer::new(
            vec![Regex::new(r"\ba\b").unwrap()],
            vec![Regex::new(r"\d+\.\d+").unwrap()],
            "".chars().collect(),
        );
        let expected = vec![
            PreToken::Unrefined("This "),
            PreToken::SpecialBlack("10001.2"),
            PreToken::Unrefined(" is "),
            PreToken::SpecialBlack("1.323"),
            PreToken::Unrefined(" "),
            PreToken::SpecialWhite("a"),
            PreToken::Unrefined(" "),
            PreToken::SpecialBlack("1.4411"),
            PreToken::Unrefined(" message"),
        ];
        let computed = tokenizer.pre_tokenize("This 10001.2 is 1.323 a 1.4411 message");
        assert_eq!(expected, computed);
    }

    #[test]
    fn tokenizer_tokenize() {
        let tokenizer = Tokenizer::new(
            vec![Regex::new(r"fan_\d+").unwrap()],
            vec![Regex::new(r"\d+\.\d+").unwrap()],
            ".".chars().collect(),
        );
        let computed = tokenizer
            .tokenize("Fan fan_2 speed is set to 12.3114 on machine sys.node.fan_3 on node 12");
        let expected = vec![
            Token::Alphabetic("Fan"),
            Token::Whitespace(" "),
            Token::SpecialWhite("fan_2"),
            Token::Whitespace(" "),
            Token::Alphabetic("speed"),
            Token::Whitespace(" "),
            Token::Alphabetic("is"),
            Token::Whitespace(" "),
            Token::Alphabetic("set"),
            Token::Whitespace(" "),
            Token::Alphabetic("to"),
            Token::Whitespace(" "),
            Token::SpecialBlack("12.3114"),
            Token::Whitespace(" "),
            Token::Alphabetic("on"),
            Token::Whitespace(" "),
            Token::Alphabetic("machine"),
            Token::Whitespace(" "),
            Token::Alphabetic("sys"),
            Token::Symbolic("."),
            Token::Alphabetic("node"),
            Token::Symbolic("."),
            Token::SpecialWhite("fan_3"),
            Token::Whitespace(" "),
            Token::Alphabetic("on"),
            Token::Whitespace(" "),
            Token::Alphabetic("node"),
            Token::Whitespace(" "),
            Token::Numeric("12"),
        ];
        assert_eq!(expected, computed);
    }
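
    // Added test (not part of the original suite): `split_token` must slice
    // multi-byte separators such as U+00A0, a two-byte whitespace character,
    // on char boundaries instead of panicking.
    #[test]
    fn split_token_multibyte_separator() {
        let symbols: HashSet<char> = HashSet::new();
        let computed = split_token("a\u{a0}b", &symbols);
        let expected = vec![
            Token::Alphabetic("a"),
            Token::Whitespace("\u{a0}"),
            Token::Alphabetic("b"),
        ];
        assert_eq!(expected, computed);
    }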
}