tipping_rs/tokenizer.rs

use hashbrown::HashSet;

use fancy_regex::Regex;

use crate::traits::Tokenize;

/// Tokenizer for log messages: matches of the whitelist and blacklist
/// regexes are kept verbatim as special tokens, and the remaining text is
/// split on whitespace and on the characters in `symbols`.
pub struct Tokenizer {
    special_whites: Vec<Regex>,
    special_blacks: Vec<Regex>,
    symbols: HashSet<char>,
}

impl Tokenize for Tokenizer {
    fn tokenize<'a>(&self, msg: &'a str) -> Vec<Token<'a>> {
        let mut tokens = Vec::new();
        for pre_token in self.pre_tokenize(msg) {
            match pre_token {
                PreToken::SpecialWhite(slice) => {
                    tokens.push(Token::SpecialWhite(slice));
                }
                PreToken::SpecialBlack(slice) => {
                    tokens.push(Token::SpecialBlack(slice));
                }
                PreToken::Unrefined(slice) => {
                    // Text not claimed by any special pattern is split
                    // further on whitespace and symbol characters.
                    tokens.append(&mut split_token(slice, &self.symbols));
                }
            }
        }
        tokens
    }
}

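// A minimal usage sketch (hypothetical patterns; the `use` paths assume this
// file is the crate's `tokenizer` module, with `Tokenize` in `traits` as
// imported above):
//
//     use fancy_regex::Regex;
//     use tipping_rs::tokenizer::{Token, Tokenizer};
//     use tipping_rs::traits::Tokenize;
//
//     let tokenizer = Tokenizer::new(
//         vec![Regex::new(r"fan_\d+").unwrap()],
//         vec![Regex::new(r"\d+\.\d+").unwrap()],
//         ".".chars().collect(),
//     );
//     assert_eq!(
//         tokenizer.tokenize("fan_2=1.5"),
//         vec![
//             Token::SpecialWhite("fan_2"),
//             Token::Impure("="),
//             Token::SpecialBlack("1.5"),
//         ],
//     );
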
impl Tokenizer {
    pub fn new(
        special_whites: Vec<Regex>,
        special_blacks: Vec<Regex>,
        symbols: HashSet<char>,
    ) -> Self {
        Tokenizer {
            special_whites,
            special_blacks,
            symbols,
        }
    }

    /// Returns a new `Tokenizer` that reuses this one's special patterns
    /// but swaps in a different symbol set.
    pub fn new_with_symbols(&self, symbols: HashSet<char>) -> Self {
        Tokenizer {
            special_whites: self.special_whites.clone(),
            special_blacks: self.special_blacks.clone(),
            symbols,
        }
    }

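    // For illustration (hypothetical `tokenizer` value): derive a variant
    // that also splits on ':' while reusing the compiled patterns:
    //
    //     let with_colon = tokenizer.new_with_symbols(".:".chars().collect());
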
    /// First pass: carve `msg` into special and unrefined regions.
    ///
    /// Whitelist regexes run first, then blacklist regexes; each pass only
    /// re-splits regions that are still `Unrefined`, so earlier matches are
    /// never broken up by later patterns.
    fn pre_tokenize<'a>(&self, msg: &'a str) -> Vec<PreToken<'a>> {
        let mut pre_toks = vec![PreToken::Unrefined(msg)];
        for regex in &self.special_whites {
            let mut new_pre_toks = Vec::new();
            for pre_tok in pre_toks {
                match pre_tok {
                    PreToken::SpecialWhite(slice) => {
                        new_pre_toks.push(PreToken::SpecialWhite(slice))
                    }
                    PreToken::SpecialBlack(slice) => {
                        new_pre_toks.push(PreToken::SpecialBlack(slice))
                    }
                    PreToken::Unrefined(slice) => {
                        new_pre_toks.append(&mut split_special(
                            slice,
                            regex,
                            PreToken::SpecialWhite,
                        ));
                    }
                }
            }
            pre_toks = new_pre_toks;
        }

        for regex in &self.special_blacks {
            let mut new_pre_toks = Vec::new();
            for pre_tok in pre_toks {
                match pre_tok {
                    PreToken::SpecialWhite(slice) => {
                        new_pre_toks.push(PreToken::SpecialWhite(slice))
                    }
                    PreToken::SpecialBlack(slice) => {
                        new_pre_toks.push(PreToken::SpecialBlack(slice))
                    }
                    PreToken::Unrefined(slice) => {
                        new_pre_toks.append(&mut split_special(
                            slice,
                            regex,
                            PreToken::SpecialBlack,
                        ));
                    }
                }
            }
            pre_toks = new_pre_toks;
        }
        pre_toks
    }
}

/// Splits `msg` around every non-empty match of `regex`; matches are wrapped
/// with `special_type` and the stretches between them stay `Unrefined`.
fn split_special<'a, Special>(
    msg: &'a str,
    regex: &Regex,
    special_type: Special,
) -> Vec<PreToken<'a>>
where
    Special: Fn(&'a str) -> PreToken<'a>,
{
    let mut last_idx = 0;
    let mut pre_tokens = Vec::new();
    for m in regex.find_iter(msg).filter_map(Result::ok) {
        let start = m.start();
        let end = m.end();
        // Ignore empty matches so zero-length tokens are never emitted.
        if end > start {
            if start != last_idx {
                pre_tokens.push(PreToken::Unrefined(&msg[last_idx..start]));
            }
            pre_tokens.push(special_type(m.as_str()));
            last_idx = end;
        }
    }
    if last_idx != msg.len() {
        pre_tokens.push(PreToken::Unrefined(&msg[last_idx..]));
    }
    pre_tokens
}

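// For illustration: with a pattern like `\d+` passed as `regex` and
// `PreToken::SpecialBlack` as `special_type`,
// `split_special("ab12cd", ...)` yields
// `[Unrefined("ab"), SpecialBlack("12"), Unrefined("cd")]`.
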
/// Splits an unrefined slice on whitespace and symbol characters, emitting
/// each separator as its own single-character token.
fn split_token<'a>(msg: &'a str, symbols: &HashSet<char>) -> Vec<Token<'a>> {
    let mut start_idx = 0;
    let mut toks = Vec::new();
    while let Some(end_idx) = msg[start_idx..]
        .find(|c: char| c.is_whitespace() || symbols.contains(&c))
        .map(|idx| idx + start_idx)
    {
        if start_idx < end_idx {
            toks.push(Token::with(&msg[start_idx..end_idx], symbols));
        }
        // The separator may be multi-byte (e.g. a non-breaking space), so
        // advance by its UTF-8 length rather than assuming one byte.
        let sep_len = msg[end_idx..].chars().next().map_or(1, char::len_utf8);
        toks.push(Token::with(&msg[end_idx..end_idx + sep_len], symbols));
        start_idx = end_idx + sep_len;
    }
    if start_idx < msg.len() {
        toks.push(Token::with(&msg[start_idx..], symbols));
    }
    toks
}

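// For illustration: with `symbols` containing '.',
// `split_token("sys.node 12", &symbols)` yields
// `[Alphabetic("sys"), Symbolic("."), Alphabetic("node"),
//   Whitespace(" "), Numeric("12")]`.
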
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Hash, Copy)]
pub enum Token<'a> {
    Alphabetic(&'a str),
    Numeric(&'a str),
    Symbolic(&'a str),
    Whitespace(&'a str),
    Impure(&'a str),
    SpecialWhite(&'a str),
    SpecialBlack(&'a str),
}

impl<'a> Token<'a> {
    /// Classifies `slice`: all-alphabetic and all-numeric slices get their
    /// own variants, a single whitespace or symbol character becomes
    /// `Whitespace` or `Symbolic`, and everything else is `Impure`.
    pub fn with(slice: &'a str, symbols: &HashSet<char>) -> Token<'a> {
        if slice.chars().all(char::is_alphabetic) {
            Token::Alphabetic(slice)
        } else if slice.chars().all(char::is_numeric) {
            Token::Numeric(slice)
        // Count chars, not bytes, so multi-byte separators classify
        // correctly.
        } else if slice.chars().count() == 1 {
            if slice.chars().all(char::is_whitespace) {
                Token::Whitespace(slice)
            } else if slice.chars().all(|c| symbols.contains(&c)) {
                Token::Symbolic(slice)
            } else {
                Token::Impure(slice)
            }
        } else {
            Token::Impure(slice)
        }
    }

    pub fn as_str(&self) -> &'a str {
        match self {
            Token::Alphabetic(slice) => slice,
            Token::Numeric(slice) => slice,
            Token::Symbolic(slice) => slice,
            Token::Whitespace(slice) => slice,
            Token::Impure(slice) => slice,
            Token::SpecialWhite(slice) => slice,
            Token::SpecialBlack(slice) => slice,
        }
    }
}

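// For illustration (with '.' in `symbols`):
//     Token::with("speed", &symbols) == Token::Alphabetic("speed")
//     Token::with("12", &symbols)    == Token::Numeric("12")
//     Token::with(".", &symbols)     == Token::Symbolic(".")
//     Token::with("a1", &symbols)    == Token::Impure("a1")
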
#[derive(Debug, PartialEq, Eq)]
enum PreToken<'a> {
    SpecialWhite(&'a str),
    SpecialBlack(&'a str),
    Unrefined(&'a str),
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizer_pre_tokenize() {
        let tokenizer = Tokenizer::new(
            vec![Regex::new(r"\ba\b").unwrap()],
            vec![Regex::new(r"\d+\.\d+").unwrap()],
            "".chars().collect(),
        );
        let expected = vec![
            PreToken::Unrefined("This "),
            PreToken::SpecialBlack("10001.2"),
            PreToken::Unrefined(" is "),
            PreToken::SpecialBlack("1.323"),
            PreToken::Unrefined(" "),
            PreToken::SpecialWhite("a"),
            PreToken::Unrefined(" "),
            PreToken::SpecialBlack("1.4411"),
            PreToken::Unrefined(" message"),
        ];
        let computed = tokenizer.pre_tokenize("This 10001.2 is 1.323 a 1.4411 message");
        assert_eq!(expected, computed);
    }

    #[test]
    fn tokenizer_tokenize() {
        let tokenizer = Tokenizer::new(
            vec![Regex::new(r"fan_\d+").unwrap()],
            vec![Regex::new(r"\d+\.\d+").unwrap()],
            ".".chars().collect(),
        );
        let computed = tokenizer
            .tokenize("Fan fan_2 speed is set to 12.3114 on machine sys.node.fan_3 on node 12");
        let expected = vec![
            Token::Alphabetic("Fan"),
            Token::Whitespace(" "),
            Token::SpecialWhite("fan_2"),
            Token::Whitespace(" "),
            Token::Alphabetic("speed"),
            Token::Whitespace(" "),
            Token::Alphabetic("is"),
            Token::Whitespace(" "),
            Token::Alphabetic("set"),
            Token::Whitespace(" "),
            Token::Alphabetic("to"),
            Token::Whitespace(" "),
            Token::SpecialBlack("12.3114"),
            Token::Whitespace(" "),
            Token::Alphabetic("on"),
            Token::Whitespace(" "),
            Token::Alphabetic("machine"),
            Token::Whitespace(" "),
            Token::Alphabetic("sys"),
            Token::Symbolic("."),
            Token::Alphabetic("node"),
            Token::Symbolic("."),
            Token::SpecialWhite("fan_3"),
            Token::Whitespace(" "),
            Token::Alphabetic("on"),
            Token::Whitespace(" "),
            Token::Alphabetic("node"),
            Token::Whitespace(" "),
            Token::Numeric("12"),
        ];
        assert_eq!(expected, computed);
    }
}