prexel/utils/
splitter.rs

1use crate::utils::splitter::rules::{Outcome, SplitRule};
2
/// Turns the text of an expression into the tokens it is made of.
pub trait Splitter {
    /// Breaks `expression` apart and returns its tokens in the order they appear.
    fn split_into_tokens(&self, expression: &str) -> Vec<String>;
}
8
/// Options that control what happens with whitespace while splitting.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum SplitWhitespaceOption {
    /// All the tokens will be retrieved, including whitespace.
    None,
    /// All the tokens will be retrieved, ignoring whitespace.
    Remove,
}
17
18/// Provides a way to extract tokens from a `str`.
19///
20/// # Example
21/// ```
22/// use prexel::utils::splitter::{DefaultSplitter, Splitter};
23///
24/// let splitter = DefaultSplitter::default();
25/// let tokens = splitter.split_into_tokens("2 + 3");
26/// assert_eq!(["2", "+", "3"].to_vec(), tokens);
27/// ```
28pub struct DefaultSplitter<'a> {
29    rules: Vec<Box<dyn SplitRule + 'a>>,
30}
31
32impl<'a> DefaultSplitter<'a> {
33    #[inline]
34    pub fn new(kind: SplitWhitespaceOption) -> DefaultSplitter<'a> {
35        DefaultSplitterBuilder::default()
36            .rule(rules::SplitNumeric)
37            .rule(rules::SplitIdentifier)
38            .rule(rules::SplitOperator)
39            .whitespace(kind)
40            .build()
41    }
42
43    pub fn with_numeric_rule<F: 'a>(rule: F) -> Self
44    where
45        F: SplitRule + 'a,
46    {
47        DefaultSplitterBuilder::default()
48            .rule(rule)
49            .rule(rules::SplitIdentifier)
50            .rule(rules::SplitOperator)
51            .whitespace(SplitWhitespaceOption::Remove)
52            .build()
53    }
54
55    pub fn with_identifier_rule<F: 'a>(rule: F) -> Self
56    where
57        F: SplitRule + 'a,
58    {
59        DefaultSplitterBuilder::default()
60            .rule(rules::SplitNumeric)
61            .rule(rule)
62            .rule(rules::SplitIdentifier)
63            .rule(rules::SplitOperator)
64            .whitespace(SplitWhitespaceOption::Remove)
65            .build()
66    }
67
68    #[inline]
69    pub fn builder() -> DefaultSplitterBuilder<'a> {
70        DefaultSplitterBuilder::default()
71    }
72
73    #[inline]
74    pub fn rules(&self) -> &[Box<dyn SplitRule + 'a>] {
75        self.rules.as_slice()
76    }
77}
78
79impl Splitter for DefaultSplitter<'_> {
80    fn split_into_tokens(&self, expression: &str) -> Vec<String> {
81        let mut tokens = Vec::new();
82        let mut iterator = expression.chars().peekable();
83
84        while let Some(c) = iterator.peek().cloned() {
85            iterator.next();
86
87            let mut next = false;
88
89            for rule in &self.rules {
90                match rule.split(c, &mut iterator) {
91                    Outcome::Data(s) => {
92                        tokens.push(s);
93                        next = true;
94                        break;
95                    }
96                    Outcome::Continue => {
97                        continue;
98                    }
99                    Outcome::Skip => {
100                        next = true;
101                        break;
102                    }
103                }
104            }
105
106            if !next {
107                tokens.push(c.to_string());
108            }
109        }
110
111        tokens
112    }
113}
114
115impl Default for DefaultSplitter<'_> {
116    fn default() -> Self {
117        DefaultSplitter::new(SplitWhitespaceOption::Remove)
118    }
119}
120
121pub struct DefaultSplitterBuilder<'a> {
122    rules: Vec<Box<dyn SplitRule + 'a>>,
123    whitespace_option: Option<SplitWhitespaceOption>,
124}
125
126impl<'a> DefaultSplitterBuilder<'a> {
127    pub fn new() -> Self {
128        DefaultSplitterBuilder {
129            rules: Vec::new(),
130            whitespace_option: None,
131        }
132    }
133
134    pub fn insert_rule<F: 'a>(mut self, index: usize, rule: F) -> Self
135    where
136        F: SplitRule + 'a,
137    {
138        self.rules.insert(index, Box::new(rule));
139        self
140    }
141
142    pub fn rule<F: 'a>(mut self, rule: F) -> Self
143    where
144        F: SplitRule + 'a,
145    {
146        self.rules.push(Box::new(rule));
147        self
148    }
149
150    pub fn whitespace(mut self, option: SplitWhitespaceOption) -> Self {
151        self.whitespace_option = Some(option);
152        self
153    }
154
155    pub fn build(self) -> DefaultSplitter<'a> {
156        let DefaultSplitterBuilder {
157            rules,
158            whitespace_option,
159            ..
160        } = self;
161
162        let mut rules = rules;
163        let whitespace_option = whitespace_option.unwrap_or(SplitWhitespaceOption::None);
164
165        match whitespace_option {
166            SplitWhitespaceOption::None => {}
167            SplitWhitespaceOption::Remove => rules.push(Box::new(rules::SkipWhitespace)),
168        };
169
170        DefaultSplitter { rules }
171    }
172}
173
174impl Default for DefaultSplitterBuilder<'_> {
175    fn default() -> Self {
176        DefaultSplitterBuilder::new()
177    }
178}
179
pub mod rules {
    use std::collections::HashSet;
    use std::iter::Peekable;
    use std::str::Chars;

    /// Chars treated as operators by [`SplitOperator`] and, initially, by
    /// [`SplitWithOperatorsBuilder::new`].
    const DEFAULT_OPERATORS: &[char] = &[
        '~', '`', '!', '@', '#', '$', '%', '^', '&', '*', '-', '+', '_', ':', ';', '"', '\'', '|',
        '\\', '?', '.', '<', '>', '/', '=', ',',
    ];

    /// Builds a token from `first` followed by every upcoming char accepted by `accept`.
    ///
    /// The first char that is not accepted is left in `rest`.
    fn collect_run(
        first: char,
        rest: &mut Peekable<Chars>,
        mut accept: impl FnMut(&char) -> bool,
    ) -> String {
        let mut run = String::from(first);

        while let Some(next) = rest.next_if(|ch| accept(ch)) {
            run.push(next);
        }

        run
    }

    /// The result of a split rule.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub enum Outcome {
        /// The result of a split.
        Data(String),
        /// Continue to the next rule.
        Continue,
        /// Skips the current `char`.
        Skip,
    }

    /// A rule that decides whether a token starts at a given char.
    pub trait SplitRule {
        /// Inspects `c`, the current char, and may consume chars from `rest`, the chars that
        /// follow it, to complete a token.
        fn split(&self, c: char, rest: &mut Peekable<Chars>) -> Outcome;
    }

    /// Extracts identifiers: an ASCII letter or `_` followed by ASCII letters, digits or `_`.
    pub struct SplitIdentifier;
    impl SplitRule for SplitIdentifier {
        fn split(&self, c: char, rest: &mut Peekable<Chars>) -> Outcome {
            if !(c.is_ascii_alphabetic() || c == '_') {
                return Outcome::Continue;
            }

            let identifier = collect_run(c, rest, |ch| ch.is_ascii_alphanumeric() || *ch == '_');
            Outcome::Data(identifier)
        }
    }

    /// Extracts numbers: ASCII digits with at most one decimal point.
    pub struct SplitNumeric;
    impl SplitRule for SplitNumeric {
        fn split(&self, c: char, rest: &mut Peekable<Chars>) -> Outcome {
            if !c.is_ascii_digit() {
                return Outcome::Continue;
            }

            let mut number = String::from(c);
            let mut has_decimal_point = false;

            // A second decimal point is rejected by the predicate itself, so it is never taken
            // out of `rest` and remains available for the rules that run afterwards.
            while let Some(next) =
                rest.next_if(|ch| ch.is_ascii_digit() || (*ch == '.' && !has_decimal_point))
            {
                if next == '.' {
                    has_decimal_point = true;
                }

                number.push(next);
            }

            Outcome::Data(number)
        }
    }

    /// Extracts operators: a run of consecutive chars from the default operator set.
    pub struct SplitOperator;
    impl SplitRule for SplitOperator {
        fn split(&self, c: char, rest: &mut Peekable<Chars>) -> Outcome {
            if !DEFAULT_OPERATORS.contains(&c) {
                return Outcome::Continue;
            }

            Outcome::Data(collect_run(c, rest, |ch| DEFAULT_OPERATORS.contains(ch)))
        }
    }

    /// Builder for a [`SplitWithOperators`] rule with a custom set of operator chars.
    pub struct SplitWithOperatorsBuilder {
        operators: HashSet<char>,
    }
    impl SplitWithOperatorsBuilder {
        /// Creates a builder that starts with the default operator chars.
        pub fn new() -> Self {
            SplitWithOperatorsBuilder {
                operators: DEFAULT_OPERATORS.iter().copied().collect(),
            }
        }

        /// Creates a builder that starts with no operator chars.
        pub fn empty() -> Self {
            SplitWithOperatorsBuilder {
                operators: HashSet::new(),
            }
        }

        /// Adds `operator` to the set of operator chars.
        ///
        /// This method works on a mutable reference while `build` takes the builder by value,
        /// so the builder must be bound to a variable before calling `build` on it.
        pub fn add_operator(&mut self, operator: char) -> &mut Self {
            self.operators.insert(operator);
            self
        }

        /// Removes `operator` from the set of operator chars.
        pub fn except(mut self, operator: char) -> Self {
            self.operators.remove(&operator);
            self
        }

        /// Consumes the builder and creates the rule.
        pub fn build(self) -> SplitWithOperators {
            SplitWithOperators {
                operators: self.operators,
            }
        }
    }

    impl Default for SplitWithOperatorsBuilder {
        /// Equivalent to [`SplitWithOperatorsBuilder::new`].
        fn default() -> Self {
            Self::new()
        }
    }

    /// Extracts operators: a run of consecutive chars from a configurable set.
    pub struct SplitWithOperators {
        operators: HashSet<char>,
    }
    impl SplitWithOperators {
        /// Creates a rule that uses the default operator chars.
        pub fn new() -> Self {
            SplitWithOperatorsBuilder::new().build()
        }

        /// Returns a builder that starts with the default operator chars.
        pub fn builder() -> SplitWithOperatorsBuilder {
            SplitWithOperatorsBuilder::new()
        }

        /// Returns `true` if `c` is one of the operator chars of this rule.
        pub fn is_valid(&self, c: &char) -> bool {
            self.operators.contains(c)
        }
    }

    impl Default for SplitWithOperators {
        /// Equivalent to [`SplitWithOperators::new`].
        fn default() -> Self {
            Self::new()
        }
    }

    impl SplitRule for SplitWithOperators {
        fn split(&self, c: char, rest: &mut Peekable<Chars>) -> Outcome {
            if !self.is_valid(&c) {
                return Outcome::Continue;
            }

            Outcome::Data(collect_run(c, rest, |ch| self.is_valid(ch)))
        }
    }

    /// Skips whitespace chars so they do not become tokens.
    pub struct SkipWhitespace;
    impl SplitRule for SkipWhitespace {
        fn split(&self, c: char, _: &mut Peekable<Chars>) -> Outcome {
            if c.is_whitespace() {
                Outcome::Skip
            } else {
                Outcome::Continue
            }
        }
    }

    /// Extracts `b`-prefixed literals: a `b` immediately followed by `0` or `1`.
    ///
    /// Once the literal has started, every following ASCII digit is consumed, not only
    /// `0` and `1`, so `b12` is returned as a single token.
    #[cfg(feature = "binary")]
    pub struct SplitBinary;

    #[cfg(feature = "binary")]
    impl SplitRule for SplitBinary {
        fn split(&self, c: char, rest: &mut Peekable<Chars>) -> Outcome {
            let starts_literal = c == 'b' && matches!(rest.peek(), Some(&'0') | Some(&'1'));

            if !starts_literal {
                return Outcome::Continue;
            }

            Outcome::Data(collect_run(c, rest, |ch| ch.is_ascii_digit()))
        }
    }
}
415
#[cfg(test)]
mod tests {
    use super::DefaultSplitter;
    use super::{SplitWhitespaceOption, Splitter};

    /// Checks the tokens produced for a set of representative expressions.
    #[test]
    fn split_into_tokens() {
        // Expressions split with the default splitter, which removes whitespace.
        let cases: [(&str, &[&str]); 7] = [
            (
                "10 + -2 * Sin(45)",
                &["10", "+", "-", "2", "*", "Sin", "(", "45", ")"],
            ),
            (
                "10 + (-3) * 0.25",
                &["10", "+", "(", "-", "3", ")", "*", "0.25"],
            ),
            (
                "(x+y)-2^10",
                &["(", "x", "+", "y", ")", "-", "2", "^", "10"],
            ),
            (
                "Log2(25) * PI - 2",
                &["Log2", "(", "25", ")", "*", "PI", "-", "2"],
            ),
            ("2PI + 10", &["2", "PI", "+", "10"]),
            ("x = 10", &["x", "=", "10"]),
            ("256 >> 3", &["256", ">>", "3"]),
        ];

        let splitter = DefaultSplitter::default();
        for &(input, expected) in cases.iter() {
            assert_eq!(expected.to_vec(), splitter.split_into_tokens(input));
        }

        // With `SplitWhitespaceOption::None` the whitespace is kept as tokens.
        let keep_whitespace = DefaultSplitter::new(SplitWhitespaceOption::None);
        assert_eq!(
            ["5", " ", "*", " ", "2"].to_vec(),
            keep_whitespace.split_into_tokens("5 * 2")
        );
    }
}
459}