scrapelect/frontend/scanner.rs

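/// The kinds of lexemes produced by the [`Scanner`].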
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    /// an identifier
    Id,
    /// a literal integer
    Int,
    /// a literal decimal number
    Float,
    /// a literal string
    String,
    /// the selector token `.` to indicate a class name
    Dot,
    /// the selector token `#` to indicate an id name
    Hash,
    /// the universal selector `*` to match all elements
    Star,
    /// the selector combinator `+` to indicate the next sibling
    Plus,
    /// the selector combinator `>` to indicate a direct child or the end of an inline expansion
    Greater,
    /// the selector combinator `~` to indicate a subsequent sibling
    Tilde,
    /// any whitespace, significant for separating e.g. `a#id` from `a #id`
    Whitespace,
    /// the selector option `?` to indicate zero or one item
    Question,
    /// an opening brace `{` to start an element block
    BraceOpen,
    /// a closing brace `}` to end an element block
    BraceClose,
    /// a dollar sign `$` to dereference a variable
    Dollar,
    /// a pipe `|` to indicate a filter
    Pipe,
    /// an opening parenthesis `(` to start a filter call
    ParenOpen,
    /// a closing parenthesis `)` to end a filter call
    ParenClose,
    /// a comma `,` to separate arguments in a list, or alternatives in a selector
    Comma,
    /// a colon `:` to separate id from value in statements and arguments
    Colon,
    /// a semicolon `;` to indicate the end of a statement
    Semi,
    /// a less-than sign `<` to indicate the start of an inline expansion
    Less,
    /// an opening bracket `[` to indicate the start of a select filter
    /// or CSS attribute selector
    BracketOpen,
    /// a closing bracket `]` to indicate the end of a select filter
    /// or CSS attribute selector
    BracketClose,
    /// a single-line comment that begins with two forward slashes `//` and
    /// spans the rest of the line
    Comment,
    /// special token to indicate the end of the file
    Eof,
    /// special token to indicate an unrecognized character
    Unknown,
}

mod statics {
    use super::Token;
    use regex::{Regex, RegexSet};
    use std::sync::LazyLock;

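    /// Declares three parallel statics from one `Token <- pattern` table:
    /// `$tokens` lists the [`Token`] for each pattern, `$re_set` is a
    /// [`RegexSet`] that tests every pattern in one pass, and `$re_compiled`
    /// holds the individual [`Regex`]es used to recover each match's text.
    /// Every pattern is anchored with `^` so it can only match at the cursor.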
    macro_rules! make_regex_set {
        {$vis: vis ($tokens: ident, $re_set: ident, $re_compiled: ident) = {$($tk: ident <- $pat: literal)*};} => {
            $vis static $tokens: &[Token] = &[
                $(Token::$tk, )*
            ];

            $vis static $re_set: LazyLock<RegexSet> = LazyLock::new(|| RegexSet::new(&[
                $(
                    concat!("^", $pat),
                )*
            ]).expect("error building RegexSet"));

            $vis static $re_compiled: LazyLock<Vec<Regex>> = LazyLock::new(|| vec![
                $(
                    Regex::new(concat!("^", $pat)).expect(concat!("Error building Regex `", $pat, "`")),
                )*
            ]);
        };
    }

    make_regex_set! {
        pub(super) (TOKENS, REGEX_SET, REGEX_LIST) = {
            Id <- "[a-zA-Z][a-zA-Z0-9_-]*"
            Int <- "[+-]?[0-9]+"
            // must match at least one digit before the decimal point, but not necessarily after
            Float <- r"[+-]?[0-9]+\.[0-9]*"
            String <- r#""(\\.|[^\\"])*""#
            Dot <- r"\."
            Hash <- "#"
            Star <- r"\*"
            Plus <- r"\+"
            Greater <- ">"
            Tilde <- "~"
            Whitespace <- r"\p{White_Space}+"
            Question <- r"\?"
            BraceOpen <- r"\{"
            BraceClose <- r"\}"
            Dollar <- r"\$"
            Pipe <- r"\|"
            ParenOpen <- r"\("
            ParenClose <- r"\)"
            Comma <- ","
            Colon <- ":"
            Semi <- ";"
            Less <- "<"
            BracketOpen <- r"\["
            BracketClose <- r"\]"
            Comment <- r"//[^\n]*"
        };
    }
}

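/// A maximal-munch scanner over a borrowed source string.
/// `idx` is the byte offset of the next unscanned character;
/// `line` is the 1-based line number used when building [`Span`]s.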
#[derive(Debug, Clone)]
pub struct Scanner<'a> {
    slice: &'a str,
    idx: usize,
    line: usize,
}

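/// The location of a [`Lexeme`] in the source: the 1-based `line` it
/// starts on and its byte range `start..end` into the scanned string.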
#[derive(Debug, Clone, Copy, Default)]
#[non_exhaustive]
pub struct Span {
    pub line: usize,
    pub start: usize,
    pub end: usize,
}

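/// A scanned token paired with the slice of source text it matched.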
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Lexeme<'a> {
    pub token: Token,
    pub value: &'a str,
}

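/// The lexeme returned once the scanner has consumed its entire input.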
const EOF: Lexeme = Lexeme {
    token: Token::Eof,
    value: "",
};

impl<'a> Scanner<'a> {
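    /// Creates a scanner positioned at the start of `slice`, on line 1.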
    #[must_use]
    pub const fn new(slice: &'a str) -> Self {
        Self {
            slice,
            idx: 0,
            line: 1,
        }
    }

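    /// Returns the next [`Lexeme`] and its [`Span`] without consuming them.
    ///
    /// Maximal munch: when several patterns match at the cursor, the longest
    /// match wins, so `1.5` scans as one `Float` rather than an `Int`
    /// followed by a `Dot`. If nothing matches, a one-character `Unknown`
    /// lexeme is produced.
    ///
    /// A usage sketch (the `scrapelect::frontend::scanner` module path is
    /// assumed from this file's location, hence `ignore`):
    ///
    /// ```ignore
    /// use scrapelect::frontend::scanner::{Scanner, Token};
    ///
    /// let scanner = Scanner::new("1.5");
    /// let (_span, lexeme) = scanner.peek_token();
    /// assert_eq!(lexeme.token, Token::Float);
    /// assert_eq!(lexeme.value, "1.5");
    /// ```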
    #[must_use]
    pub fn peek_token(&self) -> (Span, Lexeme<'a>) {
        if self.idx >= self.slice.len() {
            return (Span::default(), EOF);
        }

        // We can't use `find_at` here: it still treats the start of the
        // string (e.g., for `^`) as i = 0, not i = idx, so we re-slice
        // the input instead.
        statics::REGEX_SET
            .matches(&self.slice[self.idx..])
            .into_iter()
            .map(|x| Lexeme {
                token: statics::TOKENS[x],
                value: statics::REGEX_LIST[x]
                    .find(&self.slice[self.idx..])
                    .expect("matched in set should match in list")
                    .as_str(),
            })
            .max_by_key(|x| x.value.len())
            .map(|lx| {
                (
                    Span {
                        line: self.line,
                        start: self.idx,
                        end: self.idx + lx.value.len(),
                    },
                    lx,
                )
            })
            .unwrap_or_else(|| {
                // No pattern matched: emit a one-character (not one-byte)
                // `Unknown` lexeme so we never split a UTF-8 boundary.
                let len = self.slice[self.idx..]
                    .chars()
                    .next()
                    .map_or(1, char::len_utf8);
                (
                    Span {
                        line: self.line,
                        start: self.idx,
                        end: self.idx + len,
                    },
                    Lexeme {
                        token: Token::Unknown,
                        value: &self.slice[self.idx..self.idx + len],
                    },
                )
            })
    }

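    /// Consumes and returns the next [`Lexeme`], advancing past it and
    /// counting the newlines it contains so that later [`Span`]s carry
    /// the right line number.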
    pub fn eat_token(&mut self) -> (Span, Lexeme<'a>) {
        let (span, lexeme) = self.peek_token();
        self.idx += lexeme.value.len();
        self.line += lexeme.value.chars().filter(|&x| x == '\n').count();
        (span, lexeme)
    }

    /// Looks ahead for the next non-`Comment` [`Lexeme`]
    /// and returns it without [eating](Self::eat_token) it.
    pub fn peek_non_comment(&mut self) -> (Span, Lexeme<'a>) {
        while let (
            _,
            Lexeme {
                token: Token::Comment,
                ..
            },
        ) = self.peek_token()
        {
            self.eat_token();
        }
        self.peek_token()
    }

    /// Looks ahead for the first non-whitespace, non-comment [`Lexeme`]
    /// and returns it without [eating](Self::eat_token) it.
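    ///
    /// Note that this eats any whitespace and comments it skips: on the
    /// input `"  // note\n  a"`, for example, it consumes both whitespace
    /// runs and the comment, then returns the `Id` lexeme `a` unconsumed.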
    pub fn peek_non_whitespace(&mut self) -> (Span, Lexeme<'a>) {
        while let (
            _,
            Lexeme {
                token: Token::Whitespace,
                ..
            },
        ) = self.peek_non_comment()
        {
            self.eat_token();
        }
        self.peek_token()
    }
}

#[cfg(test)]
mod tests {
    use super::{
        statics::{REGEX_LIST, REGEX_SET},
        Lexeme, Scanner, Token, EOF,
    };

    #[test]
    fn test_tokens() {
        let scanner = Scanner::new("");
        assert_eq!(scanner.peek_token().1, EOF);

        macro_rules! test_matches {
            {$($tk: ident => $($pat: literal)+ $(!($($npat: literal)+))?)* } => {
                $(
                    $(
                        assert_eq!(
                            Scanner::new($pat).peek_token().1,
                            Lexeme { token: Token::$tk, value: $pat }
                        );
                    )+

                    $(
                        $(
                            assert_ne!(
                                Scanner::new($npat).peek_token().1,
                                Lexeme { token: Token::$tk, value: $npat }
                            );
                        )*
                    )?
                )*
            };
        }

        test_matches! {
            Id => "a" "a-" "A9-9-9-9" "a____a" !("9" "-" "_")
            Int => "+1" "1" "1234" "-1" !("+" "-")
            Float => "0." "-0.1234" "+0.12345" !("1" ".5" "-.5" ".")
            String => r#""hello!""# r#""""# r#""\"""# !(r#"""""# r#""\""#)
            // ensure no regressions because we have to escape these
            Dot => "." !("a")
            Star => "*"
            Plus => "+"
            Question => "?"
            Pipe => "|"
            BracketOpen => "["
            BracketClose => "]"
        }
    }

    macro_rules! lx {
        ($tk: ident, $lit: literal) => {
            Lexeme {
                token: Token::$tk,
                value: $lit,
            }
        };
    }

    #[test]
    fn test_eat() {
        let mut sc = Scanner::new("h3 h4#h5.h6 {}");
        assert_eq!(sc.eat_token().1, lx!(Id, "h3"));
        assert_eq!(sc.eat_token().1, lx!(Whitespace, " "));
        assert_eq!(sc.eat_token().1, lx!(Id, "h4"));
        assert_eq!(sc.eat_token().1, lx!(Hash, "#"));
        assert_eq!(sc.eat_token().1, lx!(Id, "h5"));
        assert_eq!(sc.eat_token().1, lx!(Dot, "."));
        assert_eq!(sc.eat_token().1, lx!(Id, "h6"));
        assert_eq!(sc.eat_token().1, lx!(Whitespace, " "));
        assert_eq!(sc.eat_token().1, lx!(BraceOpen, "{"));
        assert_eq!(sc.eat_token().1, lx!(BraceClose, "}"));
    }

    #[test]
    fn test_peek_whitespace() {
        let mut sc = Scanner::new("h3 h4#h5.h6 {}");
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h3"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h4"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Hash, "#"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h5"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Dot, "."));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h6"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(BraceOpen, "{"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(BraceClose, "}"));
    }

    #[test]
    fn test_whitespace_mix() {
        let mut sc = Scanner::new("h3 h4#h5.h6 {}");
        assert_eq!(sc.eat_token().1, lx!(Id, "h3"));
        assert_eq!(sc.eat_token().1, lx!(Whitespace, " "));
        assert_eq!(sc.eat_token().1, lx!(Id, "h4"));
        assert_eq!(sc.eat_token().1, lx!(Hash, "#"));
        assert_eq!(sc.eat_token().1, lx!(Id, "h5"));
        assert_eq!(sc.eat_token().1, lx!(Dot, "."));
        assert_eq!(sc.eat_token().1, lx!(Id, "h6"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(BraceOpen, "{"));
        assert_eq!(sc.eat_token().1, lx!(BraceClose, "}"));
    }

    #[test]
    fn test_comments() {
        let mut sc = Scanner::new(
            r"// Hello! This is a comment!
            b: a // and another! {
            {
            // } don't be fooled!
            }",
        );

        assert_eq!(sc.peek_non_whitespace().1, lx!(Id, "b"));
        sc.eat_token();
        assert_eq!(sc.peek_non_whitespace().1, lx!(Colon, ":"));
        sc.eat_token();
        assert_eq!(sc.peek_non_whitespace().1, lx!(Id, "a"));
        sc.eat_token();
        assert_eq!(sc.peek_non_whitespace().1, lx!(BraceOpen, "{"));
        sc.eat_token();
        assert_eq!(sc.eat_token().1.token, Token::Whitespace);
        assert_eq!(sc.eat_token().1, lx!(Comment, "// } don't be fooled!"));
        assert_eq!(sc.peek_non_whitespace().1, lx!(BraceClose, "}"));
        sc.eat_token();
        assert_eq!(sc.eat_token().1.token, Token::Eof);
    }

    #[test]
    fn all_regex_is_valid() {
        // Force both lazy statics to build so an invalid pattern fails
        // this test instead of panicking at first use.
        let _ = &*REGEX_SET;
        let _ = &*REGEX_LIST;
    }
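
    // A sketch added for illustration (not from the original suite): drive
    // the scanner to `Eof` the way a parser's main loop would.
    #[test]
    fn test_scan_until_eof() {
        let mut sc = Scanner::new("a: $x | trim();");
        let mut count = 0;
        loop {
            let (_, lexeme) = sc.eat_token();
            assert_ne!(lexeme.token, Token::Unknown);
            if lexeme.token == Token::Eof {
                break;
            }
            count += 1;
        }
        // 12 lexemes: `a` `:` ` ` `$` `x` ` ` `|` ` ` `trim` `(` `)` `;`
        assert_eq!(count, 12);
    }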
}