scrapelect/frontend/
parser.rs

1use core::fmt;
2use std::borrow::Cow;
3
4use super::{
5    arena::Arena,
6    ast::{
7        ArgList, Ast, AstRef, Element, Filter, FilterCall, FilterList, FilterSelect, Inline, Leaf,
8        Qualifier, RValue, Selector, SelectorCombinator, SelectorList, Statement, StatementList,
9    },
10    scanner::{Lexeme, Scanner, Span, Token},
11};
12
13#[derive(Debug)]
14pub struct Parser<'a> {
15    scanner: Scanner<'a>,
16    arena: Arena<Ast<'a>>,
17}
18
19#[derive(Debug, Clone)]
20#[non_exhaustive]
21pub enum ParseError {
22    UnexpectedToken {
23        expected: Vec<Token>,
24        got: Token,
25        value: String,
26        span: Span,
27    },
28}
29
30impl fmt::Display for ParseError {
31    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32        match self {
33            Self::UnexpectedToken {
34                expected,
35                got,
36                span,
37                value,
38            } => {
39                write!(
40                    f,
41                    "Expected one of {expected:?}, got {got:?} '{value}' on line {}",
42                    span.line
43                )
44            }
45        }
46    }
47}
48
49impl std::error::Error for ParseError {}
50
51impl ParseError {
52    /// Helper function to construct the `ParseError::UnexpectedToken` variant
53    /// from a [`Lexeme`] and a [`Span`] and expected values.
54    pub fn unexpected(expected: Vec<Token>, lx: Lexeme<'_>, span: Span) -> Self {
55        Self::UnexpectedToken {
56            expected,
57            got: lx.token,
58            value: lx.value.to_owned(),
59            span,
60        }
61    }
62}
63
64type Result<T> = std::result::Result<T, ParseError>;
65
66impl<'a> Parser<'a> {
67    #[must_use]
68    pub const fn new(input: &'a str) -> Self {
69        Self {
70            scanner: Scanner::new(input),
71            arena: Arena::new(),
72        }
73    }
74
75    pub fn parse(mut self) -> Result<(Arena<Ast<'a>>, Option<AstRef<'a, StatementList<'a>>>)> {
76        let r = match self.parse_statement_list() {
77            Ok(r) => r,
78            Err(e) => {
79                return Err(e);
80            }
81        };
82        self.try_eat(Token::Eof)?;
83        Ok((self.arena, r))
84    }
85
86    pub fn parse_statement_list(&mut self) -> Result<Option<AstRef<'a, StatementList<'a>>>> {
87        let (_, lx) = self.scanner.peek_non_whitespace();
88
89        if lx.token == Token::Id {
90            let statement = self.parse_statement()?;
91            let next = self.parse_statement_list()?;
92
93            Ok(Some(
94                self.arena
95                    .insert_variant(StatementList::new(statement, next)),
96            ))
97        } else {
98            Ok(None)
99        }
100    }
101
102    fn parse_statement(&mut self) -> Result<Statement<'a>> {
103        let id = self.try_eat(Token::Id)?.value;
104        self.try_eat(Token::Colon)?;
105        let value = self.parse_rvalue()?;
106        let filters = self.parse_filter_list()?;
107        self.try_eat(Token::Semi)?;
108        Ok(Statement { id, value, filters })
109    }
110
111    fn parse_rvalue(&mut self) -> Result<RValue<'a>> {
112        let (_, lx) = self.scanner.peek_non_whitespace();
113
114        match lx.token {
115            Token::Id | Token::Less | Token::Dot | Token::Hash => {
116                self.parse_element().map(RValue::Element)
117            }
118            _ => self.parse_leaf().map(RValue::Leaf),
119        }
120    }
121
122    fn parse_leaf(&mut self) -> Result<Leaf<'a>> {
123        self.scanner.peek_non_whitespace();
124        let (span, lx) = self.scanner.eat_token();
125        match lx.token {
126            Token::String => Ok(Leaf::String(parse_string_literal(lx.value))),
127            Token::Float => Ok(Leaf::Float(
128                lx.value.parse().expect("float literal invalid"),
129            )),
130            Token::Int => Ok(Leaf::Int(lx.value.parse().expect("int literal invalid"))),
131            Token::Dollar => {
132                let id = self.try_eat(Token::Id)?.value;
133                Ok(Leaf::Var(id))
134            }
135            _ => Err(ParseError::unexpected(
136                vec![Token::String, Token::Float, Token::Int, Token::Dollar],
137                lx,
138                span,
139            )),
140        }
141    }
142
143    #[inline]
144    fn try_eat(&mut self, tk: Token) -> Result<Lexeme<'a>> {
145        let (span, lx) = self.scanner.peek_non_whitespace();
146        self.scanner.eat_token();
147
148        if lx.token == tk {
149            Ok(lx)
150        } else {
151            Err(ParseError::unexpected(vec![tk], lx, span))
152        }
153    }
154
155    fn parse_element(&mut self) -> Result<Element<'a>> {
156        let url = self.parse_maybe_url()?;
157        let selector_head = self.parse_selector()?;
158        let selectors = self.parse_selector_list()?;
159
160        self.try_eat(Token::BraceOpen)?;
161
162        let statements = self.parse_statement_list()?;
163
164        self.try_eat(Token::BraceClose)?;
165
166        let qualifier = self.parse_qualifier()?;
167
168        Ok(Element {
169            url,
170            selector_head,
171            selectors,
172            qualifier,
173            statements,
174        })
175    }
176
177    fn parse_maybe_url(&mut self) -> Result<Option<Inline<'a>>> {
178        let (_, lx) = self.scanner.peek_non_whitespace();
179        if lx.token == Token::Less {
180            self.parse_inline().map(Some)
181        } else {
182            Ok(None)
183        }
184    }
185
186    fn parse_inline(&mut self) -> Result<Inline<'a>> {
187        self.try_eat(Token::Less)?;
188        let value = self.parse_leaf()?;
189        let filters = self.parse_filter_list()?;
190        self.try_eat(Token::Greater)?;
191        Ok(Inline { value, filters })
192    }
193
194    fn parse_value(&mut self) -> Result<Inline<'a>> {
195        let (span, lx) = self.scanner.peek_non_whitespace();
196        match lx.token {
197            Token::Less => self.parse_inline(),
198            Token::Dollar | Token::Int | Token::Float | Token::String => {
199                self.parse_leaf().map(|value| Inline {
200                    value,
201                    filters: None,
202                })
203            }
204            _ => Err(ParseError::unexpected(
205                vec![
206                    Token::Less,
207                    Token::Dollar,
208                    Token::Int,
209                    Token::Float,
210                    Token::String,
211                ],
212                lx,
213                span,
214            )),
215        }
216    }
217
218    fn parse_selector_list(&mut self) -> Result<Option<AstRef<'a, SelectorList<'a>>>> {
219        let mut item = self.scanner.peek_non_comment();
220        if item.1.token == Token::Whitespace {
221            self.scanner.eat_token();
222            let next = self.scanner.peek_non_whitespace();
223            // if the next lexeme after the whitespace doesn't signify a selector,
224            // the whitespace is not significant.
225            match next.1.token {
226                Token::Id | Token::Hash | Token::Dot | Token::Star => (),
227                _ => item = next,
228            };
229        }
230
231        let (span, lx) = item;
232
233        let sel = match lx.token {
234            Token::BraceOpen | Token::ParenOpen => return Ok(None),
235            // invariant: peek_next_whitespace is one of Id | Hash | Dot | Star
236            // whitespace is eaten in the above block.
237            Token::Whitespace => SelectorCombinator::Descendent(self.parse_selector()?),
238            Token::Greater => {
239                self.scanner.eat_token();
240                SelectorCombinator::Child(self.parse_selector()?)
241            }
242            Token::Plus => {
243                self.scanner.eat_token();
244                SelectorCombinator::NextSibling(self.parse_selector()?)
245            }
246            Token::Tilde => {
247                self.scanner.eat_token();
248                SelectorCombinator::SubsequentSibling(self.parse_selector()?)
249            }
250            Token::Hash | Token::Dot | Token::Id | Token::Star => {
251                SelectorCombinator::And(self.parse_selector()?)
252            }
253            _ => {
254                return Err(ParseError::unexpected(
255                    vec![
256                        Token::Whitespace,
257                        Token::Greater,
258                        Token::Plus,
259                        Token::Tilde,
260                        Token::Hash,
261                        Token::Dot,
262                        Token::Id,
263                        Token::Star,
264                    ],
265                    lx,
266                    span,
267                ))
268            }
269        };
270
271        let itm = SelectorList::new(sel, self.parse_selector_list()?);
272
273        Ok(Some(self.arena.insert_variant(itm)))
274    }
275
276    fn parse_selector(&mut self) -> Result<Selector<'a>> {
277        let (span, lx) = self.scanner.peek_non_whitespace();
278        match lx.token {
279            Token::Dot => {
280                self.scanner.eat_token();
281                self.try_eat(Token::Id).map(|lx| Selector::Class(lx.value))
282            }
283            Token::Hash => {
284                self.scanner.eat_token();
285                self.try_eat(Token::Id).map(|lx| Selector::Id(lx.value))
286            }
287            Token::Id => {
288                self.scanner.eat_token();
289                Ok(Selector::Tag(lx.value))
290            }
291            Token::Star => {
292                self.scanner.eat_token();
293                Ok(Selector::Any)
294            }
295            _ => Err(ParseError::unexpected(
296                vec![Token::Dot, Token::Hash, Token::Id, Token::Star],
297                lx,
298                span,
299            )),
300        }
301    }
302
303    fn parse_filter_list(&mut self) -> Result<Option<AstRef<'a, FilterList<'a>>>> {
304        let (_, lx) = self.scanner.peek_non_whitespace();
305        if lx.token == Token::Pipe {
306            self.scanner.eat_token();
307            let filter = self.parse_filter()?;
308            let next = self.parse_filter_list()?;
309            let qualifier = self.parse_qualifier()?;
310            let r = self
311                .arena
312                .insert_variant(FilterList::new(filter, qualifier, next));
313            Ok(Some(r))
314        } else {
315            Ok(None)
316        }
317    }
318
319    fn parse_filter(&mut self) -> Result<Filter<'a>> {
320        let (span, lx) = self.scanner.peek_non_whitespace();
321        self.scanner.eat_token();
322
323        match lx.token {
324            Token::Id => {
325                let id = lx.value;
326                self.try_eat(Token::ParenOpen)?;
327                let args = self.parse_arg_list()?;
328                self.try_eat(Token::ParenClose)?;
329                Ok(Filter::Call(FilterCall::new(id, args)))
330            }
331            Token::BracketOpen => {
332                let name = self.try_eat(Token::Id)?.value;
333                self.try_eat(Token::Colon)?;
334                let leaf = self.parse_leaf()?;
335                let filters = self.parse_filter_list()?;
336                self.try_eat(Token::BracketClose)?;
337                Ok(Filter::Select(FilterSelect::new(
338                    name,
339                    Inline {
340                        value: leaf,
341                        filters,
342                    },
343                )))
344            }
345            _ => Err(ParseError::unexpected(
346                vec![Token::Id, Token::BracketOpen],
347                lx,
348                span,
349            )),
350        }
351    }
352
353    fn parse_arg_list(&mut self) -> Result<Option<AstRef<'a, ArgList<'a>>>> {
354        let (span, lx) = self.scanner.peek_non_whitespace();
355        match lx.token {
356            Token::ParenClose => Ok(None),
357            Token::Id => {
358                let id = lx.value;
359                self.scanner.eat_token();
360                self.try_eat(Token::Colon)?;
361                let value = self.parse_value()?;
362                let next = match self.scanner.peek_non_whitespace().1.token {
363                    Token::Comma => {
364                        self.scanner.eat_token();
365                        self.parse_arg_list()?
366                    }
367                    _ => None,
368                };
369
370                let r = self.arena.insert_variant(ArgList::new(id, value, next));
371                Ok(Some(r))
372            }
373            _ => Err(ParseError::unexpected(
374                vec![Token::ParenClose, Token::Id],
375                lx,
376                span,
377            )),
378        }
379    }
380
381    fn parse_qualifier(&mut self) -> Result<Qualifier> {
382        let (_, lx) = self.scanner.peek_non_whitespace();
383        Ok(match lx.token {
384            Token::Question => {
385                self.scanner.eat_token();
386                Qualifier::Optional
387            }
388            Token::Star => {
389                self.scanner.eat_token();
390                Qualifier::Collection
391            }
392            _ => Qualifier::One,
393        })
394    }
395}
396
/// Decodes the escape sequences of a double-quoted string literal.
///
/// `s` is the raw lexeme *including* both surrounding quotes. The quotes are stripped and the
/// escapes `\n`, `\\` and `\"` are replaced by the characters they denote. Any other escaped
/// character is kept as-is (without the backslash) and a warning is printed to stderr. A lone
/// backslash at the very end of the contents is dropped.
///
/// The result borrows from `s` when the contents hold no backslash; otherwise a new `String`
/// is allocated.
///
/// # Panics
///
/// In debug builds, if `s` is not wrapped in double quotes. In all builds the quotes are
/// removed by byte index, so input that violates this may panic or lose its first and last
/// byte.
fn parse_string_literal(s: &str) -> Cow<'_, str> {
    // `starts_with`/`ends_with` never slice mid-character, so malformed input fails the
    // assertion itself instead of panicking on a char boundary inside the condition.
    debug_assert!(s.len() >= 2 && s.starts_with('"') && s.ends_with('"'));
    let inner = &s[1..s.len() - 1];

    // Fast path: nothing to decode, hand back a slice of the input.
    let Some(first_escape) = inner.find('\\') else {
        return Cow::Borrowed(inner);
    };

    // The decoded text is never longer than the raw contents.
    let mut out = String::with_capacity(inner.len());
    out.push_str(&inner[..first_escape]);

    let mut chars = inner[first_escape..].chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }

        match chars.next() {
            Some('n') => out.push('\n'),
            Some(literal @ ('\\' | '"')) => out.push(literal),
            Some(other) => {
                // TODO
                eprintln!("Unknown escape character {other:?}");
                out.push(other);
            }
            // Trailing backslash with nothing after it: dropped.
            None => {}
        }
    }

    Cow::Owned(out)
}
448
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::{parse_string_literal, Parser};
    use crate::frontend::ast::*;

    /// Renders a head selector plus its combinator list back into selector syntax so tests
    /// can compare against a plain string.
    fn fmt_selector<'a>(head: &Selector<'a>, list: &[&SelectorList<'a>]) -> String {
        use std::fmt::Write as _;
        let mut out = String::new();
        write!(&mut out, "{head}").expect("fmt error");
        for node in list {
            match &node.sel {
                SelectorCombinator::And(s) => write!(&mut out, "{s}"),
                SelectorCombinator::Child(s) => write!(&mut out, " > {s}"),
                SelectorCombinator::Descendent(s) => write!(&mut out, " {s}"),
                SelectorCombinator::NextSibling(s) => write!(&mut out, " + {s}"),
                SelectorCombinator::SubsequentSibling(s) => write!(&mut out, " ~ {s}"),
            }
            .expect("fmt error");
        }

        out
    }

    /// End-to-end parse of a small program: nested elements, a variable leaf, a filter chain
    /// with a trailing comma in the argument list, and a compound selector.
    #[test]
    fn test_parse() {
        let string = r#"a: h1 {
                x: $me | cat(i: "x", ) | meow();

                y: h2#x > .cat  {

                };
            };"#;
        let parser = Parser::new(string);
        let (arena, r) = parser.parse().expect("parsing failed");

        let stmts = arena.flatten(r);
        let stmt = &stmts[0].value;

        assert_eq!(stmt.id, "a");
        let RValue::Element(element) = &stmt.value else {
            panic!("expected element");
        };

        assert_eq!(
            fmt_selector(&element.selector_head, &arena.flatten(element.selectors)),
            "h1"
        );

        assert_eq!(element.qualifier, Qualifier::One);
        let statements = arena.flatten(element.statements);

        let stmt = &statements[0].value;

        assert!(
            matches!(
                stmt,
                Statement {
                    id: "x",
                    value: RValue::Leaf(Leaf::Var("me")),
                    ..
                }
            ),
            "found {stmt:?}",
        );

        let filters = arena.flatten(stmt.filters);
        assert!(
            matches!(
                &filters[..],
                [
                    FilterList {
                        filter: Filter::Call(FilterCall { id: "cat", .. }),
                        ..
                    },
                    FilterList {
                        filter: Filter::Call(FilterCall { id: "meow", .. }),
                        ..
                    }
                ]
            ),
            "found {filters:?}"
        );

        let Filter::Call(filter) = &filters[0].filter else {
            unreachable!("Validated as Filter::Call above");
        };
        let args = arena.flatten(filter.args);
        assert!(
            matches!(
                &args[..],
                [ArgList {
                    id: "i",
                    value: Inline {
                        value: Leaf::String(Cow::Borrowed("x")),
                        filters: None,
                    },
                    ..
                }]
            ),
            "found {:?}",
            &args[..]
        );

        let stmt = &statements[1].value;

        let RValue::Element(element) = &stmt.value else {
            panic!("Expected element");
        };

        assert!(element.statements.is_none());
        assert_eq!(
            fmt_selector(&element.selector_head, &arena.flatten(element.selectors)),
            "h2#x > .cat"
        );
    }

    /// Escape decoding: empty and plain literals, each supported escape, the borrowed fast
    /// path, non-ASCII contents, and the pass-through of unknown escapes.
    #[test]
    fn test_escape_strings() {
        assert_eq!(parse_string_literal(r#""""#), "");
        assert_eq!(parse_string_literal(r#""abcdef""#), "abcdef");
        assert_eq!(parse_string_literal(r#""hello! \n""#), "hello! \n");
        assert_eq!(
            parse_string_literal(r#""my \" crazy \\ lifestyle \\\"""#),
            r#"my " crazy \ lifestyle \""#
        );

        // Literals without escapes must not allocate.
        assert!(matches!(
            parse_string_literal(r#""abcdef""#),
            Cow::Borrowed("abcdef")
        ));
        // Multi-byte characters survive decoding.
        assert_eq!(parse_string_literal(r#""héllo \n wörld""#), "héllo \n wörld");
        // An unknown escape keeps the escaped character and drops the backslash.
        assert_eq!(parse_string_literal(r#""a\qb""#), "aqb");
    }
}