// xq_lang/lexer.rs

1use lexgen::lexer;
2use thiserror::Error;
3
/// Source position (byte offset / line / column) as reported by `lexgen`.
pub type Loc = lexgen_util::Loc;
/// Error type yielded by the lexer: `lexgen`'s wrapper specialized to our
/// custom [`LexicalError`].
pub type LexerError = lexgen_util::LexerError<LexicalError>;
6
/// Reserved words of the language.
///
/// [`Keyword::TryNoCatch`] is never produced by the raw lexer; the
/// post-processing pass rewrites a `Try` into it when no matching `catch`
/// follows, which is why both variants spell as `"try"`.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum Keyword {
    Or,
    And,
    Module,
    Import,
    Include,
    Def,
    As,
    Label,
    Break,
    Null,
    False,
    True,
    If,
    Then,
    Elif,
    Else,
    End,
    Try,
    TryNoCatch,
    Catch,
    Reduce,
    Foreach,
}
impl Keyword {
    /// Returns the source-text spelling of this keyword.
    #[must_use]
    pub const fn to_str(&self) -> &'static str {
        match self {
            Self::Or => "or",
            Self::And => "and",
            Self::Module => "module",
            Self::Import => "import",
            Self::Include => "include",
            Self::Def => "def",
            Self::As => "as",
            Self::Label => "label",
            Self::Break => "break",
            Self::Null => "null",
            Self::False => "false",
            Self::True => "true",
            Self::If => "if",
            Self::Then => "then",
            Self::Elif => "elif",
            Self::Else => "else",
            Self::End => "end",
            // Both try variants render identically; `TryNoCatch` only exists
            // to tell the parser that no `catch` clause follows.
            Self::Try | Self::TryNoCatch => "try",
            Self::Catch => "catch",
            Self::Reduce => "reduce",
            Self::Foreach => "foreach",
        }
    }
}
61
/// One piece of a string literal: either a run of plain source text borrowed
/// from the input, or a single character decoded from an escape sequence.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum StringFragment<'input> {
    /// Literal text taken verbatim from the input.
    String(&'input str),
    /// One character produced by an escape such as `\n` or `\uXXXX`.
    Char(char),
}
/// Tokens produced by the lexer, including the synthetic scope-end tokens
/// inserted by the post-processing pass in the `IntoIterator` impl on
/// [`Lexer`].
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum Token<'input> {
    // Arithmetic operators.
    Plus,
    Minus,
    Star,
    Slash,
    Percent,

    // Assignment / update operators.
    Eq,
    PlusEq,
    MinusEq,
    StarEq,
    SlashEq,
    PercentEq,
    SlashSlashEq,
    PipeEq,

    // Comparison operators.
    EqEq,
    NotEq,
    LtEq,
    GtEq,
    Lt,
    Gt,

    // Punctuation.
    Comma,
    Dot,
    Semicolon,
    Colon,
    DotDot,

    // Pipe / alternative operators.
    Pipe,
    Question,
    /// `//`
    SlashSlash,
    /// `?//`
    QuestionSlashSlash,

    // Delimiters.
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,

    // A string literal is emitted as StringStart .. StringEnd with fragments
    // and (possibly nested) interpolation sequences in between.
    StringStart,
    StringFragment(StringFragment<'input>),
    /// `\(` inside a string literal.
    InterpolationStart,
    /// The `)` that returns from an interpolation to string lexing.
    InterpolationEnd,
    StringEnd,

    // Words and literals.
    Keyword(Keyword),
    /// `.name` field access; the payload excludes the leading dot.
    Field(&'input str),
    Identifier(&'input str),
    /// `mod::name` (possibly with several `::` segments).
    ModuleIdentifier(&'input str),
    /// `$name`; the payload excludes the `$`.
    Variable(&'input str),
    /// `$mod::name`; the payload excludes the `$`.
    ModuleVariable(&'input str),
    /// `@name`; the payload excludes the `@`.
    Format(&'input str),
    Number(crate::Number),

    // Post-processed
    /// Synthetic: end of the scope in which a `def` is visible.
    DefScopeEnd,
    /// Synthetic: end of a `label` scope.
    LabelScopeEnd,
    /// Synthetic: end of an `as` pattern-binding scope.
    BindScopeEnd,
}
129
/// Lexing errors specific to this language, carried inside [`LexerError`] as
/// the custom error kind.
#[derive(Debug, Clone, Eq, PartialEq, Error)]
pub enum LexicalError {
    /// A closing delimiter did not match the innermost open one.
    #[error("Unmatching open {0:?} and close {1:?}")]
    UnmatchingOpenClose(OpenCloseType, OpenCloseType),
    #[error("Expected `{0}` but got `{1}`")]
    UnexpectedToken(String, String),
    #[error("Encountered an unexpected escaped character `\\{0}`")]
    InvalidEscape(char),
    /// A closing token appeared with nothing open (e.g. `catch` with no `try`).
    #[error("Expected token `{0}`")]
    OrphanToken(String),
    #[error("No matching open for close {0:?}")]
    TooManyClose(OpenCloseType),
    /// A `\uXXXX` escape (or surrogate pair) decoded to a non-scalar value.
    #[error("Invalid unicode scalar value: `{0}`")]
    InvalidUnicodeScalar(u32),
    #[error("Unable to parse number: `{0}`")]
    InvalidNumber(String),
    // NOTE(review): not constructed anywhere in this file; presumably a
    // catch-all reserved for internal inconsistencies — confirm before use.
    #[error("Something went wrong")]
    InvalidState,
}
149
/// A scope tracked on the stack during the token post-processing pass.
enum ContextType<'input> {
    /// parenthesis, braces, brackets, interpolation, if-end, etc.
    /// The payload is the exact token expected to close this scope.
    Balancing(Token<'input>),
    /// def to the end of the scope, catch to the end of the following query, etc.
    /// The payload is the synthetic terminator token emitted when the scope
    /// auto-closes.
    AutoCloseAndEmit(Token<'input>),
    /// try but haven't get catch yet
    /// The payload is the index of the emitted `try` keyword in the output
    /// vector, so it can be rewritten to `TryNoCatch` if no `catch` arrives.
    Try(usize),
}
/// Tokenizer over a borrowed input string.
///
/// Construct with [`Lexer::new`], then consume via [`IntoIterator`], which
/// runs the raw `lexgen`-generated lexer followed by a post-processing pass
/// that balances scopes and emits the synthetic scope-end tokens.
pub struct Lexer<'input> {
    // The full source text; lexing is deferred until iteration.
    input: &'input str,
}

impl<'input> Lexer<'input> {
    /// Creates a lexer over `input`. No work is done until iteration starts.
    #[must_use]
    pub const fn new(input: &'input str) -> Self {
        Self { input }
    }
}
168
impl<'input> IntoIterator for Lexer<'input> {
    type Item = Result<(Loc, Token<'input>, Loc), LexerError>;

    type IntoIter = <Vec<Self::Item> as IntoIterator>::IntoIter;

    /// Lexes the whole input eagerly, then post-processes the token stream:
    /// balances delimiter scopes, resolves `try`/`catch` pairing (rewriting a
    /// catch-less `try` to [`Keyword::TryNoCatch`] in place), and inserts the
    /// synthetic [`Token::DefScopeEnd`] / [`Token::LabelScopeEnd`] /
    /// [`Token::BindScopeEnd`] tokens where those scopes end.
    fn into_iter(self) -> Self::IntoIter {
        // Accumulator for the post-processing pass.
        #[derive(Default)]
        struct State<'input> {
            // Tokens (or errors) emitted so far.
            ret: Vec<Result<(Loc, Token<'input>, Loc), LexerError>>,
            // Start location of the most recently seen token; synthetic
            // tokens and EOF errors are attributed to this position.
            pos: Loc,
            // Currently open scopes, innermost last.
            stack: Vec<ContextType<'input>>,
        }
        impl<'input> State<'input> {
            fn track_pos(&mut self, pos: Loc) {
                self.pos = pos;
            }
            fn open(&mut self, ty: ContextType<'input>) {
                self.stack.push(ty);
            }
            // Opens a scope that must later be closed by exactly `token`.
            fn open_balancing(&mut self, token: Token<'input>) {
                self.open(ContextType::Balancing(token));
            }
            // Handles `catch`: auto-closes scopes (emitting their terminator
            // tokens) down to the innermost `Try`, which is popped. Hitting a
            // balancing scope first, or emptying the stack, is an error.
            fn close_to_try(&mut self) -> Result<(), LexicalError> {
                while let Some(item) = self.stack.last() {
                    match item {
                        ContextType::Balancing(token) => {
                            return Err(LexicalError::UnexpectedToken(
                                format!("{token:?}"),
                                "catch".to_string(),
                            ))
                        }
                        ContextType::AutoCloseAndEmit(term) => {
                            self.ret.push(Ok((self.pos, term.clone(), self.pos)));
                            self.stack.pop();
                        }
                        ContextType::Try(_) => {
                            self.stack.pop();
                            return Ok(());
                        }
                    }
                }
                Err(LexicalError::OrphanToken("catch".to_string()))
            }
            // Pops every auto-close scope on top of the stack, emitting its
            // terminator token. A `Try` found here never got its `catch`, so
            // the already-emitted `try` keyword at the recorded index is
            // rewritten to `TryNoCatch` in place. Returns the expected closer
            // of the innermost remaining balancing scope, if any.
            fn close_autoclose(&mut self) -> Option<Token<'input>> {
                while let Some(item) = self.stack.last() {
                    match item {
                        ContextType::Balancing(token) => return Some(token.clone()),
                        ContextType::AutoCloseAndEmit(term) => {
                            self.ret.push(Ok((self.pos, term.clone(), self.pos)));
                            self.stack.pop();
                        }
                        ContextType::Try(i) => {
                            if let Some(Ok((_, Token::Keyword(ref mut keyword), _))) =
                                self.ret.get_mut(*i).map(Result::as_mut)
                            {
                                *keyword = Keyword::TryNoCatch;
                                self.stack.pop();
                            } else {
                                // The index is recorded just before the `try`
                                // keyword is pushed, so it must point at an
                                // `Ok` keyword entry; anything else is a bug.
                                panic!("Something went wrong with parsing try catch");
                            }
                        }
                    }
                }
                None
            }
            // Closes the innermost balancing scope, which must be waiting for
            // exactly `token`; auto-close scopes above it are flushed first.
            fn close_balancing(&mut self, token: &Token<'input>) -> Result<(), LexicalError> {
                self.close_autoclose();
                match self.stack.last() {
                    Some(ContextType::Balancing(expected)) => {
                        if token == expected {
                            self.stack.pop();
                            Ok(())
                        } else {
                            Err(LexicalError::UnexpectedToken(
                                format!("{expected:?}"),
                                format!("{token:?}"),
                            ))
                        }
                    }
                    // close_autoclose() leaves only a Balancing scope (or
                    // nothing) on top of the stack.
                    Some(ContextType::AutoCloseAndEmit(_)) => unreachable!(),
                    Some(ContextType::Try(_)) => unreachable!(),
                    None => Err(LexicalError::OrphanToken(format!("{token:?}"))),
                }
            }
            // Non-fatal variant of close_balancing: flushes auto-close scopes
            // and pops the balancing scope only if `token` matches it.
            // Returns whether a scope was popped.
            fn flush_or_close(&mut self, token: &Token<'input>) -> bool {
                self.close_autoclose();
                match self.stack.last() {
                    Some(ContextType::Balancing(expected)) => {
                        if token == expected {
                            self.stack.pop();
                            return true;
                        }
                    }
                    Some(ContextType::AutoCloseAndEmit(_)) => unreachable!(),
                    Some(ContextType::Try(_)) => unreachable!(),
                    None => {}
                }
                false
            }
            // Pops the innermost scope only if it is a balancing scope waiting
            // for exactly `token`; performs no auto-close flushing.
            fn try_close_without_flush(&mut self, token: &Token<'input>) -> bool {
                if let Some(ContextType::Balancing(expected)) = self.stack.last() {
                    if token == expected {
                        self.stack.pop();
                        return true;
                    }
                }
                false
            }

            // Updates the scope stack for a single token, possibly emitting
            // synthetic tokens or failing on mismatched delimiters.
            fn handle_token(&mut self, token: &Token<'input>) -> Result<(), LexicalError> {
                match token {
                    Token::LParen => self.open_balancing(Token::RParen),
                    Token::LBrace => self.open_balancing(Token::RBrace),
                    Token::LBracket => self.open_balancing(Token::RBracket),
                    Token::Keyword(Keyword::If) => {
                        self.open_balancing(Token::Keyword(Keyword::End))
                    }
                    Token::RParen
                    | Token::RBrace
                    | Token::RBracket
                    | Token::Keyword(Keyword::End) => {
                        self.close_balancing(token)?;
                    }
                    Token::Semicolon
                    | Token::Colon
                    | Token::Keyword(Keyword::Then | Keyword::Elif | Keyword::Else) => {
                        // These end the current sub-expression; any def/label/
                        // as scopes opened inside it auto-close here.
                        self.flush_or_close(token);
                    }
                    Token::Keyword(Keyword::Def) => {
                        // def _ (_; _; ...): _; _
                        // The Colon scope (pushed last, closed first) ends at
                        // `:`, the Semicolon scope at the `;` ending the body;
                        // DefScopeEnd is emitted when the enclosing scope ends.
                        self.open(ContextType::AutoCloseAndEmit(Token::DefScopeEnd));
                        self.open_balancing(Token::Semicolon);
                        self.open_balancing(Token::Colon);
                    }
                    Token::Keyword(Keyword::Try) => {
                        // Record where the `try` keyword will land in `ret` so
                        // it can be rewritten if no `catch` follows.
                        self.open(ContextType::Try(self.ret.len()));
                    }
                    Token::Keyword(Keyword::Catch) => {
                        self.close_to_try()?;
                    }
                    Token::Keyword(Keyword::Reduce | Keyword::Foreach) => {
                        self.open_balancing(Token::Keyword(Keyword::As));
                    }
                    Token::Keyword(Keyword::As) => {
                        // Either the `as` belonging to reduce/foreach (closing
                        // the scope opened above), or a pattern binding that
                        // stays open until the enclosing scope ends.
                        if !self.try_close_without_flush(token) {
                            self.open(ContextType::AutoCloseAndEmit(Token::BindScopeEnd));
                        }
                    }
                    Token::Keyword(Keyword::Label) => {
                        self.open(ContextType::AutoCloseAndEmit(Token::LabelScopeEnd));
                    }
                    _ => {}
                }
                Ok(())
            }
            // Feeds one raw lexer item through the scope tracker and records
            // the (possibly error-wrapped) result; raw lexer errors are
            // passed through untouched.
            fn handle_item(&mut self, item: Result<(Loc, Token<'input>, Loc), LexerError>) {
                match item {
                    Ok((l, token, r)) => {
                        self.track_pos(l);
                        let to_push =
                            self.handle_token(&token)
                                .map(|_| (l, token, r))
                                .map_err(|e| LexerError {
                                    kind: lexgen_util::LexerErrorKind::Custom(e),
                                    location: l,
                                });
                        self.ret.push(to_push);
                    }
                    Err(_) => {
                        self.ret.push(item);
                    }
                }
            }
            // Flushes remaining auto-close scopes at EOF; a leftover
            // balancing scope means an unmatched opener and becomes an error.
            fn finish(mut self) -> Vec<Result<(Loc, Token<'input>, Loc), LexerError>> {
                if let Some(token) = self.close_autoclose() {
                    self.ret.push(Err(LexerError {
                        kind: lexgen_util::LexerErrorKind::Custom(LexicalError::UnexpectedToken(
                            format!("{token:?}"),
                            "EOF".to_string(),
                        )),
                        location: self.pos,
                    }));
                }
                self.ret
            }
        }

        let lexer = LexerImpl::new(self.input);
        let mut state = State::default();

        for item in lexer {
            state.handle_item(item);
        }
        state.finish().into_iter()
    }
}
365
/// Delimiter kinds tracked by the raw lexer, mainly so that `)` can be told
/// apart from the `)` that closes a string interpolation.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum OpenCloseType {
    /// `(` and `)`
    Parenthesis,
    /// `[` and `]`
    Bracket,
    /// `{` and `}`
    Brace,
    /// `\(` and `)`
    Interpolation,
}
377
378#[derive(Debug, Default)]
379struct LexerState {
380    stack: Vec<OpenCloseType>,
381}
382impl LexerState {
383    fn open(&mut self, ty: OpenCloseType) {
384        self.stack.push(ty);
385    }
386    fn current_type(&self) -> Option<OpenCloseType> {
387        self.stack.last().cloned()
388    }
389    fn close(&mut self, ty: OpenCloseType) -> Result<(), LexicalError> {
390        if let Some(open) = self.current_type() {
391            if open == ty {
392                self.stack.pop();
393                Ok(())
394            } else {
395                Err(LexicalError::UnmatchingOpenClose(open, ty))
396            }
397        } else {
398            Err(LexicalError::TooManyClose(ty))
399        }
400    }
401}
402
// Emits `$keyword` as a keyword token, except when lexing directly inside
// `{ ... }`, where the matched text is emitted as a plain identifier instead
// (so a keyword spelling can still appear where braces expect a name).
macro_rules! handle_keyword {
    ($lexer: expr, $keyword: expr) => {
        if $lexer.state().current_type() == Some(OpenCloseType::Brace) {
            // If here is a direct child of braces, it is used as an identifier.
            $lexer.return_(Token::Identifier($lexer.match_()))
        } else {
            $lexer.return_(Token::Keyword($keyword))
        }
    };
}
413
// Raw tokenizer, generated by `lexgen`. It tracks only enough state
// (`LexerState`) to pair `(` / `)` with `\(` interpolations and to decide
// when `)` should switch back to string lexing; all other scope balancing
// happens in the post-processing pass above.
lexer! {
    LexerImpl(LexerState) -> Token<'input>;
    type Error = LexicalError;

    // `#` starts a comment that runs to the end of the line.
    let ws = [' ' '\t' '\n'] | "\r\n";
    let comment = '#' (_ # ['\r' '\n'])*;
    let ident_start = ['a'-'z' 'A'-'Z' '_'];
    let digit = ['0'-'9'];
    let hex_digit = $digit | ['a'-'f' 'A'-'F'];
    let ident_follow = $ident_start | $digit;

    // Normal (non-string) lexing.
    rule Init {
        $ws,
        $comment,
        '+' = Token::Plus,
        '-' = Token::Minus,
        '*' = Token::Star,
        '/' = Token::Slash,
        '%' = Token::Percent,

        '=' = Token::Eq,
        "+=" = Token::PlusEq,
        "-=" = Token::MinusEq,
        "*=" = Token::StarEq,
        "/=" = Token::SlashEq,
        "%=" = Token::PercentEq,
        "//=" = Token::SlashSlashEq,
        "|=" = Token::PipeEq,

        "==" = Token::EqEq,
        "!=" = Token::NotEq,
        "<=" = Token::LtEq,
        ">=" = Token::GtEq,
        '<' = Token::Lt,
        '>' = Token::Gt,

        ',' = Token::Comma,
        '.' = Token::Dot,
        ';' = Token::Semicolon,
        ':' = Token::Colon,
        ".." = Token::DotDot,

        '|' = Token::Pipe,
        '?' = Token::Question,
        "//" = Token::SlashSlash,
        "?//" = Token::QuestionSlashSlash,

        '(' => |lexer| {
            lexer.state().open(OpenCloseType::Parenthesis);
            lexer.return_(Token::LParen)
        },
        // `)` either ends an interpolation (returning to string lexing) or
        // closes an ordinary parenthesis; the state stack decides which.
        ')' =? |lexer| {
            if lexer.state().current_type() == Some(OpenCloseType::Interpolation) {
                let token = lexer.state().close(OpenCloseType::Interpolation).map(|_| Token::InterpolationEnd);
                lexer.switch_and_return(LexerImplRule::InString, token)
            } else {
                let token = lexer.state().close(OpenCloseType::Parenthesis).map(|_| Token::RParen);
                lexer.return_(token)
            }
        },
        '{' => |lexer| {
            lexer.state().open(OpenCloseType::Brace);
            lexer.return_(Token::LBrace)
        },
        '}' =? |lexer| {
            let token = lexer.state().close(OpenCloseType::Brace).map(|_| Token::RBrace);
            lexer.return_(token)
        },
        '[' => |lexer| {
            lexer.state().open(OpenCloseType::Bracket);
            lexer.return_(Token::LBracket)
        },
        ']' =? |lexer| {
            let token = lexer.state().close(OpenCloseType::Bracket).map(|_| Token::RBracket);
            lexer.return_(token)
        },

        // Keywords; directly inside braces they lex as identifiers instead
        // (see `handle_keyword!`).
        "or"      => |lexer| handle_keyword!(lexer, Keyword::Or),
        "and"     => |lexer| handle_keyword!(lexer, Keyword::And),
        "module"  => |lexer| handle_keyword!(lexer, Keyword::Module),
        "import"  => |lexer| handle_keyword!(lexer, Keyword::Import),
        "include" => |lexer| handle_keyword!(lexer, Keyword::Include),
        "def"     => |lexer| handle_keyword!(lexer, Keyword::Def),
        "as"      => |lexer| handle_keyword!(lexer, Keyword::As),
        "label"   => |lexer| handle_keyword!(lexer, Keyword::Label),
        "break"   => |lexer| handle_keyword!(lexer, Keyword::Break),
        "if"      => |lexer| handle_keyword!(lexer, Keyword::If),
        "then"    => |lexer| handle_keyword!(lexer, Keyword::Then),
        "elif"    => |lexer| handle_keyword!(lexer, Keyword::Elif),
        "else"    => |lexer| handle_keyword!(lexer, Keyword::Else),
        "end"     => |lexer| handle_keyword!(lexer, Keyword::End),
        "try"     => |lexer| handle_keyword!(lexer, Keyword::Try),
        "catch"   => |lexer| handle_keyword!(lexer, Keyword::Catch),
        "reduce"  => |lexer| handle_keyword!(lexer, Keyword::Reduce),
        "foreach" => |lexer| handle_keyword!(lexer, Keyword::Foreach),

        // These three keywords can be in a value of an object entry.
        // Since we don't care this in the post processing anyway,
        // we always treat them as a keyword.
        "null"    = Token::Keyword(Keyword::Null),
        "false"   = Token::Keyword(Keyword::False),
        "true"    = Token::Keyword(Keyword::True),

        // `.foo` — field access; the leading dot is stripped.
        '.' $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Field(&lexer.match_()[1..]))
        },
        // Note: the `::`-qualified rules below take precedence when they
        // apply — presumably via lexgen's longest-match rule; confirm.
        $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Identifier(lexer.match_()))
        },
        $ident_start $ident_follow* ("::" $ident_start $ident_follow*)+ => |lexer| {
            lexer.return_(Token::ModuleIdentifier(lexer.match_()))
        },
        // `$name` / `$mod::name`; the `$` is stripped.
        '$' $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Variable(&lexer.match_()[1..]))
        },
        '$' $ident_start $ident_follow* ("::" $ident_start $ident_follow*)+ => |lexer| {
            lexer.return_(Token::ModuleVariable(&lexer.match_()[1..]))
        },
        // `@name` format; the `@` is stripped.
        '@' $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Format(&lexer.match_()[1..]))
        },
        // Integer or decimal (leading/trailing-dot forms allowed) with an
        // optional exponent; parse failures become InvalidNumber.
        ($digit+ | $digit+ '.' $digit* | $digit* '.' $digit+) (['e' 'E'] (['+' '-']? $digit+))? =? |lexer| {
            use std::str::FromStr;
            let parsed = crate::Number::from_str(lexer.match_())
                .map_err(|_| LexicalError::InvalidNumber(lexer.match_().to_string()))
                .map(Token::Number);
            lexer.return_(parsed)
        },
        // `"` begins a string literal; lexing continues in InString.
        '"' => |lexer| {
            lexer.switch_and_return(LexerImplRule::InString, Token::StringStart)
        },
    }
    // Inside a string literal (between `"` and `"`, outside interpolations).
    rule InString {
        "\\n" = Token::StringFragment(StringFragment::Char('\n')),
        "\\r" = Token::StringFragment(StringFragment::Char('\r')),
        "\\t" = Token::StringFragment(StringFragment::Char('\t')),
        "\\b" = Token::StringFragment(StringFragment::Char('\u{08}')),
        "\\f" = Token::StringFragment(StringFragment::Char('\u{0C}')),
        "\\\\" = Token::StringFragment(StringFragment::Char('\\')),
        "\\/" = Token::StringFragment(StringFragment::Char('/')),
        "\\\"" = Token::StringFragment(StringFragment::Char('"')),
        // UTF-16 surrogate pair: a high-surrogate escape (\uD800..\uDBFF —
        // second hex digit restricted to 8..B) immediately followed by a
        // low-surrogate escape (\uDC00..\uDFFF), combined into one scalar
        // value above U+FFFF.
        "\\u" ['d' 'D'] ['8' '9' 'a' 'b' 'A' 'B'] $hex_digit $hex_digit "\\u" ['d' 'D'] ['c'-'f' 'C'-'F'] $hex_digit $hex_digit =? |lexer| {
            let higher_surrogate = u32::from_str_radix(&lexer.match_()[2..6], 16).unwrap();
            let lower_surrogate = u32::from_str_radix(&lexer.match_()[8..12], 16).unwrap();
            assert!((0xD800..0xDC00).contains(&higher_surrogate));
            assert!((0xDC00..=0xDFFF).contains(&lower_surrogate));
            let value = (((higher_surrogate - 0xD800) as u32) << 10 | (lower_surrogate - 0xDC00) as u32) + 0x1_0000;
            match char::from_u32(value) {
                Some(c) => {
                    lexer.return_(Ok(Token::StringFragment(StringFragment::Char(c))))
                }
                None => lexer.return_(Err(LexicalError::InvalidUnicodeScalar(value)))
            }
        },
        // Plain `\uXXXX` escape; lone surrogates fail char::from_u32 and
        // become InvalidUnicodeScalar.
        "\\u" $hex_digit $hex_digit $hex_digit $hex_digit =? |lexer| {
            let value = u32::from_str_radix(&lexer.match_()[2..], 16).unwrap();
            match char::from_u32(value) {
                Some(c) => {
                    lexer.return_(Ok(Token::StringFragment(StringFragment::Char(c))))
                }
                None => lexer.return_(Err(LexicalError::InvalidUnicodeScalar(value)))
            }
        },
        // `\(` starts an interpolation: switch back to normal lexing until
        // the matching `)`.
        "\\(" => |lexer| {
            lexer.state().open(OpenCloseType::Interpolation);
            lexer.switch_and_return(LexerImplRule::Init, Token::InterpolationStart)
        },
        '"' => |lexer| {
            lexer.switch_and_return(LexerImplRule::Init, Token::StringEnd)
        },
        // Any other escaped character is rejected.
        '\\' _ =? |lexer| {
            lexer.return_(Err(LexicalError::InvalidEscape(lexer.match_().chars().nth(1).unwrap())))
        },
        // A maximal run of plain characters (no backslash, no quote).
        (_ # ['\\' '"'])+ => |lexer| {
            lexer.return_(Token::StringFragment(StringFragment::String(lexer.match_())))
        },
    }
}
592
#[cfg(test)]
mod test {
    use super::{Lexer, StringFragment, Token};

    /// Builds the token for a run of literal string text.
    fn string_fragment(s: &str) -> Token {
        Token::StringFragment(StringFragment::String(s))
    }

    /// Lexes `q`, panicking on any lexer error, and checks that the token
    /// stream (locations discarded) matches `expected_tokens` exactly.
    fn assert_lex(q: &str, expected_tokens: &[Token]) {
        let mut tokens = Vec::new();
        for item in Lexer::new(q) {
            let (_, token, _) = item.unwrap();
            tokens.push(token);
        }
        assert_eq!(tokens.as_slice(), expected_tokens);
    }

    #[test]
    fn test_ident() {
        assert_lex(r#"abc"#, &[Token::Identifier("abc")]);
        assert_lex(r#"abc12"#, &[Token::Identifier("abc12")]);
        assert_lex(
            r#"ab ab12"#,
            &[Token::Identifier("ab"), Token::Identifier("ab12")],
        );
        assert_lex(
            r#"ab_ ab_12"#,
            &[Token::Identifier("ab_"), Token::Identifier("ab_12")],
        );
    }

    #[test]
    fn test_string() {
        assert_lex(
            r#""abc""#,
            &[Token::StringStart, string_fragment("abc"), Token::StringEnd],
        );
    }

    #[test]
    fn test_string_interpolation() {
        // Nested interpolations: `\(` re-enters normal lexing and the
        // matching `)` switches back to string lexing.
        assert_lex(
            r#"(ab"(\(a"\()")))")"#,
            &[
                Token::LParen,
                Token::Identifier("ab"),
                Token::StringStart,
                string_fragment("("),
                Token::InterpolationStart,
                Token::Identifier("a"),
                Token::StringStart,
                Token::InterpolationStart,
                Token::InterpolationEnd,
                Token::StringEnd,
                Token::InterpolationEnd,
                string_fragment("))"),
                Token::StringEnd,
                Token::RParen,
            ],
        );
    }

    #[test]
    fn test_number() {
        // Integer, exponent, decimal, and bare leading-dot forms.
        let expected: Vec<Token> = [2.0, 12.0, 1000.0, 1.5, 0.2, 0.03]
            .iter()
            .map(|&x| Token::Number(x.into()))
            .collect();
        assert_lex(r#"2 12 1e3 1.5 .2 .3e-1"#, &expected);
    }

    #[test]
    fn test_comment() {
        // `#` comments run to end-of-line, even inside an interpolation.
        assert_lex(
            r#""\(
            1# This
            + # is
            2  #
            )"  # comment"#,
            &[
                Token::StringStart,
                Token::InterpolationStart,
                Token::Number(1.0.into()),
                Token::Plus,
                Token::Number(2.0.into()),
                Token::InterpolationEnd,
                Token::StringEnd,
            ],
        );
    }
}