squawk_lexer/
lib.rs

1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
// ident_start		[A-Za-z\200-\377_]
//
// Note that `\200-\377` in the Postgres grammar is a range of *bytes*: every
// byte of a multibyte UTF-8 sequence has its high bit set, so in effect any
// non-ASCII character may start an identifier. This lexer works on `char`s,
// so the equivalent test is "any code point that is not ASCII" rather than
// the Latin-1 range U+0080..=U+00FF.
/// Returns true if `c` may be the first character of an unquoted identifier.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_') || !c.is_ascii()
}
11
// ident_cont		[A-Za-z\200-\377_0-9\$]
//
// As with `ident_start`, `\200-\377` is a *byte* range in the Postgres
// grammar, which on UTF-8 input means "any non-ASCII character". Matching
// only U+0080..=U+00FF would cut identifiers short at the first character
// outside Latin-1.
/// Returns true if `c` may appear after the first character of an unquoted
/// identifier.
const fn is_ident_cont(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$') || !c.is_ascii()
}
16
// see:
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229
/// Returns true for the six characters the lexer treats as whitespace:
/// space, tab, newline, vertical tab, form feed and carriage return.
const fn is_whitespace(c: char) -> bool {
    // Tab (U+0009), newline (U+000A), vertical tab (U+000B), form feed
    // (U+000C) and carriage return (U+000D) are contiguous code points, so
    // a single range covers all of them.
    matches!(c, ' ' | '\t'..='\r')
}
31
32impl Cursor<'_> {
33    // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
34    pub(crate) fn advance_token(&mut self) -> Token {
35        let Some(first_char) = self.bump() else {
36            return Token::new(TokenKind::Eof, 0);
37        };
38        let token_kind = match first_char {
39            // Slash, comment or block comment.
40            '/' => match self.first() {
41                '*' => self.block_comment(),
42                _ => TokenKind::Slash,
43            },
44            '-' => match self.first() {
45                '-' => self.line_comment(),
46                _ => TokenKind::Minus,
47            },
48
49            // // Whitespace sequence.
50            c if is_whitespace(c) => self.whitespace(),
51
52            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
53            'u' | 'U' => match self.first() {
54                '&' => {
55                    self.bump();
56                    self.prefixed_string(
57                        |terminated| LiteralKind::UnicodeEscStr { terminated },
58                        true,
59                    )
60                }
61                _ => self.ident_or_unknown_prefix(),
62            },
63
64            // escaped strings
65            'e' | 'E' => {
66                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67            }
68
69            // bit string
70            'b' | 'B' => {
71                self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72            }
73
74            // hexadecimal byte string
75            'x' | 'X' => {
76                self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77            }
78
79            // Identifier (this should be checked after other variant that can
80            // start as identifier).
81            c if is_ident_start(c) => self.ident(),
82
83            // Numeric literal.
84            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
85            c @ '0'..='9' => {
86                let literal_kind = self.number(c);
87                TokenKind::Literal { kind: literal_kind }
88            }
89            '.' => match self.first() {
90                '0'..='9' => {
91                    let literal_kind = self.number('.');
92                    TokenKind::Literal { kind: literal_kind }
93                }
94                _ => TokenKind::Dot,
95            },
96            // One-symbol tokens.
97            ';' => TokenKind::Semi,
98            ',' => TokenKind::Comma,
99            '(' => TokenKind::OpenParen,
100            ')' => TokenKind::CloseParen,
101            '[' => TokenKind::OpenBracket,
102            ']' => TokenKind::CloseBracket,
103            '@' => TokenKind::At,
104            '#' => TokenKind::Pound,
105            '~' => TokenKind::Tilde,
106            '?' => TokenKind::Question,
107            ':' => TokenKind::Colon,
108            '$' => {
109                // Dollar quoted strings
110                if is_ident_start(self.first()) || self.first() == '$' {
111                    self.dollar_quoted_string()
112                } else {
113                    // Parameters
114                    while self.first().is_ascii_digit() {
115                        self.bump();
116                    }
117                    TokenKind::PositionalParam
118                }
119            }
120            '`' => TokenKind::Backtick,
121            '=' => TokenKind::Eq,
122            '!' => TokenKind::Bang,
123            '<' => TokenKind::Lt,
124            '>' => TokenKind::Gt,
125            '&' => TokenKind::And,
126            '|' => TokenKind::Or,
127            '+' => TokenKind::Plus,
128            '*' => TokenKind::Star,
129            '^' => TokenKind::Caret,
130            '%' => TokenKind::Percent,
131
132            // String literal
133            '\'' => {
134                let terminated = self.single_quoted_string();
135                let kind = LiteralKind::Str { terminated };
136                TokenKind::Literal { kind }
137            }
138
139            // Quoted indentifiers
140            '"' => {
141                let terminated = self.double_quoted_string();
142                TokenKind::QuotedIdent { terminated }
143            }
144            _ => TokenKind::Unknown,
145        };
146        let res = Token::new(token_kind, self.pos_within_token());
147        self.reset_pos_within_token();
148        res
149    }
150    pub(crate) fn ident(&mut self) -> TokenKind {
151        self.eat_while(is_ident_cont);
152        TokenKind::Ident
153    }
154
155    pub(crate) fn whitespace(&mut self) -> TokenKind {
156        self.eat_while(is_whitespace);
157        TokenKind::Whitespace
158    }
159
160    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
161        // Start is already eaten, eat the rest of identifier.
162        self.eat_while(is_ident_cont);
163        // Known prefixes must have been handled earlier. So if
164        // we see a prefix here, it is definitely an unknown prefix.
165        match self.first() {
166            '#' | '"' | '\'' => TokenKind::UnknownPrefix,
167            _ => TokenKind::Ident,
168        }
169    }
170
171    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227
172    // comment			("--"{non_newline}*)
173    pub(crate) fn line_comment(&mut self) -> TokenKind {
174        self.bump();
175
176        self.eat_while(|c| c != '\n');
177        TokenKind::LineComment
178    }
179
180    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344
181    pub(crate) fn block_comment(&mut self) -> TokenKind {
182        self.bump();
183
184        let mut depth = 1usize;
185        while let Some(c) = self.bump() {
186            match c {
187                '/' if self.first() == '*' => {
188                    self.bump();
189                    depth += 1;
190                }
191                '*' if self.first() == '/' => {
192                    self.bump();
193                    depth -= 1;
194                    if depth == 0 {
195                        // This block comment is closed, so for a construction like "/* */ */"
196                        // there will be a successfully parsed block comment "/* */"
197                        // and " */" will be processed separately.
198                        break;
199                    }
200                }
201                _ => (),
202            }
203        }
204
205        TokenKind::BlockComment {
206            terminated: depth == 0,
207        }
208    }
209
210    fn prefixed_string(
211        &mut self,
212        mk_kind: fn(bool) -> LiteralKind,
213        allows_double: bool,
214    ) -> TokenKind {
215        match self.first() {
216            '\'' => {
217                self.bump();
218                let terminated = self.single_quoted_string();
219                let kind = mk_kind(terminated);
220                TokenKind::Literal { kind }
221            }
222            '"' if allows_double => {
223                self.bump();
224                let terminated = self.double_quoted_string();
225                TokenKind::QuotedIdent { terminated }
226            }
227            _ => self.ident_or_unknown_prefix(),
228        }
229    }
230
231    fn number(&mut self, first_digit: char) -> LiteralKind {
232        let mut base = Base::Decimal;
233        if first_digit == '0' {
234            // Attempt to parse encoding base.
235            match self.first() {
236                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403
237                'b' | 'B' => {
238                    base = Base::Binary;
239                    self.bump();
240                    if !self.eat_decimal_digits() {
241                        return LiteralKind::Int {
242                            base,
243                            empty_int: true,
244                        };
245                    }
246                }
247                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402
248                'o' | 'O' => {
249                    base = Base::Octal;
250                    self.bump();
251                    if !self.eat_decimal_digits() {
252                        return LiteralKind::Int {
253                            base,
254                            empty_int: true,
255                        };
256                    }
257                }
258                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401
259                'x' | 'X' => {
260                    base = Base::Hexadecimal;
261                    self.bump();
262                    if !self.eat_hexadecimal_digits() {
263                        return LiteralKind::Int {
264                            base,
265                            empty_int: true,
266                        };
267                    }
268                }
269                // Not a base prefix; consume additional digits.
270                '0'..='9' | '_' => {
271                    self.eat_decimal_digits();
272                }
273
274                // Also not a base prefix; nothing more to do here.
275                '.' | 'e' | 'E' => {}
276
277                // Just a 0.
278                _ => {
279                    return LiteralKind::Int {
280                        base,
281                        empty_int: false,
282                    };
283                }
284            }
285        } else {
286            // No base prefix, parse number in the usual way.
287            self.eat_decimal_digits();
288        };
289
290        match self.first() {
291            '.' => {
292                // might have stuff after the ., and if it does, it needs to start
293                // with a number
294                self.bump();
295                let mut empty_exponent = false;
296                if self.first().is_ascii_digit() {
297                    self.eat_decimal_digits();
298                    match self.first() {
299                        'e' | 'E' => {
300                            self.bump();
301                            empty_exponent = !self.eat_float_exponent();
302                        }
303                        _ => (),
304                    }
305                } else {
306                    match self.first() {
307                        'e' | 'E' => {
308                            self.bump();
309                            empty_exponent = !self.eat_float_exponent();
310                        }
311                        _ => (),
312                    }
313                }
314                LiteralKind::Float {
315                    base,
316                    empty_exponent,
317                }
318            }
319            'e' | 'E' => {
320                self.bump();
321                let empty_exponent = !self.eat_float_exponent();
322                LiteralKind::Float {
323                    base,
324                    empty_exponent,
325                }
326            }
327            _ => LiteralKind::Int {
328                base,
329                empty_int: false,
330            },
331        }
332    }
333
334    fn single_quoted_string(&mut self) -> bool {
335        // Parse until either quotes are terminated or error is detected.
336        loop {
337            match self.first() {
338                // Quotes might be terminated.
339                '\'' => {
340                    self.bump();
341
342                    match self.first() {
343                        // encountered an escaped quote ''
344                        '\'' => {
345                            self.bump();
346                        }
347                        // encountered terminating quote
348                        _ => return true,
349                    }
350                }
351                // End of file, stop parsing.
352                EOF_CHAR if self.is_eof() => break,
353                // Skip the character.
354                _ => {
355                    self.bump();
356                }
357            }
358        }
359        // String was not terminated.
360        false
361    }
362
363    /// Eats double-quoted string and returns true
364    /// if string is terminated.
365    fn double_quoted_string(&mut self) -> bool {
366        while let Some(c) = self.bump() {
367            match c {
368                '"' if self.first() == '"' => {
369                    // Bump again to skip escaped character.
370                    self.bump();
371                }
372                '"' => {
373                    return true;
374                }
375                _ => (),
376            }
377        }
378        // End of file reached.
379        false
380    }
381
382    // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
383    fn dollar_quoted_string(&mut self) -> TokenKind {
384        // Get the start sequence of the dollar quote, i.e., 'foo' in
385        // $foo$hello$foo$
386        let mut start = vec![];
387        while let Some(c) = self.bump() {
388            match c {
389                '$' => {
390                    break;
391                }
392                _ => {
393                    start.push(c);
394                }
395            }
396        }
397
398        // we have a dollar quoted string deliminated with `$$`
399        if start.is_empty() {
400            loop {
401                self.eat_while(|c| c != '$');
402                if self.is_eof() {
403                    return TokenKind::Literal {
404                        kind: LiteralKind::DollarQuotedString { terminated: false },
405                    };
406                }
407                // eat $
408                self.bump();
409                if self.first() == '$' {
410                    self.bump();
411                    return TokenKind::Literal {
412                        kind: LiteralKind::DollarQuotedString { terminated: true },
413                    };
414                }
415            }
416        } else {
417            loop {
418                self.eat_while(|c| c != start[0]);
419                if self.is_eof() {
420                    return TokenKind::Literal {
421                        kind: LiteralKind::DollarQuotedString { terminated: false },
422                    };
423                }
424
425                // might be the start of our start/end sequence
426                let mut match_count = 0;
427                for start_char in &start {
428                    if self.first() == *start_char {
429                        self.bump();
430                        match_count += 1;
431                    } else {
432                        self.bump();
433                        break;
434                    }
435                }
436
437                // closing '$'
438                let terminated = match_count == start.len();
439                if self.first() == '$' && terminated {
440                    self.bump();
441                    return TokenKind::Literal {
442                        kind: LiteralKind::DollarQuotedString { terminated },
443                    };
444                }
445            }
446        }
447    }
448
449    fn eat_decimal_digits(&mut self) -> bool {
450        let mut has_digits = false;
451        loop {
452            match self.first() {
453                '_' => {
454                    self.bump();
455                }
456                '0'..='9' => {
457                    has_digits = true;
458                    self.bump();
459                }
460                _ => break,
461            }
462        }
463        has_digits
464    }
465
466    fn eat_hexadecimal_digits(&mut self) -> bool {
467        let mut has_digits = false;
468        loop {
469            match self.first() {
470                '_' => {
471                    self.bump();
472                }
473                '0'..='9' | 'a'..='f' | 'A'..='F' => {
474                    has_digits = true;
475                    self.bump();
476                }
477                _ => break,
478            }
479        }
480        has_digits
481    }
482
483    /// Eats the float exponent. Returns true if at least one digit was met,
484    /// and returns false otherwise.
485    fn eat_float_exponent(&mut self) -> bool {
486        if self.first() == '-' || self.first() == '+' {
487            self.bump();
488        }
489        self.eat_decimal_digits()
490    }
491}
492
493/// Creates an iterator that produces tokens from the input string.
494pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
495    let mut cursor = Cursor::new(input);
496    std::iter::from_fn(move || {
497        let token = cursor.advance_token();
498        if token.kind != TokenKind::Eof {
499            Some(token)
500        } else {
501            None
502        }
503    })
504}
505
#[cfg(test)]
mod tests {
    use std::fmt;

    use super::*;
    use insta::assert_debug_snapshot;

    /// A token paired with the slice of input it covers, so that snapshots
    /// show the lexed text next to the token kind.
    struct TokenDebug<'a> {
        content: &'a str,
        token: Token,
    }

    impl fmt::Debug for TokenDebug<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
        }
    }

    impl<'a> TokenDebug<'a> {
        /// Pairs `token` with the text it spans; `start` is the byte offset
        /// in `input` at which the token begins.
        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
            let begin = start as usize;
            let end = (start + token.len) as usize;
            TokenDebug {
                content: &input[begin..end],
                token,
            }
        }
    }

    /// Lexes `input` completely, tracking the running offset so that every
    /// token can be shown together with its source text.
    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
        let mut offset = 0u32;
        tokenize(input)
            .map(|token| {
                let start = offset;
                offset += token.len;
                TokenDebug::new(token, input, start)
            })
            .collect()
    }

    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }

    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }

    // The nested `/*` is never closed, so the outer comment stays open.
    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }

    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }

    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }

    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }

    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }

    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }

    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }

    // The misspelt name is kept on purpose: the snapshot is looked up by
    // test name.
    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }

    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }

    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }

    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'   
   'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }

    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }

    #[test]
    fn string_with_escapes() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE

        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }

    #[test]
    fn string_unicode_escape() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE

        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }

    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }

    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }

    // An empty dollar-quoted string: `$$` opens it and `$$` closes it.
    #[test]
    fn dollar_quoted_string() {
        assert_debug_snapshot!(lex("$$$$"), @r#"
        [
            "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
        ]
        "#);
    }
}