Skip to main content

squawk_lexer/
lib.rs

1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
// ident_start		[A-Za-z\200-\377_]
//
// Postgres' scanner works on bytes, so `\200-\377` matches every byte of a
// multi-byte UTF-8 sequence. This lexer works on `char`s, so the equivalent
// class is "any non-ASCII character", not only U+0080..=U+00FF.
//
// Returns true if `c` may start an unquoted identifier.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_') || !c.is_ascii()
}

// ident_cont		[A-Za-z\200-\377_0-9\$]
//
// Same byte-vs-char reasoning as `ident_start`: every non-ASCII character is
// accepted, in addition to ASCII letters, digits, `_` and `$`.
//
// Returns true if `c` may follow the first character of an unquoted identifier.
const fn is_ident_cont(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$') || !c.is_ascii()
}
16
17// see:
18// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128
19// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229
const fn is_whitespace(c: char) -> bool {
    // Tab (U+0009), newline (U+000A), vertical tab (U+000B), form feed
    // (U+000C) and carriage return (U+000D) are consecutive code points, so a
    // single range covers them; the space character is listed separately.
    matches!(c, '\t'..='\r' | ' ')
}
31
32impl Cursor<'_> {
    // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
    /// Lexes the next token from the input.
    ///
    /// Consumes one character with `bump`, dispatches on it to pick the token
    /// kind (consuming more input where the kind requires it), and returns a
    /// `Token` built from that kind and `pos_within_token()`. When `bump`
    /// yields nothing, a zero-length `TokenKind::Eof` token is returned.
    ///
    /// The order of the match arms matters: the string-prefix arms (`u`, `e`,
    /// `b`, `x`) must come before the generic identifier arm.
    pub(crate) fn advance_token(&mut self) -> Token {
        let Some(first_char) = self.bump() else {
            return Token::new(TokenKind::Eof, 0);
        };
        let token_kind = match first_char {
            // Slash or block comment.
            '/' => match self.first() {
                '*' => self.block_comment(),
                _ => TokenKind::Slash,
            },
            // Minus or line comment.
            '-' => match self.first() {
                '-' => self.line_comment(),
                _ => TokenKind::Minus,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
            // NOTE(review): the `&` is consumed before the character after it
            // is inspected, so when no quote follows (e.g. `u&1`) the `&`
            // appears to end up inside the token produced by
            // `ident_or_unknown_prefix` — confirm this is intended.
            'u' | 'U' => match self.first() {
                '&' => {
                    self.bump();
                    self.prefixed_string(
                        |terminated| LiteralKind::UnicodeEscStr { terminated },
                        true,
                    )
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // escaped strings
            'e' | 'E' => {
                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
            }

            // bit string
            'b' | 'B' => {
                self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
            }

            // hexadecimal byte string
            'x' | 'X' => {
                self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
            }

            // Identifier (this should be checked after other variant that can
            // start as identifier).
            c if is_ident_start(c) => self.ident(),

            // Numeric literal.
            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                TokenKind::Literal { kind: literal_kind }
            }
            // A `.` directly followed by a digit starts a number; otherwise
            // it is a plain dot.
            '.' => match self.first() {
                '0'..='9' => {
                    let literal_kind = self.number('.');
                    TokenKind::Literal { kind: literal_kind }
                }
                _ => TokenKind::Dot,
            },
            // One-symbol tokens.
            ';' => TokenKind::Semi,
            ',' => TokenKind::Comma,
            '(' => TokenKind::OpenParen,
            ')' => TokenKind::CloseParen,
            '[' => TokenKind::OpenBracket,
            ']' => TokenKind::CloseBracket,
            '{' => TokenKind::OpenCurly,
            '}' => TokenKind::CloseCurly,
            '@' => TokenKind::At,
            '#' => TokenKind::Pound,
            '~' => TokenKind::Tilde,
            '?' => TokenKind::Question,
            ':' => TokenKind::Colon,
            '$' => {
                // Dollar quoted strings
                if is_ident_start(self.first()) || self.first() == '$' {
                    self.dollar_quoted_string()
                } else {
                    // Parameters. Any digits after the `$` are consumed; a
                    // `$` with no digits still yields `PositionalParam`.
                    while self.first().is_ascii_digit() {
                        self.bump();
                    }
                    TokenKind::PositionalParam
                }
            }
            '`' => TokenKind::Backtick,
            '=' => TokenKind::Eq,
            '!' => TokenKind::Bang,
            '<' => TokenKind::Lt,
            '>' => TokenKind::Gt,
            '&' => TokenKind::And,
            '|' => TokenKind::Or,
            '+' => TokenKind::Plus,
            '*' => TokenKind::Star,
            '^' => TokenKind::Caret,
            '%' => TokenKind::Percent,

            // String literal
            '\'' => {
                let terminated = self.single_quoted_string();
                let kind = LiteralKind::Str { terminated };
                TokenKind::Literal { kind }
            }

            // Quoted identifiers
            '"' => {
                let terminated = self.double_quoted_string();
                TokenKind::QuotedIdent { terminated }
            }
            _ => TokenKind::Unknown,
        };
        // The token's length is whatever was consumed since the last reset.
        let res = Token::new(token_kind, self.pos_within_token());
        self.reset_pos_within_token();
        res
    }
    /// Eats the rest of an unquoted identifier and returns `TokenKind::Ident`.
    ///
    /// Only characters accepted by `is_ident_cont` are consumed here; the
    /// caller in this file has already consumed the first character.
    pub(crate) fn ident(&mut self) -> TokenKind {
        self.eat_while(is_ident_cont);
        TokenKind::Ident
    }
156
    /// Eats a run of characters accepted by `is_whitespace` and returns
    /// `TokenKind::Whitespace`.
    pub(crate) fn whitespace(&mut self) -> TokenKind {
        self.eat_while(is_whitespace);
        TokenKind::Whitespace
    }
161
162    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
163        // Start is already eaten, eat the rest of identifier.
164        self.eat_while(is_ident_cont);
165        // Known prefixes must have been handled earlier. So if
166        // we see a prefix here, it is definitely an unknown prefix.
167        match self.first() {
168            '#' | '"' | '\'' => TokenKind::UnknownPrefix,
169            _ => TokenKind::Ident,
170        }
171    }
172
173    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227
174    // comment			("--"{non_newline}*)
175    pub(crate) fn line_comment(&mut self) -> TokenKind {
176        self.bump();
177
178        self.eat_while(|c| c != '\n');
179        TokenKind::LineComment
180    }
181
182    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344
183    pub(crate) fn block_comment(&mut self) -> TokenKind {
184        self.bump();
185
186        let mut depth = 1usize;
187        while let Some(c) = self.bump() {
188            match c {
189                '/' if self.first() == '*' => {
190                    self.bump();
191                    depth += 1;
192                }
193                '*' if self.first() == '/' => {
194                    self.bump();
195                    depth -= 1;
196                    if depth == 0 {
197                        // This block comment is closed, so for a construction like "/* */ */"
198                        // there will be a successfully parsed block comment "/* */"
199                        // and " */" will be processed separately.
200                        break;
201                    }
202                }
203                _ => (),
204            }
205        }
206
207        TokenKind::BlockComment {
208            terminated: depth == 0,
209        }
210    }
211
212    fn prefixed_string(
213        &mut self,
214        mk_kind: fn(bool) -> LiteralKind,
215        allows_double: bool,
216    ) -> TokenKind {
217        match self.first() {
218            '\'' => {
219                self.bump();
220                let terminated = self.single_quoted_string();
221                let kind = mk_kind(terminated);
222                TokenKind::Literal { kind }
223            }
224            '"' if allows_double => {
225                self.bump();
226                let terminated = self.double_quoted_string();
227                TokenKind::QuotedIdent { terminated }
228            }
229            _ => self.ident_or_unknown_prefix(),
230        }
231    }
232
233    fn number(&mut self, first_digit: char) -> LiteralKind {
234        let mut base = Base::Decimal;
235        if first_digit == '0' {
236            // Attempt to parse encoding base.
237            match self.first() {
238                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403
239                'b' | 'B' => {
240                    base = Base::Binary;
241                    self.bump();
242                    if !self.eat_decimal_digits() {
243                        return LiteralKind::Int {
244                            base,
245                            empty_int: true,
246                        };
247                    }
248                }
249                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402
250                'o' | 'O' => {
251                    base = Base::Octal;
252                    self.bump();
253                    if !self.eat_decimal_digits() {
254                        return LiteralKind::Int {
255                            base,
256                            empty_int: true,
257                        };
258                    }
259                }
260                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401
261                'x' | 'X' => {
262                    base = Base::Hexadecimal;
263                    self.bump();
264                    if !self.eat_hexadecimal_digits() {
265                        return LiteralKind::Int {
266                            base,
267                            empty_int: true,
268                        };
269                    }
270                }
271                // Not a base prefix; consume additional digits.
272                '0'..='9' | '_' => {
273                    self.eat_decimal_digits();
274                }
275
276                // Also not a base prefix; nothing more to do here.
277                '.' | 'e' | 'E' => {}
278
279                // Just a 0.
280                _ => {
281                    return LiteralKind::Int {
282                        base,
283                        empty_int: false,
284                    };
285                }
286            }
287        } else {
288            // No base prefix, parse number in the usual way.
289            self.eat_decimal_digits();
290        };
291
292        match self.first() {
293            '.' => {
294                // might have stuff after the ., and if it does, it needs to start
295                // with a number
296                self.bump();
297                let mut empty_exponent = false;
298                if self.first().is_ascii_digit() {
299                    self.eat_decimal_digits();
300                    match self.first() {
301                        'e' | 'E' => {
302                            self.bump();
303                            empty_exponent = !self.eat_float_exponent();
304                        }
305                        _ => (),
306                    }
307                } else {
308                    match self.first() {
309                        'e' | 'E' => {
310                            self.bump();
311                            empty_exponent = !self.eat_float_exponent();
312                        }
313                        _ => (),
314                    }
315                }
316                LiteralKind::Float {
317                    base,
318                    empty_exponent,
319                }
320            }
321            'e' | 'E' => {
322                self.bump();
323                let empty_exponent = !self.eat_float_exponent();
324                LiteralKind::Float {
325                    base,
326                    empty_exponent,
327                }
328            }
329            _ => LiteralKind::Int {
330                base,
331                empty_int: false,
332            },
333        }
334    }
335
336    fn single_quoted_string(&mut self) -> bool {
337        // Parse until either quotes are terminated or error is detected.
338        loop {
339            match self.first() {
340                // Quotes might be terminated.
341                '\'' => {
342                    self.bump();
343
344                    match self.first() {
345                        // encountered an escaped quote ''
346                        '\'' => {
347                            self.bump();
348                        }
349                        // encountered terminating quote
350                        _ => return true,
351                    }
352                }
353                // End of file, stop parsing.
354                EOF_CHAR if self.is_eof() => break,
355                // Skip the character.
356                _ => {
357                    self.bump();
358                }
359            }
360        }
361        // String was not terminated.
362        false
363    }
364
365    /// Eats double-quoted string and returns true
366    /// if string is terminated.
367    fn double_quoted_string(&mut self) -> bool {
368        while let Some(c) = self.bump() {
369            match c {
370                '"' if self.first() == '"' => {
371                    // Bump again to skip escaped character.
372                    self.bump();
373                }
374                '"' => {
375                    return true;
376                }
377                _ => (),
378            }
379        }
380        // End of file reached.
381        false
382    }
383
384    // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
385    fn dollar_quoted_string(&mut self) -> TokenKind {
386        // Get the start sequence of the dollar quote, i.e., 'foo' in
387        // $foo$hello$foo$
388        let mut start = vec![];
389        while let Some(c) = self.bump() {
390            match c {
391                '$' => {
392                    break;
393                }
394                _ => {
395                    start.push(c);
396                }
397            }
398        }
399
400        // we have a dollar quoted string deliminated with `$$`
401        if start.is_empty() {
402            loop {
403                self.eat_while(|c| c != '$');
404                if self.is_eof() {
405                    return TokenKind::Literal {
406                        kind: LiteralKind::DollarQuotedString { terminated: false },
407                    };
408                }
409                // eat $
410                self.bump();
411                if self.first() == '$' {
412                    self.bump();
413                    return TokenKind::Literal {
414                        kind: LiteralKind::DollarQuotedString { terminated: true },
415                    };
416                }
417            }
418        } else {
419            loop {
420                self.eat_while(|c| c != start[0]);
421                if self.is_eof() {
422                    return TokenKind::Literal {
423                        kind: LiteralKind::DollarQuotedString { terminated: false },
424                    };
425                }
426
427                // might be the start of our start/end sequence
428                let mut match_count = 0;
429                for start_char in &start {
430                    if self.first() == *start_char {
431                        self.bump();
432                        match_count += 1;
433                    } else {
434                        self.bump();
435                        break;
436                    }
437                }
438
439                // closing '$'
440                let terminated = match_count == start.len();
441                if self.first() == '$' && terminated {
442                    self.bump();
443                    return TokenKind::Literal {
444                        kind: LiteralKind::DollarQuotedString { terminated },
445                    };
446                }
447            }
448        }
449    }
450
451    fn eat_decimal_digits(&mut self) -> bool {
452        let mut has_digits = false;
453        loop {
454            match self.first() {
455                '_' => {
456                    self.bump();
457                }
458                '0'..='9' => {
459                    has_digits = true;
460                    self.bump();
461                }
462                _ => break,
463            }
464        }
465        has_digits
466    }
467
468    fn eat_hexadecimal_digits(&mut self) -> bool {
469        let mut has_digits = false;
470        loop {
471            match self.first() {
472                '_' => {
473                    self.bump();
474                }
475                '0'..='9' | 'a'..='f' | 'A'..='F' => {
476                    has_digits = true;
477                    self.bump();
478                }
479                _ => break,
480            }
481        }
482        has_digits
483    }
484
485    /// Eats the float exponent. Returns true if at least one digit was met,
486    /// and returns false otherwise.
487    fn eat_float_exponent(&mut self) -> bool {
488        if self.first() == '-' || self.first() == '+' {
489            self.bump();
490        }
491        self.eat_decimal_digits()
492    }
493}
494
495/// Creates an iterator that produces tokens from the input string.
496pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
497    let mut cursor = Cursor::new(input);
498    std::iter::from_fn(move || {
499        let token = cursor.advance_token();
500        if token.kind != TokenKind::Eof {
501            Some(token)
502        } else {
503            None
504        }
505    })
506}
507
#[cfg(test)]
mod tests {
    use std::fmt;

    use super::*;
    use insta::assert_debug_snapshot;

    /// A token paired with the slice of input it covers, used to make
    /// snapshots readable.
    struct TokenDebug<'a> {
        content: &'a str,
        token: Token,
    }
    /// Formats as `"<source text>" @ <token kind>`.
    impl fmt::Debug for TokenDebug<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
        }
    }

    impl<'a> TokenDebug<'a> {
        /// Pairs `token` with the text `input[start..start + token.len]`.
        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
            TokenDebug {
                token,
                content: &input[start as usize..(start + token.len) as usize],
            }
        }
    }

    /// Tokenizes `input` and attaches the matching source text to every
    /// token, tracking each token's start offset by summing token lengths.
    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
        let mut tokens = vec![];
        let mut start = 0;

        for token in tokenize(input) {
            let length = token.len;
            tokens.push(TokenDebug::new(token, input, start));
            start += length;
        }
        tokens
    }
    /// A minimal statement: identifier, whitespace, integer, semicolon.
    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }

    /// A multi-line block comment that is properly closed.
    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }

    /// A nested `/*` is opened but only one `*/` follows, so the outer
    /// comment is never closed.
    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }

    /// A `--` comment on its own line.
    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }

    /// A `--` comment that follows a string literal on the same line, with
    /// another string literal on the next line.
    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }

    /// Dollar-quoted strings with an empty tag, a named tag, and `$`
    /// characters inside the body.
    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }

    /// A tagged dollar-quoted string whose body contains a word starting
    /// with the same letter as the tag.
    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }

    /// The opening tag `foo` is never repeated, only a different tag appears.
    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }

    /// Like the simple mismatch case, but with extra `$` characters in the
    /// body.
    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }

    /// Decimal integers and floats, including leading-dot, trailing-dot and
    /// exponent forms.
    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }

    /// Binary, octal and hexadecimal integers with lower- and upper-case
    /// prefixes.
    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }

    /// Numbers that use `_` as a digit separator.
    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }

    /// A `.` between identifiers is a dot token, not the start of a number.
    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }

    /// Bit strings (`b'..'`) and hexadecimal byte strings (`x'..'`) in both
    /// cases.
    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }

    /// Single-quoted strings: escaped quotes, embedded newlines, adjacent
    /// literals, backslashes, and a final string that is never closed.
    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'   
   'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }

    /// `$` followed by digits, by many digits, and by nothing.
    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }

    /// Escape strings (`e'..'` / `E'..'`) containing various backslash
    /// sequences.
    #[test]
    fn string_with_escapes() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE

        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }

    /// `u&` / `U&` prefixed strings and quoted identifiers, including one
    /// followed by a `UESCAPE` clause.
    #[test]
    fn string_unicode_escape() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE

        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }

    /// A closed double-quoted identifier followed by one that is never
    /// closed.
    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }

    /// A double-quoted identifier containing the `""` escape.
    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }

    /// An empty dollar-quoted string: `$$` immediately closed by `$$`.
    #[test]
    fn dollar_quoted_string() {
        assert_debug_snapshot!(lex("$$$$"), @r#"
        [
            "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
        ]
        "#);
    }
}