sql_parse/
lexer.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4//
5// http://www.apache.org/licenses/LICENSE-2.0
6//
7// Unless required by applicable law or agreed to in writing, software
8// distributed under the License is distributed on an "AS IS" BASIS,
9// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10// See the License for the specific language governing permissions and
11// limitations under the License.
12
13use crate::{keywords::Keyword, Span};
14
15/// SQL Token enumeration
16#[derive(Debug, Clone, PartialEq, Eq, Hash)]
17pub(crate) enum Token<'a> {
18    Ampersand,
19    At,
20    Backslash,
21    Caret,
22    Colon,
23    Comma,
24    Div,
25    DoubleColon,
26    DoubleExclamationMark,
27    DoubleAmpersand,
28    DoublePipe,
29    DoubleDollar,
30    Eq,
31    ExclamationMark,
32    Float(&'a str),
33    Gt,
34    GtEq,
35    Ident(&'a str, Keyword),
36    Integer(&'a str),
37    Invalid,
38    LBrace,
39    LBracket,
40    LParen,
41    Lt,
42    LtEq,
43    Minus,
44    Mod,
45    Mul,
46    Neq,
47    Period,
48    Pipe,
49    Plus,
50    QuestionMark,
51    RArrow,
52    RBrace,
53    RBracket,
54    RParen,
55    SemiColon,
56    Sharp,
57    ShiftLeft,
58    ShiftRight,
59    SingleQuotedString(&'a str),
60    DoubleQuotedString(&'a str),
61    Spaceship,
62    Tilde,
63    PercentS,
64    DollarArg(usize),
65    AtAtGlobal,
66    AtAtSession,
67    Eof,
68}
69
70impl<'a> Token<'a> {
71    pub(crate) fn name(&self) -> &'static str {
72        match self {
73            Token::Ampersand => "'&'",
74            Token::At => "'@'",
75            Token::Backslash => "'\\'",
76            Token::Caret => "'^'",
77            Token::Colon => "':'",
78            Token::Comma => "','",
79            Token::Div => "'/'",
80            Token::DoubleColon => "'::'",
81            Token::DoubleExclamationMark => "'!!'",
82            Token::DoublePipe => "'||'",
83            Token::DoubleAmpersand => "'&&'",
84            Token::Eq => "'='",
85            Token::ExclamationMark => "'!'",
86            Token::Float(_) => "Float",
87            Token::Gt => "'>'",
88            Token::GtEq => "'>='",
89            Token::Ident(_, Keyword::NOT_A_KEYWORD) => "Identifier",
90            Token::Ident(_, Keyword::QUOTED_IDENTIFIER) => "QuotedIdentifier",
91            Token::Ident(_, kw) => kw.name(),
92            Token::Integer(_) => "Integer",
93            Token::Invalid => "Invalid",
94            Token::LBrace => "'{'",
95            Token::LBracket => "'['",
96            Token::LParen => "'('",
97            Token::Lt => "'<'",
98            Token::LtEq => "'<='",
99            Token::Minus => "'-'",
100            Token::Mod => "'%'",
101            Token::Mul => "'*'",
102            Token::Neq => "'!='",
103            Token::Period => "'.'",
104            Token::Pipe => "'|'",
105            Token::Plus => "'+'",
106            Token::QuestionMark => "'?'",
107            Token::RArrow => "'=>'",
108            Token::RBrace => "'}'",
109            Token::RBracket => "']'",
110            Token::RParen => "')'",
111            Token::SemiColon => "';'",
112            Token::Sharp => "'#'",
113            Token::ShiftLeft => "'>>'",
114            Token::ShiftRight => "'<<'",
115            Token::DoubleDollar => "'$$'",
116            Token::DollarArg(v) if *v == 1 => "'$1'",
117            Token::DollarArg(v) if *v == 2 => "'$2'",
118            Token::DollarArg(v) if *v == 3 => "'$3'",
119            Token::DollarArg(v) if *v == 4 => "'$4'",
120            Token::DollarArg(v) if *v == 5 => "'$5'",
121            Token::DollarArg(v) if *v == 6 => "'$6'",
122            Token::DollarArg(v) if *v == 7 => "'$7'",
123            Token::DollarArg(v) if *v == 8 => "'$8'",
124            Token::DollarArg(v) if *v == 9 => "'$9'",
125            Token::DollarArg(_) => "'$i'",
126            Token::SingleQuotedString(_) => "String",
127            Token::DoubleQuotedString(_) => "String",
128            Token::Spaceship => "'<=>'",
129            Token::Tilde => "'~'",
130            Token::PercentS => "'%s'",
131            Token::AtAtGlobal => "@@GLOBAL",
132            Token::AtAtSession => "@@SESSION",
133            Token::Eof => "EndOfFile",
134        }
135    }
136}
/// Streaming tokenizer over a borrowed SQL source string.
pub(crate) struct Lexer<'a> {
    /// Cursor over the characters of `src`, paired with their byte offsets,
    /// with one character of lookahead.
    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
    /// The complete source text; tokens borrow slices of it.
    src: &'a str,
}
141
142impl<'a> Lexer<'a> {
143    pub fn new(src: &'a str) -> Self {
144        Self {
145            src,
146            chars: src.char_indices().peekable(),
147        }
148    }
149
150    pub(crate) fn s(&self, span: Span) -> &'a str {
151        core::str::from_utf8(&self.src.as_bytes()[span]).unwrap()
152    }
153
154    fn simple_literal(&mut self, start: usize) -> Token<'a> {
155        let end = loop {
156            match self.chars.peek() {
157                Some((_, '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) => {
158                    self.chars.next();
159                }
160                Some((i, _)) => break *i,
161                None => break self.src.len(),
162            }
163        };
164        let s = self.s(start..end);
165        let ss = s.to_ascii_uppercase();
166        Token::Ident(s, ss.as_str().into())
167    }
168
169    /// Simulate reading from standard input after a statement like `COPY ... FROM STDIN;`.
170    /// First skips space characters and optionally one NL.
171    /// Then consumes until NL '\' '.' NL is encountered, or until EOF.
172    /// The trailing '\' '.' NL is consumed but not returned.
173    pub fn read_from_stdin(&mut self) -> (&'a str, Span) {
174        // Skip optional spaces.
175        while self
176            .chars
177            .peek()
178            .filter(|(_, c)| *c != '\n' && c.is_ascii_whitespace())
179            .is_some()
180        {
181            self.chars.next().unwrap();
182        }
183        let start = match self.chars.peek() {
184            Some((i, '\n')) => i + 1,
185            Some((i, _)) => *i,
186            None => {
187                let span = self.src.len()..self.src.len();
188                return (self.s(span.clone()), span);
189            }
190        };
191        while let Some((i, c)) = self.chars.next() {
192            if c != '\n' {
193                continue;
194            }
195            if !matches!(self.chars.peek(), Some((_, '\\'))) {
196                continue;
197            }
198            self.chars.next().unwrap();
199            if !matches!(self.chars.peek(), Some((_, '.'))) {
200                continue;
201            }
202            self.chars.next().unwrap();
203            if matches!(self.chars.peek(), Some((_, '\n'))) {
204                // Data ends with NL '\' '.' NL.
205                self.chars.next().unwrap();
206            } else if self.chars.peek().is_some() {
207                continue;
208            } else {
209                // Data ends with NL '\' '.' without an extra NL,
210                // which is fine.
211            }
212            // `i` is the character index of the first '\n',
213            // so the data ends at character index i + 1.
214            let span = start..(i + 1);
215            return (self.s(span.clone()), span);
216        }
217        // Data ends at EOF without NL '\' '.' [NL].
218        let span = start..self.src.len();
219        return (self.s(span.clone()), span);
220    }
221
222    pub fn next_token(&mut self) -> (Token<'a>, Span) {
223        loop {
224            let (start, c) = match self.chars.next() {
225                Some(v) => v,
226                None => {
227                    return (Token::Eof, self.src.len()..self.src.len());
228                }
229            };
230            let t = match c {
231                ' ' | '\t' | '\n' | '\r' => continue,
232                '?' => Token::QuestionMark,
233                ';' => Token::SemiColon,
234                '\\' => Token::Backslash,
235                '[' => Token::LBracket,
236                ']' => Token::RBracket,
237                '&' => match self.chars.peek() {
238                    Some((_, '&')) => {
239                        self.chars.next();
240                        Token::DoubleAmpersand
241                    }
242                    _ => Token::Ampersand,
243                },
244                '^' => Token::Caret,
245                '{' => Token::LBrace,
246                '}' => Token::RBrace,
247                '(' => Token::LParen,
248                ')' => Token::RParen,
249                ',' => Token::Comma,
250                '+' => Token::Plus,
251                '*' => Token::Mul,
252                '%' => match self.chars.peek() {
253                    Some((_, 's')) => {
254                        self.chars.next();
255                        Token::PercentS
256                    }
257                    _ => Token::Mod,
258                },
259                '#' => Token::Sharp,
260                '@' => match self.chars.peek() {
261                    Some((_, '@')) => {
262                        self.chars.next();
263                        #[allow(clippy::never_loop)]
264                        match self.chars.peek() {
265                            Some((_, 's' | 'S')) => loop {
266                                self.chars.next();
267                                if !matches!(self.chars.peek(), Some((_, 'e' | 'E'))) {
268                                    break Token::Invalid;
269                                }
270                                self.chars.next();
271                                if !matches!(self.chars.peek(), Some((_, 's' | 'S'))) {
272                                    break Token::Invalid;
273                                }
274                                self.chars.next();
275                                if !matches!(self.chars.peek(), Some((_, 's' | 'S'))) {
276                                    break Token::Invalid;
277                                }
278                                self.chars.next();
279                                if !matches!(self.chars.peek(), Some((_, 'i' | 'I'))) {
280                                    break Token::Invalid;
281                                }
282                                self.chars.next();
283                                if !matches!(self.chars.peek(), Some((_, 'o' | 'O'))) {
284                                    break Token::Invalid;
285                                }
286                                self.chars.next();
287                                if !matches!(self.chars.peek(), Some((_, 'n' | 'N'))) {
288                                    break Token::Invalid;
289                                }
290                                self.chars.next();
291                                break Token::AtAtSession;
292                            },
293                            Some((_, 'g' | 'G')) => loop {
294                                self.chars.next();
295                                if !matches!(self.chars.peek(), Some((_, 'l' | 'L'))) {
296                                    break Token::Invalid;
297                                }
298                                self.chars.next();
299                                if !matches!(self.chars.peek(), Some((_, 'o' | 'O'))) {
300                                    break Token::Invalid;
301                                }
302                                self.chars.next();
303                                if !matches!(self.chars.peek(), Some((_, 'b' | 'B'))) {
304                                    break Token::Invalid;
305                                }
306                                self.chars.next();
307                                if !matches!(self.chars.peek(), Some((_, 'a' | 'A'))) {
308                                    break Token::Invalid;
309                                }
310                                self.chars.next();
311                                if !matches!(self.chars.peek(), Some((_, 'l' | 'L'))) {
312                                    break Token::Invalid;
313                                }
314                                self.chars.next();
315                                break Token::AtAtGlobal;
316                            },
317                            _ => Token::Invalid,
318                        }
319                    }
320                    _ => Token::At,
321                },
322                '~' => Token::Tilde,
323                ':' => match self.chars.peek() {
324                    Some((_, ':')) => {
325                        self.chars.next();
326                        Token::DoubleColon
327                    }
328                    _ => Token::Colon,
329                },
330                '$' => match self.chars.peek() {
331                    Some((_, '$')) => {
332                        self.chars.next();
333                        Token::DoubleDollar
334                    }
335                    Some((_, '1'..='9')) => {
336                        let mut v = self.chars.peek().unwrap().1.to_digit(10).unwrap() as usize;
337                        self.chars.next();
338                        while matches!(self.chars.peek(), Some((_, '0'..='9'))) {
339                            v = v * 10
340                                + self.chars.peek().unwrap().1.to_digit(10).unwrap() as usize;
341                            self.chars.next();
342                        }
343                        Token::DollarArg(v)
344                    }
345                    _ => Token::Invalid,
346                },
347                '=' => match self.chars.peek() {
348                    Some((_, '>')) => {
349                        self.chars.next();
350                        Token::RArrow
351                    }
352                    _ => Token::Eq,
353                },
354                '!' => match self.chars.peek() {
355                    Some((_, '=')) => {
356                        self.chars.next();
357                        Token::Neq
358                    }
359                    Some((_, '!')) => {
360                        self.chars.next();
361                        Token::DoubleExclamationMark
362                    }
363                    _ => Token::ExclamationMark,
364                },
365                '<' => match self.chars.peek() {
366                    Some((_, '=')) => {
367                        self.chars.next();
368                        match self.chars.peek() {
369                            Some((_, '>')) => {
370                                self.chars.next();
371                                Token::Spaceship
372                            }
373                            _ => Token::LtEq,
374                        }
375                    }
376                    Some((_, '>')) => {
377                        self.chars.next();
378                        Token::Neq
379                    }
380                    Some((_, '<')) => {
381                        self.chars.next();
382                        Token::ShiftLeft
383                    }
384                    _ => Token::Lt,
385                },
386                '>' => match self.chars.peek() {
387                    Some((_, '=')) => {
388                        self.chars.next();
389                        Token::GtEq
390                    }
391                    Some((_, '>')) => {
392                        self.chars.next();
393                        Token::ShiftRight
394                    }
395                    _ => Token::Gt,
396                },
397                '|' => match self.chars.peek() {
398                    Some((_, '|')) => {
399                        self.chars.next();
400                        Token::DoublePipe
401                    }
402                    _ => Token::Pipe,
403                },
404                '-' => match self.chars.peek() {
405                    Some((_, '-')) => {
406                        while !matches!(self.chars.next(), Some((_, '\r' | '\n')) | None) {}
407                        continue;
408                    }
409                    _ => Token::Minus,
410                },
411                '/' => match self.chars.peek() {
412                    Some((_, '*')) => {
413                        self.chars.next();
414                        let ok = loop {
415                            match self.chars.next() {
416                                Some((_, '*')) => {
417                                    if matches!(self.chars.peek(), Some((_, '/'))) {
418                                        self.chars.next();
419                                        break true;
420                                    }
421                                }
422                                Some(_) => (),
423                                None => break false,
424                            }
425                        };
426                        if ok {
427                            continue;
428                        } else {
429                            Token::Invalid
430                        }
431                    }
432                    Some((_, '/')) => {
433                        while !matches!(self.chars.next(), Some((_, '\r' | '\n')) | None) {}
434                        continue;
435                    }
436                    _ => Token::Div,
437                },
438                'x' | 'X' => match self.chars.peek() {
439                    Some((_, '\'')) => {
440                        todo!("Hex literal")
441                    }
442                    _ => self.simple_literal(start),
443                },
444                '_' | 'a'..='z' | 'A'..='Z' => self.simple_literal(start),
445                '`' => {
446                    while matches!(
447                        self.chars.peek(),
448                        Some((_, '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' | '-'))
449                    ) {
450                        self.chars.next();
451                    }
452                    match self.chars.peek() {
453                        Some((i, '`')) => {
454                            let i = *i;
455                            self.chars.next();
456                            Token::Ident(self.s(start + 1..i), Keyword::QUOTED_IDENTIFIER)
457                        }
458                        _ => Token::Invalid,
459                    }
460                }
461                '\'' => loop {
462                    match self.chars.next() {
463                        Some((_, '\\')) => {
464                            self.chars.next();
465                        }
466                        Some((i, '\'')) => match self.chars.peek() {
467                            Some((_, '\'')) => {
468                                self.chars.next();
469                            }
470                            _ => break Token::SingleQuotedString(self.s(start + 1..i)),
471                        },
472                        Some((_, _)) => (),
473                        None => break Token::Invalid,
474                    }
475                },
476                '"' => loop {
477                    match self.chars.next() {
478                        Some((_, '\\')) => {
479                            self.chars.next();
480                        }
481                        Some((i, '"')) => match self.chars.peek() {
482                            Some((_, '"')) => {
483                                self.chars.next();
484                            }
485                            _ => break Token::DoubleQuotedString(self.s(start + 1..i)),
486                        },
487                        Some((_, _)) => (),
488                        None => break Token::Invalid,
489                    }
490                },
491                '0'..='9' => loop {
492                    match self.chars.peek() {
493                        Some((_, '0'..='9')) => {
494                            self.chars.next();
495                        }
496                        Some((_, '.')) => {
497                            self.chars.next();
498                            break loop {
499                                match self.chars.peek() {
500                                    Some((_, '0'..='9')) => {
501                                        self.chars.next();
502                                    }
503                                    Some((i, _)) => {
504                                        let i = *i;
505                                        break Token::Float(self.s(start..i));
506                                    }
507                                    None => break Token::Float(self.s(start..self.src.len())),
508                                }
509                            };
510                        }
511                        Some((i, _)) => {
512                            let i = *i;
513                            break Token::Integer(self.s(start..i));
514                        }
515                        None => break Token::Integer(self.s(start..self.src.len())),
516                    }
517                },
518                '.' => match self.chars.peek() {
519                    Some((_, '0'..='9')) => loop {
520                        match self.chars.peek() {
521                            Some((_, '0'..='9')) => {
522                                self.chars.next();
523                            }
524                            Some((i, _)) => {
525                                let i = *i;
526                                break Token::Float(self.s(start..i));
527                            }
528                            None => break Token::Float(self.s(start..self.src.len())),
529                        }
530                    },
531                    _ => Token::Period,
532                },
533                _ => Token::Invalid,
534            };
535
536            let end = match self.chars.peek() {
537                Some((i, _)) => *i,
538                None => self.src.len(),
539            };
540            return (t, start..end);
541
542            // // string
543
544            // '\'' => {
545            //     let value = self.tokenize_single_quoted_string(chars)?;
546            //     Ok(Some(Token::SingleQuotedString { value, span }))
547            // }
548
549            // // numbers and period
550            // '0'..='9' | '.' => {
551            //     let mut value = peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));
552
553            //     // match binary literal that starts with 0x
554            //     if value == "0" && chars.peek().map(|(_, c)| c) == Some(&'x') {
555            //         chars.next();
556            //         let value = peeking_take_while(
557            //             chars,
558            //             |ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'),
559            //         );
560            //         return Ok(Some(Token::HexStringLiteral { value, span }));
561            //     }
562
563            //     // match one period
564            //     if let Some((_, '.')) = chars.peek() {
565            //         value.push('.');
566            //         chars.next();
567            //     }
568            //     value += &peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));
569
570            //     // No number -> Token::Period
571            //     if value == "." {
572            //         return Ok(Some(Token::Period { span }));
573            //     }
574
575            //     let long = if let Some((_, 'L')) = chars.peek() {
576            //         chars.next();
577            //         true
578            //     } else {
579            //         false
580            //     };
581            //     Ok(Some(Token::Number { value, long, span }))
582            // }
583            // // punctuation
584
585            // // operators
586        }
587    }
588}
589
590impl<'a> Iterator for Lexer<'a> {
591    type Item = (Token<'a>, Span);
592
593    fn next(&mut self) -> Option<Self::Item> {
594        Some(self.next_token())
595    }
596}