use std::iter::Peekable;
use std::str::Chars;

use super::dialect::keywords::ALL_KEYWORDS;
use super::dialect::Dialect;
use std::fmt;

/// SQL tokens produced by the tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not start any other token (the fallback case)
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, newline, or comment)
    Whitespace(Whitespace),
    /// Equality operator `=`
    Eq,
    /// Not-equals operator `<>` (or `!=`)
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mult,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers, e.g. `table.column`)
    Period,
    /// Colon `:`
    Colon,
    /// Double colon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Semicolon `;` (statement terminator)
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Word(ref w) => write!(f, "{}", w),
            Token::Number(ref n) => f.write_str(n),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mult => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        // Only unquoted words can be keywords; if the uppercased word is in
        // the keyword list, record its canonical form for the parser.
        let is_keyword = quote_style.is_none() && ALL_KEYWORDS.contains(&word_uppercase.as_str());
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if is_keyword {
                word_uppercase
            } else {
                "".to_string()
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    /// The value of the token, without the enclosing quotes and with any
    /// escaped quotes unescaped.
    pub value: String,
    /// The starting quote character, if the identifier was quoted
    /// (double quote, bracket, or backtick); `None` for unquoted words.
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this holds the uppercase keyword; otherwise it is the empty string.
    pub keyword: String,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment(String),
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment(s) => write!(f, "--{}", s),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

/// Tokenizer error, carrying a human-readable message.
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    pub query: String,
    pub line: u64,
    pub col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement.
    pub fn new(dialect: &'a dyn Dialect, query: &str) -> Self {
        Self {
            dialect,
            query: query.to_string(),
            line: 1,
            col: 1,
        }
    }

    /// Tokenize the statement and produce a vector of tokens.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            // Track the current line/column so error messages can point at
            // the offending location.
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }

                // A tab is counted as four columns.
                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::Word(w) if w.quote_style.is_none() => self.col += w.value.len() as u64,
                // Quoted words account for the two enclosing delimiters.
                Token::Word(w) if w.quote_style.is_some() => self.col += w.value.len() as u64 + 2,
                Token::Number(s) => self.col += s.len() as u64,
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token, or return None at end of input.
    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Newline token for both `\r` and `\r\n`.
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string(chars);
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N', chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal string literal>
                            let s = self.tokenize_single_quoted_string(chars);
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first char
                    let s = self.tokenize_word(ch, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // string literal
                '\'' => {
                    let s = self.tokenize_single_quoted_string(chars);
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = peeking_take_while(chars, |ch| ch != quote_end);
                    if chars.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        Err(TokenizerError(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        )))
                    }
                }
                // numeric literal
                '0'..='9' => {
                    let s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let mut s = peeking_take_while(chars, |ch| ch != '\n');
                            if let Some(ch) = chars.next() {
                                assert_eq!(ch, '\n');
                                s.push(ch);
                            }
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment(s))))
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mult),
                '%' => self.consume_and_return(chars, Token::Mod),
                '=' => self.consume_and_return(chars, Token::Eq),
                '.' => self.consume_and_return(chars, Token::Period),
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        _ => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::LtEq),
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = first_char.to_string();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a single-quoted string, starting with the opening quote.
    fn tokenize_single_quoted_string(&self, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = String::new();
        chars.next(); // consume the opening quote
        while let Some(&ch) = chars.peek() {
            match ch {
                '\'' => {
                    chars.next(); // consume
                    // A doubled quote ('') is an escaped single quote; a lone
                    // quote terminates the string.
                    let escaped_quote = chars.peek().map(|c| *c == '\'').unwrap_or(false);
                    if escaped_quote {
                        s.push('\'');
                        chars.next();
                    } else {
                        break;
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        s
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut maybe_closing_comment = false;
        // Note: nested comments are not handled; the comment ends at the
        // first `*/`.
        loop {
            match chars.next() {
                Some(ch) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                None => {
                    break Err(TokenizerError(
                        "Unexpected EOF while in a multi-line comment".to_string(),
                    ));
                }
            }
        }
    }

    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars<'_>>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as a String, leaving the first non-matching
/// char available as `chars.next()`.
fn peeking_take_while(
    chars: &mut Peekable<Chars<'_>>,
    mut predicate: impl FnMut(char) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::super::dialect::GenericDialect;
    use super::*;

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
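
    // A supplementary test (not in the original suite): a minimal sketch
    // exercising the N'...' and X'...' literal branches of `next_token`.
    #[test]
    fn tokenize_national_and_hex_string_literals() {
        let sql = String::from("N'national' X'deadbeef'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::NationalStringLiteral(String::from("national")),
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("deadbeef")),
        ];

        compare(expected, tokens);
    }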

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string()),
            Token::Whitespace(Whitespace::SingleLineComment(
                "this is a comment\n".to_string(),
            )),
            Token::Number("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment(
            "this is a comment".to_string(),
        ))];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
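
    // A supplementary test (not in the original suite): a minimal sketch of
    // the ':' branch, checking that `::` yields DoubleColon while a lone ':'
    // yields Colon. Assumes "foo", "bar", and "baz" are not in ALL_KEYWORDS.
    #[test]
    fn tokenize_colons() {
        let sql = String::from("foo::bar:baz");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("foo", None),
            Token::DoubleColon,
            Token::make_word("bar", None),
            Token::Colon,
            Token::make_word("baz", None),
        ];

        compare(expected, tokens);
    }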

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError(
                "Expected close delimiter '\"' before EOF.".to_string(),
            ))
        );
    }
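
    // A supplementary test (not in the original suite): the successful
    // counterpart to tokenize_mismatched_quotes above. Assumes GenericDialect
    // treats a double quote as the start of a delimited identifier, as that
    // test implies.
    #[test]
    fn tokenize_delimited_identifier() {
        let sql = String::from("SELECT \"col name\" FROM tbl");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col name", Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("tbl", None),
        ];

        compare(expected, tokens);
    }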

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
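
    // A supplementary test (not in the original suite): checks that
    // concatenating the Display output of each token reconstructs the input.
    // This round-trip holds for inputs without `\r` newlines (normalized to
    // `\n`) or escaped quotes (unescaped during tokenization).
    #[test]
    fn tokenize_display_roundtrip() {
        let sql = String::from("SELECT * FROM customer WHERE salary <> 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let reconstructed: String = tokens.iter().map(|t| t.to_string()).collect();
        assert_eq!(sql, reconstructed);
    }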

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
}