sqlite3_parser/lexer/sql/
mod.rs

1//! Adaptation/port of [`SQLite` tokenizer](http://www.sqlite.org/src/artifact?ci=trunk&filename=src/tokenize.c)
2use bumpalo::Bump;
3use fallible_iterator::FallibleIterator;
4use memchr::memchr;
5
6pub use crate::dialect::TokenType;
7use crate::dialect::TokenType::*;
8use crate::dialect::{
9    is_identifier_continue, is_identifier_start, keyword_token, sentinel, MAX_KEYWORD_LEN,
10};
11use crate::parser::ast::Cmd;
12use crate::parser::parse::{yyParser, YYCODETYPE};
13use crate::parser::Context;
14
15mod error;
16#[cfg(test)]
17mod test;
18
19use crate::lexer::scan::{Pos, ScanError as _, Splitter};
20use crate::lexer::Scanner;
21pub use crate::parser::ParserError;
22pub use error::Error;
23
24// TODO Extract scanning stuff and move this into the parser crate
25// to make possible to use the tokenizer without depending on the parser...
26
27/// SQL parser
28pub struct Parser<'input> {
29    input: &'input [u8],
30    scanner: Scanner<Tokenizer>,
31    parser: yyParser<'input>,
32}
33
34impl<'input> Parser<'input> {
35    /// Constructor
36    pub fn new(bump: &'input Bump, input: &'input [u8]) -> Self {
37        let lexer = Tokenizer::new();
38        let scanner = Scanner::new(lexer);
39        let ctx = Context::new(bump, input);
40        let parser = yyParser::new(ctx);
41        Parser {
42            input,
43            scanner,
44            parser,
45        }
46    }
47    /// Current position in input
48    pub fn position(&self) -> Pos {
49        self.scanner.position(self.input)
50    }
51}
52
53/*
54 ** Return the id of the next token in input.
55 */
56fn get_token(scanner: &mut Scanner<Tokenizer>, input: &[u8]) -> Result<TokenType, Error> {
57    let mut t = {
58        let (_, token_type) = match scanner.scan(input)? {
59            (_, None, _) => {
60                return Ok(TK_EOF);
61            }
62            (_, Some(tuple), _) => tuple,
63        };
64        token_type
65    };
66    if t == TK_ID
67        || t == TK_STRING
68        || t == TK_JOIN_KW
69        || t == TK_WINDOW
70        || t == TK_OVER
71        || yyParser::parse_fallback(t as YYCODETYPE) == TK_ID as YYCODETYPE
72    {
73        t = TK_ID;
74    }
75    Ok(t)
76}
77
78/*
79 ** The following three functions are called immediately after the tokenizer
80 ** reads the keywords WINDOW, OVER and FILTER, respectively, to determine
81 ** whether the token should be treated as a keyword or an SQL identifier.
82 ** This cannot be handled by the usual lemon %fallback method, due to
83 ** the ambiguity in some constructions. e.g.
84 **
85 **   SELECT sum(x) OVER ...
86 **
87 ** In the above, "OVER" might be a keyword, or it might be an alias for the
88 ** sum(x) expression. If a "%fallback ID OVER" directive were added to
89 ** grammar, then SQLite would always treat "OVER" as an alias, making it
90 ** impossible to call a window-function without a FILTER clause.
91 **
92 ** WINDOW is treated as a keyword if:
93 **
94 **   * the following token is an identifier, or a keyword that can fallback
95 **     to being an identifier, and
96 **   * the token after than one is TK_AS.
97 **
98 ** OVER is a keyword if:
99 **
100 **   * the previous token was TK_RP, and
101 **   * the next token is either TK_LP or an identifier.
102 **
103 ** FILTER is a keyword if:
104 **
105 **   * the previous token was TK_RP, and
106 **   * the next token is TK_LP.
107 */
108fn analyze_window_keyword(
109    scanner: &mut Scanner<Tokenizer>,
110    input: &[u8],
111) -> Result<TokenType, Error> {
112    let t = get_token(scanner, input)?;
113    if t != TK_ID {
114        return Ok(TK_ID);
115    }
116    let t = get_token(scanner, input)?;
117    if t != TK_AS {
118        return Ok(TK_ID);
119    }
120    Ok(TK_WINDOW)
121}
122fn analyze_over_keyword(
123    scanner: &mut Scanner<Tokenizer>,
124    input: &[u8],
125    last_token: TokenType,
126) -> Result<TokenType, Error> {
127    if last_token == TK_RP {
128        let t = get_token(scanner, input)?;
129        if t == TK_LP || t == TK_ID {
130            return Ok(TK_OVER);
131        }
132    }
133    Ok(TK_ID)
134}
135fn analyze_filter_keyword(
136    scanner: &mut Scanner<Tokenizer>,
137    input: &[u8],
138    last_token: TokenType,
139) -> Result<TokenType, Error> {
140    if last_token == TK_RP && get_token(scanner, input)? == TK_LP {
141        return Ok(TK_FILTER);
142    }
143    Ok(TK_ID)
144}
145
146macro_rules! try_with_position {
147    ($input:expr, $offset:expr, $expr:expr) => {
148        match $expr {
149            Ok(val) => val,
150            Err(err) => {
151                let mut err = Error::from(err);
152                err.position(Pos::from($input, $offset));
153                return Err(err);
154            }
155        }
156    };
157}
158
159impl<'input> FallibleIterator for Parser<'input> {
160    type Item = Cmd<'input>;
161    type Error = Error;
162
163    fn next(&mut self) -> Result<Option<Cmd<'input>>, Error> {
164        //print!("line: {}, column: {}: ", self.scanner.line(), self.scanner.column());
165        self.parser.ctx.reset();
166        let mut last_token_parsed = TK_EOF;
167        let offset;
168        let mut eof = false;
169        loop {
170            let (start, (value, mut token_type), end) = match self.scanner.scan(self.input)? {
171                (start, None, _) => {
172                    offset = start;
173                    eof = true;
174                    break;
175                }
176                (start, Some(tuple), end) => (start, tuple, end),
177            };
178            let token = if token_type >= TK_WINDOW {
179                debug_assert!(
180                    token_type == TK_OVER || token_type == TK_FILTER || token_type == TK_WINDOW
181                );
182                self.scanner.mark();
183                if token_type == TK_WINDOW {
184                    token_type = analyze_window_keyword(&mut self.scanner, self.input)?;
185                } else if token_type == TK_OVER {
186                    token_type =
187                        analyze_over_keyword(&mut self.scanner, self.input, last_token_parsed)?;
188                } else if token_type == TK_FILTER {
189                    token_type =
190                        analyze_filter_keyword(&mut self.scanner, self.input, last_token_parsed)?;
191                }
192                self.scanner.reset_to_mark();
193                token_type.to_token(start, value, end)
194            } else {
195                token_type.to_token(start, value, end)
196            };
197            //println!("({:?}, {:?})", token_type, token);
198            try_with_position!(
199                self.input,
200                start,
201                self.parser.sqlite3Parser(token_type, token)
202            );
203            last_token_parsed = token_type;
204            if self.parser.ctx.done() {
205                //println!();
206                offset = start;
207                break;
208            }
209        }
210        if last_token_parsed == TK_EOF {
211            return Ok(None); // empty input
212        }
213        /* Upon reaching the end of input, call the parser two more times
214        with tokens TK_SEMI and 0, in that order. */
215        if eof && self.parser.ctx.is_ok() {
216            if last_token_parsed != TK_SEMI {
217                try_with_position!(
218                    self.input,
219                    offset,
220                    self.parser
221                        .sqlite3Parser(TK_SEMI, sentinel(self.input.len()))
222                );
223            }
224            try_with_position!(
225                self.input,
226                offset,
227                self.parser
228                    .sqlite3Parser(TK_EOF, sentinel(self.input.len()))
229            );
230        }
231        self.parser.sqlite3ParserFinalize();
232        if let Some(e) = self.parser.ctx.error() {
233            let err = Error::ParserError(e, Some(Pos::from(self.input, offset)));
234            return Err(err);
235        }
236        let cmd = self.parser.ctx.cmd();
237        #[cfg(feature = "extra_checks")]
238        if let Some(ref cmd) = cmd {
239            if let Err(e) = cmd.check() {
240                let err = Error::ParserError(e, Some(Pos::from(self.input, offset)));
241                return Err(err);
242            }
243        }
244        Ok(cmd)
245    }
246}
247
248/// SQL token
249pub type Token<'input> = (&'input [u8], TokenType);
250
251/// SQL lexer
252#[derive(Default)]
253pub struct Tokenizer {}
254
255impl Tokenizer {
256    /// Constructor
257    pub fn new() -> Self {
258        Self {}
259    }
260}
261
262/// ```rust
263/// use sqlite3_parser::lexer::sql::Tokenizer;
264/// use sqlite3_parser::lexer::Scanner;
265///
266/// let tokenizer = Tokenizer::new();
267/// let input = b"PRAGMA parser_trace=ON;";
268/// let mut s = Scanner::new(tokenizer);
269/// let Ok((_, Some((token1, _)), _)) = s.scan(input) else { panic!() };
270/// s.scan(input).unwrap();
271/// assert!(b"PRAGMA".eq_ignore_ascii_case(token1));
272/// ```
273impl Splitter for Tokenizer {
274    type Error = Error;
275    type TokenType = TokenType;
276
277    fn split<'input>(
278        &mut self,
279        data: &'input [u8],
280    ) -> Result<(Option<Token<'input>>, usize), Error> {
281        if data[0].is_ascii_whitespace() {
282            // eat as much space as possible
283            return Ok((
284                None,
285                match data.iter().skip(1).position(|&b| !b.is_ascii_whitespace()) {
286                    Some(i) => i + 1,
287                    _ => data.len(),
288                },
289            ));
290        }
291        match data[0] {
292            b'-' => {
293                if let Some(b) = data.get(1) {
294                    if *b == b'-' {
295                        // eat comment
296                        if let Some(i) = memchr(b'\n', data) {
297                            Ok((None, i + 1))
298                        } else {
299                            Ok((None, data.len()))
300                        }
301                    } else if *b == b'>' {
302                        if let Some(b) = data.get(2) {
303                            if *b == b'>' {
304                                return Ok((Some((&data[..3], TK_PTR)), 3));
305                            }
306                        }
307                        Ok((Some((&data[..2], TK_PTR)), 2))
308                    } else {
309                        Ok((Some((&data[..1], TK_MINUS)), 1))
310                    }
311                } else {
312                    Ok((Some((&data[..1], TK_MINUS)), 1))
313                }
314            }
315            b'(' => Ok((Some((&data[..1], TK_LP)), 1)),
316            b')' => Ok((Some((&data[..1], TK_RP)), 1)),
317            b';' => Ok((Some((&data[..1], TK_SEMI)), 1)),
318            b'+' => Ok((Some((&data[..1], TK_PLUS)), 1)),
319            b'*' => Ok((Some((&data[..1], TK_STAR)), 1)),
320            b'/' => {
321                if let Some(b) = data.get(1) {
322                    if *b == b'*' {
323                        // eat comment
324                        let mut pb = 0;
325                        let mut end = None;
326                        for (i, b) in data.iter().enumerate().skip(2) {
327                            if *b == b'/' && pb == b'*' {
328                                end = Some(i);
329                                break;
330                            }
331                            pb = *b;
332                        }
333                        if let Some(i) = end {
334                            Ok((None, i + 1))
335                        } else {
336                            Err(Error::UnterminatedBlockComment(None))
337                        }
338                    } else {
339                        Ok((Some((&data[..1], TK_SLASH)), 1))
340                    }
341                } else {
342                    Ok((Some((&data[..1], TK_SLASH)), 1))
343                }
344            }
345            b'%' => Ok((Some((&data[..1], TK_REM)), 1)),
346            b'=' => {
347                if let Some(b) = data.get(1) {
348                    Ok(if *b == b'=' {
349                        (Some((&data[..2], TK_EQ)), 2)
350                    } else {
351                        (Some((&data[..1], TK_EQ)), 1)
352                    })
353                } else {
354                    Ok((Some((&data[..1], TK_EQ)), 1))
355                }
356            }
357            b'<' => {
358                if let Some(b) = data.get(1) {
359                    Ok(match *b {
360                        b'=' => (Some((&data[..2], TK_LE)), 2),
361                        b'>' => (Some((&data[..2], TK_NE)), 2),
362                        b'<' => (Some((&data[..2], TK_LSHIFT)), 2),
363                        _ => (Some((&data[..1], TK_LT)), 1),
364                    })
365                } else {
366                    Ok((Some((&data[..1], TK_LT)), 1))
367                }
368            }
369            b'>' => {
370                if let Some(b) = data.get(1) {
371                    Ok(match *b {
372                        b'=' => (Some((&data[..2], TK_GE)), 2),
373                        b'>' => (Some((&data[..2], TK_RSHIFT)), 2),
374                        _ => (Some((&data[..1], TK_GT)), 1),
375                    })
376                } else {
377                    Ok((Some((&data[..1], TK_GT)), 1))
378                }
379            }
380            b'!' => {
381                if let Some(b) = data.get(1) {
382                    if *b == b'=' {
383                        Ok((Some((&data[..2], TK_NE)), 2))
384                    } else {
385                        Err(Error::ExpectedEqualsSign(None))
386                    }
387                } else {
388                    Err(Error::ExpectedEqualsSign(None))
389                }
390            }
391            b'|' => {
392                if let Some(b) = data.get(1) {
393                    Ok(if *b == b'|' {
394                        (Some((&data[..2], TK_CONCAT)), 2)
395                    } else {
396                        (Some((&data[..1], TK_BITOR)), 1)
397                    })
398                } else {
399                    Ok((Some((&data[..1], TK_BITOR)), 1))
400                }
401            }
402            b',' => Ok((Some((&data[..1], TK_COMMA)), 1)),
403            b'&' => Ok((Some((&data[..1], TK_BITAND)), 1)),
404            b'~' => Ok((Some((&data[..1], TK_BITNOT)), 1)),
405            quote @ (b'`' | b'\'' | b'"') => literal(data, quote),
406            b'.' => {
407                if let Some(b) = data.get(1) {
408                    if b.is_ascii_digit() {
409                        fractional_part(data, 0)
410                    } else {
411                        Ok((Some((&data[..1], TK_DOT)), 1))
412                    }
413                } else {
414                    Ok((Some((&data[..1], TK_DOT)), 1))
415                }
416            }
417            b'0'..=b'9' => number(data),
418            b'[' => {
419                if let Some(i) = memchr(b']', data) {
420                    // Keep original quotes / '[' ... ’]'
421                    Ok((Some((&data[0..=i], TK_ID)), i + 1))
422                } else {
423                    Err(Error::UnterminatedBracket(None))
424                }
425            }
426            b'?' => {
427                match data.iter().skip(1).position(|&b| !b.is_ascii_digit()) {
428                    Some(i) => {
429                        // do not include the '?' in the token
430                        Ok((Some((&data[1..=i], TK_VARIABLE)), i + 1))
431                    }
432                    None => Ok((Some((&data[1..], TK_VARIABLE)), data.len())),
433                }
434            }
435            b'$' | b'@' | b'#' | b':' => {
436                match data
437                    .iter()
438                    .skip(1)
439                    .position(|&b| !is_identifier_continue(b))
440                {
441                    Some(0) => Err(Error::BadVariableName(None)),
442                    Some(i) => {
443                        // '$' is included as part of the name
444                        Ok((Some((&data[..=i], TK_VARIABLE)), i + 1))
445                    }
446                    None => {
447                        if data.len() == 1 {
448                            return Err(Error::BadVariableName(None));
449                        }
450                        Ok((Some((data, TK_VARIABLE)), data.len()))
451                    }
452                }
453            }
454            b if is_identifier_start(b) => {
455                if b == b'x' || b == b'X' {
456                    if let Some(&b'\'') = data.get(1) {
457                        blob_literal(data)
458                    } else {
459                        Ok(self.identifierish(data))
460                    }
461                } else {
462                    Ok(self.identifierish(data))
463                }
464            }
465            _ => Err(Error::UnrecognizedToken(None)),
466        }
467    }
468}
469
470fn literal(data: &[u8], quote: u8) -> Result<(Option<Token<'_>>, usize), Error> {
471    debug_assert_eq!(data[0], quote);
472    let tt = if quote == b'\'' { TK_STRING } else { TK_ID };
473    let mut pb = 0;
474    let mut end = None;
475    // data[0] == quote => skip(1)
476    for (i, b) in data.iter().enumerate().skip(1) {
477        if *b == quote {
478            if pb == quote {
479                // escaped quote
480                pb = 0;
481                continue;
482            }
483        } else if pb == quote {
484            end = Some(i);
485            break;
486        }
487        pb = *b;
488    }
489    if end.is_some() || pb == quote {
490        let i = match end {
491            Some(i) => i,
492            _ => data.len(),
493        };
494        // keep original quotes in the token
495        Ok((Some((&data[0..i], tt)), i))
496    } else {
497        Err(Error::UnterminatedLiteral(None))
498    }
499}
500
501fn blob_literal(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
502    debug_assert!(data[0] == b'x' || data[0] == b'X');
503    debug_assert_eq!(data[1], b'\'');
504    if let Some((i, b)) = data
505        .iter()
506        .enumerate()
507        .skip(2)
508        .find(|&(_, &b)| !b.is_ascii_hexdigit())
509    {
510        if *b != b'\'' || !i.is_multiple_of(2) {
511            return Err(Error::MalformedBlobLiteral(None));
512        }
513        Ok((Some((&data[2..i], TK_BLOB)), i + 1))
514    } else {
515        Err(Error::MalformedBlobLiteral(None))
516    }
517}
518
519fn number(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
520    debug_assert!(data[0].is_ascii_digit());
521    if data[0] == b'0' {
522        if let Some(b) = data.get(1) {
523            if *b == b'x' || *b == b'X' {
524                return hex_integer(data);
525            }
526        } else {
527            return Ok((Some((data, TK_INTEGER)), data.len()));
528        }
529    }
530    if let Some((i, b)) = find_end_of_number(data, 1, u8::is_ascii_digit)? {
531        if b == b'.' {
532            return fractional_part(data, i);
533        } else if b == b'e' || b == b'E' {
534            return exponential_part(data, i);
535        } else if is_identifier_start(b) {
536            return Err(Error::BadNumber(None));
537        }
538        Ok((Some((&data[..i], TK_INTEGER)), i))
539    } else {
540        Ok((Some((data, TK_INTEGER)), data.len()))
541    }
542}
543
544fn hex_integer(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
545    debug_assert_eq!(data[0], b'0');
546    debug_assert!(data[1] == b'x' || data[1] == b'X');
547    if let Some((i, b)) = find_end_of_number(data, 2, u8::is_ascii_hexdigit)? {
548        // Must not be empty (Ox is invalid)
549        if i == 2 || is_identifier_start(b) {
550            return Err(Error::MalformedHexInteger(None));
551        }
552        Ok((Some((&data[..i], TK_INTEGER)), i))
553    } else {
554        // Must not be empty (Ox is invalid)
555        if data.len() == 2 {
556            return Err(Error::MalformedHexInteger(None));
557        }
558        Ok((Some((data, TK_INTEGER)), data.len()))
559    }
560}
561
562fn fractional_part(data: &[u8], i: usize) -> Result<(Option<Token<'_>>, usize), Error> {
563    debug_assert_eq!(data[i], b'.');
564    if let Some((i, b)) = find_end_of_number(data, i + 1, u8::is_ascii_digit)? {
565        if b == b'e' || b == b'E' {
566            return exponential_part(data, i);
567        } else if is_identifier_start(b) {
568            return Err(Error::BadNumber(None));
569        }
570        Ok((Some((&data[..i], TK_FLOAT)), i))
571    } else {
572        Ok((Some((data, TK_FLOAT)), data.len()))
573    }
574}
575
576fn exponential_part(data: &[u8], i: usize) -> Result<(Option<Token<'_>>, usize), Error> {
577    debug_assert!(data[i] == b'e' || data[i] == b'E');
578    // data[i] == 'e'|'E'
579    if let Some(b) = data.get(i + 1) {
580        let i = if *b == b'+' || *b == b'-' { i + 1 } else { i };
581        if let Some((j, b)) = find_end_of_number(data, i + 1, u8::is_ascii_digit)? {
582            if j == i + 1 || is_identifier_start(b) {
583                return Err(Error::BadNumber(None));
584            }
585            Ok((Some((&data[..j], TK_FLOAT)), j))
586        } else {
587            if data.len() == i + 1 {
588                return Err(Error::BadNumber(None));
589            }
590            Ok((Some((data, TK_FLOAT)), data.len()))
591        }
592    } else {
593        Err(Error::BadNumber(None))
594    }
595}
596
597fn find_end_of_number(
598    data: &[u8],
599    i: usize,
600    test: fn(&u8) -> bool,
601) -> Result<Option<(usize, u8)>, Error> {
602    for (j, &b) in data.iter().enumerate().skip(i) {
603        if test(&b) {
604            continue;
605        } else if b == b'_' {
606            if j >= 1 && data.get(j - 1).is_some_and(test) && data.get(j + 1).is_some_and(test) {
607                continue;
608            }
609            return Err(Error::BadNumber(None));
610        } else if b == b'$' {
611            return Err(Error::BadNumber(None));
612        } else {
613            return Ok(Some((j, b)));
614        }
615    }
616    Ok(None)
617}
618
619impl Tokenizer {
620    fn identifierish<'input>(&mut self, data: &'input [u8]) -> (Option<Token<'input>>, usize) {
621        debug_assert!(is_identifier_start(data[0]));
622        // data[0] is_identifier_start => skip(1)
623        let end = data
624            .iter()
625            .skip(1)
626            .position(|&b| !is_identifier_continue(b));
627        let i = match end {
628            Some(i) => i + 1,
629            _ => data.len(),
630        };
631        let word = &data[..i];
632        let tt = if word.len() >= 2 && word.len() <= MAX_KEYWORD_LEN && word.is_ascii() {
633            keyword_token(word).unwrap_or(TK_ID)
634        } else {
635            TK_ID
636        };
637        (Some((word, tt)), i)
638    }
639}
640
641#[cfg(test)]
642mod tests {
643    use super::Tokenizer;
644    use crate::dialect::TokenType;
645    use crate::lexer::sql::Error;
646    use crate::lexer::Scanner;
647    use std::assert_matches;
648
649    #[test]
650    fn fallible_iterator() -> Result<(), Error> {
651        let mut s = scan();
652        let input = b"PRAGMA parser_trace=ON;";
653        expect_token(&mut s, input, b"PRAGMA", TokenType::TK_PRAGMA)?;
654        expect_token(&mut s, input, b"parser_trace", TokenType::TK_ID)?;
655        Ok(())
656    }
657
658    #[test]
659    fn none() -> Result<(), Error> {
660        expect_none(b" ")?;
661        expect_none(b"  ")?;
662        expect_none(b"--")?;
663        expect_none(b"/**/")?;
664        expect_none(b"/*a/*b*/")?;
665        expect_none(b"/*\na\n/*\nb\n*/")?;
666        Ok(())
667    }
668
669    #[test]
670    fn ints() -> Result<(), Error> {
671        expect_single_token(b"45", b"45", TokenType::TK_INTEGER)?;
672        expect_single_token(b"0xFF", b"0xFF", TokenType::TK_INTEGER)?;
673        expect_single_token(b"0xFFFFFFFF", b"0xFFFFFFFF", TokenType::TK_INTEGER)?;
674        expect_single_token(b"0x123FFFFFFFF", b"0x123FFFFFFFF", TokenType::TK_INTEGER)?;
675        expect_single_token(
676            b"0xFFFFFFFFFFFFFFFF",
677            b"0xFFFFFFFFFFFFFFFF",
678            TokenType::TK_INTEGER,
679        )?;
680        expect_single_token(
681            b"0x7FFFFFFFFFFFFFFF",
682            b"0x7FFFFFFFFFFFFFFF",
683            TokenType::TK_INTEGER,
684        )?;
685        expect_single_token(
686            b"9223372036854775808",
687            b"9223372036854775808",
688            TokenType::TK_INTEGER,
689        )?;
690        expect_single_token(b"1_000", b"1_000", TokenType::TK_INTEGER)?;
691        expect_single_token(
692            b"9_223_372_036_854_775_807",
693            b"9_223_372_036_854_775_807",
694            TokenType::TK_INTEGER,
695        )?;
696        Ok(())
697    }
698
699    #[test]
700    fn floats() -> Result<(), Error> {
701        expect_single_token(b"1e12", b"1e12", TokenType::TK_FLOAT)?;
702        expect_single_token(b"1.0", b"1.0", TokenType::TK_FLOAT)?;
703        expect_single_token(b"1e1000", b"1e1000", TokenType::TK_FLOAT)?;
704        expect_single_token(b"1.1_1", b"1.1_1", TokenType::TK_FLOAT)?;
705        expect_single_token(b"1_0.1_1", b"1_0.1_1", TokenType::TK_FLOAT)?;
706        expect_single_token(b"1e1_000", b"1e1_000", TokenType::TK_FLOAT)?;
707        expect_single_token(b"12_3_456.7_8_9", b"12_3_456.7_8_9", TokenType::TK_FLOAT)?;
708        expect_single_token(b".123", b".123", TokenType::TK_FLOAT)?;
709        expect_single_token(b".456e789", b".456e789", TokenType::TK_FLOAT)?;
710        expect_single_token(b".456E-789", b".456E-789", TokenType::TK_FLOAT)?;
711        expect_single_token(b".456E-789", b".456E-789", TokenType::TK_FLOAT)?;
712        Ok(())
713    }
714
715    #[test]
716    fn invalid_number_literal() {
717        expect_bad_number(b"1E");
718        expect_bad_number(b"123a456");
719        expect_bad_number(b"123__456");
720        expect_bad_number(b"0xFF__EF");
721        expect_bad_number(b"0xFFEF_");
722        expect_bad_number(b"0xFFEF_");
723        expect_bad_number(b"1_");
724        expect_bad_number(b"1_.4");
725        expect_bad_number(b"1e_4");
726        expect_bad_number(b"1_e4");
727        expect_bad_number(b"1.4_e4");
728        expect_bad_number(b"1.4e+_4");
729        expect_bad_number(b"1.4e-_4");
730        expect_bad_number(b"1.4e4_");
731        expect_bad_number(b"1.4e_4");
732        expect_bad_number(b"12__34");
733        expect_bad_number(b"1234_");
734        expect_bad_number(b"12._34");
735        expect_bad_number(b"12.34_");
736        expect_bad_number(b"1.0e1_______2");
737        expect_bad_number(b"5$");
738    }
739
740    #[test]
741    fn single() -> Result<(), Error> {
742        expect_single_token(b"-", b"-", TokenType::TK_MINUS)?;
743        expect_single_token(b"->", b"->", TokenType::TK_PTR)?;
744        expect_single_token(b"->>", b"->>", TokenType::TK_PTR)?;
745        expect_single_token(b"(", b"(", TokenType::TK_LP)?;
746        expect_single_token(b")", b")", TokenType::TK_RP)?;
747        expect_single_token(b";", b";", TokenType::TK_SEMI)?;
748        expect_single_token(b"+", b"+", TokenType::TK_PLUS)?;
749        expect_single_token(b"*", b"*", TokenType::TK_STAR)?;
750        expect_single_token(b"/", b"/", TokenType::TK_SLASH)?;
751        expect_single_token(b"%", b"%", TokenType::TK_REM)?;
752        expect_single_token(b"=", b"=", TokenType::TK_EQ)?;
753        expect_single_token(b"==", b"==", TokenType::TK_EQ)?;
754        expect_single_token(b"<", b"<", TokenType::TK_LT)?;
755        expect_single_token(b"<=", b"<=", TokenType::TK_LE)?;
756        expect_single_token(b"<>", b"<>", TokenType::TK_NE)?;
757        expect_single_token(b"<<", b"<<", TokenType::TK_LSHIFT)?;
758        expect_single_token(b">", b">", TokenType::TK_GT)?;
759        expect_single_token(b">=", b">=", TokenType::TK_GE)?;
760        expect_single_token(b">>", b">>", TokenType::TK_RSHIFT)?;
761        expect_single_token(b"!=", b"!=", TokenType::TK_NE)?;
762        expect_single_token(b"|", b"|", TokenType::TK_BITOR)?;
763        expect_single_token(b"||", b"||", TokenType::TK_CONCAT)?;
764        expect_single_token(b",", b",", TokenType::TK_COMMA)?;
765        expect_single_token(b"&", b"&", TokenType::TK_BITAND)?;
766        expect_single_token(b"~", b"~", TokenType::TK_BITNOT)?;
767        expect_single_token(b"[]", b"[]", TokenType::TK_ID)?;
768        expect_single_token(b"[a]", b"[a]", TokenType::TK_ID)?;
769        expect_single_token(b"?", b"", TokenType::TK_VARIABLE)?;
770        expect_single_token(b"?1", b"1", TokenType::TK_VARIABLE)?;
771        expect_single_token(b"$a", b"$a", TokenType::TK_VARIABLE)?;
772        expect_single_token(b"@a", b"@a", TokenType::TK_VARIABLE)?;
773        expect_single_token(b"#a", b"#a", TokenType::TK_VARIABLE)?;
774        expect_single_token(b":a", b":a", TokenType::TK_VARIABLE)?;
775        expect_single_token(b"x''", b"", TokenType::TK_BLOB)?;
776        expect_single_token(b"x'ab'", b"ab", TokenType::TK_BLOB)?;
777        expect_single_token(b"x", b"x", TokenType::TK_ID)?;
778        expect_single_token(b"X''", b"", TokenType::TK_BLOB)?;
779        expect_single_token(b"X'ab'", b"ab", TokenType::TK_BLOB)?;
780        expect_single_token(b"X", b"X", TokenType::TK_ID)?;
781        expect_single_token(b"SELECT", b"SELECT", TokenType::TK_SELECT)?;
782        Ok(())
783    }
784
785    fn expect_token(
786        s: &mut Scanner<Tokenizer>,
787        input: &[u8],
788        token: &[u8],
789        token_type: TokenType,
790    ) -> Result<(), Error> {
791        let (t, tt) = s.scan(input)?.1.unwrap();
792        assert_eq!(token, t);
793        assert_eq!(token_type, tt);
794        Ok(())
795    }
796
797    fn expect_none(input: &[u8]) -> Result<(), Error> {
798        let mut s = scan();
799        assert!(s.scan(input)?.1.is_none());
800        Ok(())
801    }
802    fn expect_single_token(input: &[u8], token: &[u8], token_type: TokenType) -> Result<(), Error> {
803        let mut s = scan();
804        expect_token(&mut s, input, token, token_type)?;
805        assert!(s.scan(input)?.1.is_none());
806        Ok(())
807    }
808    fn expect_bad_number(input: &[u8]) {
809        let err = expect_error(input);
810        assert_matches!(err, Error::BadNumber(_));
811    }
812    fn expect_error(input: &[u8]) -> Error {
813        let mut s = scan();
814        s.scan(input).unwrap_err()
815    }
816    fn scan() -> Scanner<Tokenizer> {
817        let tokenizer = Tokenizer::new();
818        Scanner::new(tokenizer)
819    }
820}
sqlite3_parser/lexer/sql/mod.rs

sqlite3_parser/lexer/sql/
mod.rs