mod cursor;
mod token;
use cursor::{Cursor, EOF_CHAR};
pub use token::{Base, LiteralKind, Token, TokenKind};
/// Returns `true` if `c` may begin an unquoted identifier.
///
/// PostgreSQL's scanner defines `ident_start` as the *byte* class
/// `[A-Za-z\200-\377_]`. Every byte of a multi-byte UTF-8 sequence has its
/// high bit set, so in terms of `char`s this means "ASCII letter, underscore,
/// or any non-ASCII character" -- not only the Latin-1 range U+0080..=U+00FF.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..=char::MAX)
}
/// Returns `true` if `c` may appear after the first character of an unquoted
/// identifier.
///
/// PostgreSQL's scanner defines `ident_cont` as the *byte* class
/// `[A-Za-z\200-\377_0-9\$]`. Bytes `\200-\377` are exactly the bytes that
/// make up non-ASCII UTF-8 characters, so any non-ASCII `char` is accepted
/// here rather than only U+0080..=U+00FF.
const fn is_ident_cont(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..=char::MAX
    )
}
/// Returns `true` for the characters the lexer treats as whitespace: space,
/// horizontal tab, line feed, vertical tab, form feed and carriage return.
const fn is_whitespace(c: char) -> bool {
    // '\t' (0x09), '\n' (0x0A), VT (0x0B), FF (0x0C) and '\r' (0x0D) are
    // contiguous code points, so a single range covers all five.
    matches!(c, '\t'..='\r' | ' ')
}
impl Cursor<'_> {
pub(crate) fn advance_token(&mut self) -> Token {
let Some(first_char) = self.bump() else {
return Token::new(TokenKind::Eof, 0);
};
let token_kind = match first_char {
'/' => match self.first() {
'*' => self.block_comment(),
_ => TokenKind::Slash,
},
'-' => match self.first() {
'-' => self.line_comment(),
_ => TokenKind::Minus,
},
c if is_whitespace(c) => self.whitespace(),
'u' | 'U' => match self.first() {
'&' => {
self.bump();
self.prefixed_string(
|terminated| LiteralKind::UnicodeEscStr { terminated },
true,
)
}
_ => self.ident_or_unknown_prefix(),
},
'e' | 'E' => {
self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
}
'b' | 'B' => {
self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
}
'x' | 'X' => {
self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
}
c if is_ident_start(c) => self.ident(),
c @ '0'..='9' => {
let literal_kind = self.number(c);
TokenKind::Literal { kind: literal_kind }
}
'.' => match self.first() {
'0'..='9' => {
let literal_kind = self.number('.');
TokenKind::Literal { kind: literal_kind }
}
_ => TokenKind::Dot,
},
';' => TokenKind::Semi,
',' => TokenKind::Comma,
'(' => TokenKind::OpenParen,
')' => TokenKind::CloseParen,
'[' => TokenKind::OpenBracket,
']' => TokenKind::CloseBracket,
'{' => TokenKind::OpenCurly,
'}' => TokenKind::CloseCurly,
'@' => TokenKind::At,
'#' => TokenKind::Pound,
'~' => TokenKind::Tilde,
'?' => TokenKind::Question,
':' => TokenKind::Colon,
'$' => {
if is_ident_start(self.first()) || self.first() == '$' {
self.dollar_quoted_string()
} else {
while self.first().is_ascii_digit() {
self.bump();
}
TokenKind::PositionalParam
}
}
'`' => TokenKind::Backtick,
'=' => TokenKind::Eq,
'!' => TokenKind::Bang,
'<' => TokenKind::Lt,
'>' => TokenKind::Gt,
'&' => TokenKind::And,
'|' => TokenKind::Or,
'+' => TokenKind::Plus,
'*' => TokenKind::Star,
'^' => TokenKind::Caret,
'%' => TokenKind::Percent,
'\'' => {
let terminated = self.single_quoted_string();
let kind = LiteralKind::Str { terminated };
TokenKind::Literal { kind }
}
'"' => {
let terminated = self.double_quoted_string();
TokenKind::QuotedIdent { terminated }
}
_ => TokenKind::Unknown,
};
let res = Token::new(token_kind, self.pos_within_token());
self.reset_pos_within_token();
res
}
pub(crate) fn ident(&mut self) -> TokenKind {
self.eat_while(is_ident_cont);
TokenKind::Ident
}
pub(crate) fn whitespace(&mut self) -> TokenKind {
self.eat_while(is_whitespace);
TokenKind::Whitespace
}
/// Consumes identifier-continuation characters, then classifies the result:
/// if the very next character is `#`, `"` or `'` the consumed text is an
/// `UnknownPrefix`, otherwise it is an ordinary `Ident`.
fn ident_or_unknown_prefix(&mut self) -> TokenKind {
    self.eat_while(is_ident_cont);
    let followed_by_quote_like = matches!(self.first(), '#' | '"' | '\'');
    if followed_by_quote_like {
        TokenKind::UnknownPrefix
    } else {
        TokenKind::Ident
    }
}
/// Lexes a `--` line comment. The first `-` has already been consumed by the
/// caller; this consumes the second one and everything up to, but not
/// including, the next line feed.
pub(crate) fn line_comment(&mut self) -> TokenKind {
    // Second '-' of the "--" introducer.
    self.bump();
    self.eat_while(|ch| !matches!(ch, '\n'));
    TokenKind::LineComment
}
/// Lexes a `/* ... */` block comment, honouring nesting. The leading `/` has
/// already been consumed by the caller; this consumes the `*` and then scans
/// until every opened comment has been closed or the input ends.
///
/// The returned `BlockComment` is `terminated` only when the nesting depth
/// returned to zero.
pub(crate) fn block_comment(&mut self) -> TokenKind {
    // The '*' of the opening "/*".
    self.bump();

    let mut open = 1usize;
    while open > 0 {
        let Some(current) = self.bump() else {
            // Ran out of input with comments still open.
            break;
        };
        if current == '/' && self.first() == '*' {
            // Nested opener.
            self.bump();
            open += 1;
        } else if current == '*' && self.first() == '/' {
            // Closer for the innermost open comment.
            self.bump();
            open -= 1;
        }
    }

    TokenKind::BlockComment {
        terminated: open == 0,
    }
}
/// Lexes what follows a string prefix (such as `E`, `B`, `X` or `U&`).
///
/// * If a `'` follows, a single-quoted string is lexed and wrapped with
///   `mk_kind`, which receives whether the string was terminated.
/// * If a `"` follows and `allows_double` is set, a quoted identifier is
///   lexed instead.
/// * Otherwise the prefix was not a prefix after all and the text is lexed
///   as an identifier (or unknown prefix).
fn prefixed_string(
    &mut self,
    mk_kind: fn(bool) -> LiteralKind,
    allows_double: bool,
) -> TokenKind {
    let quote = self.first();

    if quote == '\'' {
        self.bump();
        let kind = mk_kind(self.single_quoted_string());
        return TokenKind::Literal { kind };
    }

    if quote == '"' && allows_double {
        self.bump();
        return TokenKind::QuotedIdent {
            terminated: self.double_quoted_string(),
        };
    }

    self.ident_or_unknown_prefix()
}
/// Lexes the rest of a numeric literal and classifies it.
///
/// `first_digit` is the character the caller already consumed: either a
/// decimal digit, or `'.'` when the literal has no integer part (the caller
/// only passes `'.'` when a digit follows it).
///
/// * `0b` / `0o` / `0x` prefixes select the base; a prefix with no digits
///   after it yields `Int { empty_int: true }`.
/// * A fraction (`.`) or an exponent (`e` / `E`) makes the literal a
///   `Float`; `empty_exponent` is set when the exponent has no digits.
/// * `_` is accepted between digits as a separator.
///
/// A literal that starts with a dot (for example `.001`) is always a
/// `Float`: its digits are the fractional part, not an integer part, and a
/// second `.` is not absorbed into the same token.
fn number(&mut self, first_digit: char) -> LiteralKind {
    let mut base = Base::Decimal;

    let saw_dot = if first_digit == '.' {
        // Leading-dot form: the digits that follow are the fraction.
        self.eat_decimal_digits();
        true
    } else {
        if first_digit == '0' {
            // Possibly a base prefix; `has_digits` tracks whether the
            // prefix was followed by at least one digit.
            let has_digits = match self.first() {
                'b' | 'B' => {
                    base = Base::Binary;
                    self.bump();
                    self.eat_decimal_digits()
                }
                'o' | 'O' => {
                    base = Base::Octal;
                    self.bump();
                    self.eat_decimal_digits()
                }
                'x' | 'X' => {
                    base = Base::Hexadecimal;
                    self.bump();
                    self.eat_hexadecimal_digits()
                }
                '0'..='9' | '_' => {
                    self.eat_decimal_digits();
                    true
                }
                // Fraction or exponent directly after the zero: handled below.
                '.' | 'e' | 'E' => true,
                // Just a lone `0`.
                _ => {
                    return LiteralKind::Int {
                        base,
                        empty_int: false,
                    };
                }
            };
            if !has_digits {
                return LiteralKind::Int {
                    base,
                    empty_int: true,
                };
            }
        } else {
            self.eat_decimal_digits();
        }

        // Optional fractional part. The digit check keeps a `_` directly
        // after the dot from being consumed as a separator.
        if self.first() == '.' {
            self.bump();
            if self.first().is_ascii_digit() {
                self.eat_decimal_digits();
            }
            true
        } else {
            false
        }
    };

    // Optional exponent; without a dot or an exponent this is an integer.
    let empty_exponent = match self.first() {
        'e' | 'E' => {
            self.bump();
            !self.eat_float_exponent()
        }
        _ if saw_dot => false,
        _ => {
            return LiteralKind::Int {
                base,
                empty_int: false,
            };
        }
    };

    LiteralKind::Float {
        base,
        empty_exponent,
    }
}
fn single_quoted_string(&mut self) -> bool {
loop {
match self.first() {
'\'' => {
self.bump();
match self.first() {
'\'' => {
self.bump();
}
_ => return true,
}
}
EOF_CHAR if self.is_eof() => break,
_ => {
self.bump();
}
}
}
false
}
/// Consumes the body of a double-quoted identifier whose opening `"` has
/// already been consumed. A doubled quote (`""`) is an escaped quote and does
/// not end the identifier.
///
/// Returns `true` if the closing quote was found, `false` if the input ended
/// first.
fn double_quoted_string(&mut self) -> bool {
    loop {
        match self.bump() {
            None => return false,
            Some('"') => {
                if self.first() != '"' {
                    return true;
                }
                // Second half of an escaped `""`.
                self.bump();
            }
            Some(_) => {}
        }
    }
}
/// Lexes a dollar-quoted string such as `$$body$$` or `$tag$body$tag$`.
///
/// The first `$` has already been consumed by the caller. This reads the
/// (possibly empty) tag up to the next `$`, then scans for the closing
/// delimiter, which must be a `$`, the same tag, and another `$`.
///
/// Returns a `DollarQuotedString` literal whose `terminated` flag is `false`
/// when the input ends before the closing delimiter is found.
///
/// NOTE(review): the opening tag is taken verbatim up to the next `$` and is
/// not validated against identifier rules here.
fn dollar_quoted_string(&mut self) -> TokenKind {
    // Collect the tag, e.g. `foo` in `$foo$hello$foo$`; empty for `$$`.
    let mut tag = vec![];
    while let Some(c) = self.bump() {
        if c == '$' {
            break;
        }
        tag.push(c);
    }

    loop {
        // The closing delimiter always starts with `$`, so that is the only
        // character worth stopping at. Searching for the tag's first
        // character instead would let a bare `tag$` inside the body end the
        // string early.
        self.eat_while(|c| c != '$');
        if self.is_eof() {
            return TokenKind::Literal {
                kind: LiteralKind::DollarQuotedString { terminated: false },
            };
        }
        // The `$` that may open the closing delimiter.
        self.bump();

        // Match the tag one character at a time. A mismatching character is
        // left unconsumed: it may itself be the `$` that starts the real
        // closing delimiter.
        let mut matched = 0;
        for &expected in &tag {
            if self.first() != expected {
                break;
            }
            self.bump();
            matched += 1;
        }

        if matched == tag.len() && self.first() == '$' {
            // Final `$` of the closing delimiter.
            self.bump();
            return TokenKind::Literal {
                kind: LiteralKind::DollarQuotedString { terminated: true },
            };
        }
    }
}
/// Consumes a run of decimal digits and `_` separators.
///
/// Returns `true` if at least one actual digit (not just underscores) was
/// consumed.
fn eat_decimal_digits(&mut self) -> bool {
    let mut saw_digit = false;
    loop {
        let c = self.first();
        if c.is_ascii_digit() {
            saw_digit = true;
        } else if c != '_' {
            return saw_digit;
        }
        self.bump();
    }
}
/// Consumes a run of hexadecimal digits (`0-9`, `a-f`, `A-F`) and `_`
/// separators.
///
/// Returns `true` if at least one actual digit (not just underscores) was
/// consumed.
fn eat_hexadecimal_digits(&mut self) -> bool {
    let mut saw_digit = false;
    loop {
        let c = self.first();
        if c.is_ascii_hexdigit() {
            saw_digit = true;
        } else if c != '_' {
            return saw_digit;
        }
        self.bump();
    }
}
/// Consumes the part of a float exponent that follows the `e` / `E` marker:
/// an optional sign and then decimal digits.
///
/// Returns `true` if the exponent contained at least one digit.
fn eat_float_exponent(&mut self) -> bool {
    if matches!(self.first(), '-' | '+') {
        self.bump();
    }
    self.eat_decimal_digits()
}
}
/// Creates an iterator that lexes `input` one token at a time.
///
/// The `Eof` token is not yielded; the iterator returns `None` once the
/// cursor produces it.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
    let mut cursor = Cursor::new(input);
    std::iter::from_fn(move || {
        let token = cursor.advance_token();
        (token.kind != TokenKind::Eof).then_some(token)
    })
}
#[cfg(test)]
mod tests {
use std::fmt;
use super::*;
use insta::assert_debug_snapshot;
/// Pairs a token with the slice of the input it covers, so a snapshot shows
/// the lexeme next to its kind.
struct TokenDebug<'a> {
content: &'a str,
token: Token,
}
/// Renders as `"<lexeme>" @ <kind>`.
impl fmt::Debug for TokenDebug<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{:?} @ {:?}", self.content, self.token.kind)
}
}
impl<'a> TokenDebug<'a> {
/// Builds a `TokenDebug` for `token`, which starts at byte offset `start`
/// of `input` and spans `token.len` bytes.
fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
TokenDebug {
token,
content: &input[start as usize..(start + token.len) as usize],
}
}
}
/// Tokenizes `input` and attaches to every token the text it covers,
/// tracking the running start offset from the token lengths.
fn lex(input: &str) -> Vec<TokenDebug<'_>> {
let mut tokens = vec![];
let mut start = 0;
for token in tokenize(input) {
let length = token.len;
tokens.push(TokenDebug::new(token, input, start));
start += length;
}
tokens
}
// A minimal statement: keyword, whitespace, integer, semicolon.
#[test]
fn lex_statement() {
let result = lex("select 1;");
assert_debug_snapshot!(result);
}
// A properly closed multi-line block comment.
#[test]
fn block_comment() {
let result = lex(r#"
/*
* foo
* bar
*/"#);
assert_debug_snapshot!(result);
}
// A nested `/*` is opened but only one `*/` follows, so the outer comment
// is left open.
#[test]
fn block_comment_unterminated() {
let result = lex(r#"
/*
* foo
* bar
/*
*/"#);
assert_debug_snapshot!(result);
}
// A `--` comment on its own line.
#[test]
fn line_comment() {
let result = lex(r#"
-- foooooooooooo bar buzz
"#);
assert_debug_snapshot!(result);
}
// A `--` comment between two string literals on adjacent lines.
#[test]
fn line_comment_whitespace() {
assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
}
// Dollar-quoted strings with an empty tag and with a named tag.
#[test]
fn dollar_quoting() {
assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$
-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
}
// A tagged dollar-quoted string spanning several lines.
#[test]
fn dollar_strings_part2() {
assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
}
// Opening and closing tags differ, so the string is never closed.
#[test]
fn dollar_quote_mismatch_tags_simple() {
assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
}
// Mismatched tags with an additional `$` inside the body.
#[test]
fn dollar_quote_mismatch_tags_complex() {
assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
}
// Decimal integers and floats, with and without fraction and exponent.
#[test]
fn numeric() {
assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
}
// Binary, octal and hexadecimal integers with both prefix cases.
#[test]
fn numeric_non_decimal() {
assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
}
// Numbers using `_` as a digit separator.
// NOTE(review): "seperators" is misspelled; the name is left as is here
// because insta appears to key stored snapshots by test name -- confirm
// before renaming.
#[test]
fn numeric_with_seperators() {
assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
}
// A `.` between identifiers must lex as a dot, not start a number.
#[test]
fn select_with_period() {
assert_debug_snapshot!(lex(r#"
select public.users;
"#))
}
// `B'...'` and `X'...'` prefixed strings in both letter cases.
#[test]
fn bitstring() {
assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
}
// Single-quoted strings: escaped quotes, multi-line bodies, and a final
// string that is never closed.
#[test]
fn string() {
assert_debug_snapshot!(lex(r#"
'Dianne''s horse'
select 'foo ''
bar';
select 'foooo'
'bar';
'foo \\ \n \tbar'
'forgot to close the string
"#))
}
// Positional parameters, including a bare `$` with no digits.
#[test]
fn params() {
assert_debug_snapshot!(lex(r#"
select $1 + $2;
select $1123123123123;
select $;
"#))
}
// `E'...'` strings containing backslash escape sequences.
#[test]
fn string_with_escapes() {
assert_debug_snapshot!(lex(r#"
E'foo'
e'bar'
e'\b\f\n\r\t'
e'\0\11\777'
e'\x0\x11\xFF'
e'\uAAAA \UFFFFFFFF'
"#))
}
// `U&` prefixed quoted identifiers and strings.
#[test]
fn string_unicode_escape() {
assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"
U&"\0441\043B\043E\043D"
u&'\0441\043B'
U&"d!0061t!+000061" UESCAPE '!'
"#))
}
// Double-quoted identifiers, the second of which is never closed.
#[test]
fn quoted_ident() {
assert_debug_snapshot!(lex(r#"
"hello &1 -world";
"hello-world
"#))
}
// A doubled `""` inside a quoted identifier is an escaped quote.
#[test]
fn quoted_ident_with_escape_quote() {
assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
}
// An empty dollar-quoted string is a single terminated literal.
#[test]
fn dollar_quoted_string() {
assert_debug_snapshot!(lex("$$$$"), @r#"
[
"$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
]
"#);
}
}