use crate::error::{Error, Result};
use alloc::string::String;
use alloc::vec::Vec;
/// A lexical token produced by [`tokenize`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
/// Bare (unquoted) word. Keywords and plain identifiers both end up here; the
/// tokenizer does not distinguish them.
Word(String),
/// Quoted identifier written as `"name"`, `` `name` `` or `[name]`, with the
/// delimiters removed and doubled quotes collapsed.
Ident(String),
/// Integer literal: a decimal literal that fits `i64`, or a `0x` hexadecimal
/// literal read as `u64` and reinterpreted as `i64`.
Integer(i64),
/// Literal with a fraction and/or exponent, or a decimal integer literal that
/// does not fit `i64`.
Float(f64),
/// Single-quoted string literal with `''` collapsed to `'`.
Str(String),
/// Decoded bytes of an `x'...'` / `X'...'` hexadecimal blob literal.
Blob(Vec<u8>),
/// Bind parameter placeholder.
Param(Param),
/// `(`
LParen,
/// `)`
RParen,
/// `,`
Comma,
/// `;`
Semicolon,
/// `.`
Dot,
/// `*`
Star,
/// `+`
Plus,
/// `-`
Minus,
/// `/`
Slash,
/// `%`
Percent,
/// `=` or `==`
Eq,
/// `<>` or `!=`
NotEq,
/// `<`
Lt,
/// `<=`
LtEq,
/// `>`
Gt,
/// `>=`
GtEq,
/// `||`
Concat,
/// `&`
BitAnd,
/// `|`
BitOr,
/// `~`
BitNot,
/// `<<`
LShift,
/// `>>`
RShift,
}
/// The flavour of a bind parameter placeholder.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Param {
/// A bare `?`.
Anonymous,
/// `?` immediately followed by decimal digits, e.g. `?12`.
Numbered(u32),
/// `:name`, `@name` or `$name`. The stored string keeps the leading sigil.
Named(String),
}
/// A token together with the byte range of the input it was lexed from.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned {
/// The token itself.
pub token: Token,
/// Byte offset of the token's first byte in the input.
pub start: usize,
/// Byte offset just past the token's last byte (exclusive).
pub end: usize,
}
/// Splits `sql` into tokens, each tagged with its byte span.
///
/// Whitespace and comments are skipped and produce no tokens.
///
/// # Errors
///
/// Returns the first lexing error encountered; no partial token list is returned.
pub fn tokenize(sql: &str) -> Result<Vec<Spanned>> {
    let lexer = Tokenizer::new(sql);
    lexer.run()
}
/// Cursor over the input used while lexing.
struct Tokenizer<'a> {
// The input text; token text is sliced out of this.
src: &'a str,
// The same input viewed as bytes, used for byte-wise scanning.
bytes: &'a [u8],
// Byte offset of the next unread byte.
pos: usize,
}
impl<'a> Tokenizer<'a> {
fn new(src: &'a str) -> Tokenizer<'a> {
Tokenizer {
src,
bytes: src.as_bytes(),
pos: 0,
}
}
/// Consumes the tokenizer and lexes the whole input.
///
/// Trivia (whitespace and comments) is skipped before every token and once more
/// before the end of input is detected. Stops at the first error.
fn run(mut self) -> Result<Vec<Spanned>> {
    let mut tokens = Vec::new();
    self.skip_trivia()?;
    while let Some(first) = self.peek() {
        let start = self.pos;
        let token = self.next_token(first)?;
        let end = self.pos;
        tokens.push(Spanned { token, start, end });
        self.skip_trivia()?;
    }
    Ok(tokens)
}
/// Returns the byte at the current position, or `None` at end of input.
fn peek(&self) -> Option<u8> {
    self.peek_at(0)
}
/// Returns the byte `ahead` positions past the current one, or `None` if that is
/// beyond the end of input. Does not move the position.
fn peek_at(&self, ahead: usize) -> Option<u8> {
    let index = self.pos + ahead;
    self.bytes.get(index).copied()
}
fn err(&self, msg: &str) -> Error {
Error::Parse(alloc::format!("{msg} at byte {}", self.pos))
}
/// Advances past any run of ASCII whitespace, `--` line comments and `/* */` block
/// comments, stopping at the first byte that belongs to a token (or at end of input).
///
/// # Errors
///
/// Returns a parse error if a block comment is opened but never closed.
fn skip_trivia(&mut self) -> Result<()> {
    let bytes = self.bytes;
    while let Some(current) = self.peek() {
        let following = self.peek_at(1);
        if current.is_ascii_whitespace() {
            self.pos += 1;
        } else if current == b'-' && following == Some(b'-') {
            // Line comment: runs through the newline, or to end of input.
            let rest = &bytes[self.pos..];
            self.pos += match rest.iter().position(|&b| b == b'\n') {
                Some(newline) => newline + 1,
                None => rest.len(),
            };
        } else if current == b'/' && following == Some(b'*') {
            // Block comment: search for the terminator after the opening `/*`,
            // so `/*/` is not mistaken for a complete comment.
            self.pos += 2;
            let rest = &bytes[self.pos..];
            match rest.windows(2).position(|pair| pair == b"*/") {
                Some(close) => self.pos += close + 2,
                None => {
                    self.pos = bytes.len();
                    return Err(self.err("unterminated block comment"));
                }
            }
        } else {
            break;
        }
    }
    Ok(())
}
/// Lexes the single token that starts with byte `c` at the current position.
///
/// `c` is expected to be the byte at `self.pos` (the caller obtains it via `peek`).
/// Operators are handled inline; literals, identifiers and parameters are delegated
/// to their dedicated helpers. On success the position is just past the token.
///
/// # Errors
///
/// Returns a parse error for a `!` not followed by `=`, for a byte that cannot start
/// any token, and for anything the delegated helpers reject.
fn next_token(&mut self, c: u8) -> Result<Token> {
    match c {
        b'(' => self.single(Token::LParen),
        b')' => self.single(Token::RParen),
        b',' => self.single(Token::Comma),
        b';' => self.single(Token::Semicolon),
        b'+' => self.single(Token::Plus),
        // `--` comments were already consumed as trivia, so this is a plain minus.
        b'-' => self.single(Token::Minus),
        b'%' => self.single(Token::Percent),
        b'~' => self.single(Token::BitNot),
        b'*' => self.single(Token::Star),
        // `/*` comments were already consumed as trivia, so this is a plain slash.
        b'/' => self.single(Token::Slash),
        b'=' => {
            // `=` and `==` are the same operator.
            self.pos += 1;
            if self.peek() == Some(b'=') {
                self.pos += 1;
            }
            Ok(Token::Eq)
        }
        b'<' => {
            self.pos += 1;
            match self.peek() {
                Some(b'=') => self.single(Token::LtEq),
                Some(b'>') => self.single(Token::NotEq),
                Some(b'<') => self.single(Token::LShift),
                _ => Ok(Token::Lt),
            }
        }
        b'>' => {
            self.pos += 1;
            match self.peek() {
                Some(b'=') => self.single(Token::GtEq),
                Some(b'>') => self.single(Token::RShift),
                _ => Ok(Token::Gt),
            }
        }
        b'!' => {
            // Look ahead before consuming anything so that the error offset names
            // the `!` itself, like the "unexpected character" error below does.
            if self.peek_at(1) == Some(b'=') {
                self.pos += 2;
                Ok(Token::NotEq)
            } else {
                Err(self.err("unexpected '!' (did you mean '!=' ?)"))
            }
        }
        b'|' => {
            self.pos += 1;
            if self.peek() == Some(b'|') {
                self.single(Token::Concat)
            } else {
                Ok(Token::BitOr)
            }
        }
        b'&' => self.single(Token::BitAnd),
        b'.' => {
            // `.5` is a number; any other `.` is the member-access dot.
            if matches!(self.peek_at(1), Some(d) if d.is_ascii_digit()) {
                self.number()
            } else {
                self.single(Token::Dot)
            }
        }
        b'\'' => self.string_literal(),
        b'"' => self.quoted_ident(b'"'),
        b'[' => self.bracket_ident(),
        b'`' => self.quoted_ident(b'`'),
        b'?' | b':' | b'@' | b'$' => self.parameter(c),
        // Must come before the identifier arm: `x'..'` is a blob, not a word.
        b'x' | b'X' if self.peek_at(1) == Some(b'\'') => self.blob_literal(),
        d if d.is_ascii_digit() => self.number(),
        w if is_ident_start(w) => Ok(self.word()),
        other => Err(self.err(&alloc::format!("unexpected character {:?}", other as char))),
    }
}
/// Consumes exactly one byte and yields `t`.
///
/// Used both for one-byte tokens and for the final byte of two-byte operators whose
/// first byte has already been consumed. Never fails; the `Result` only lets callers
/// use it directly as a match-arm value.
fn single(&mut self, t: Token) -> Result<Token> {
    let next = self.pos + 1;
    self.pos = next;
    Ok(t)
}
/// Lexes a bare word: the longest run of identifier bytes starting at the current
/// position (bytes `>= 0x80` count as identifier bytes, so multi-byte UTF-8 sequences
/// are kept whole).
fn word(&mut self) -> Token {
    let bytes = self.bytes;
    let length = bytes[self.pos..]
        .iter()
        .take_while(|&&b| is_ident_continue(b))
        .count();
    let end = self.pos + length;
    let text = String::from(&self.src[self.pos..end]);
    self.pos = end;
    Token::Word(text)
}
/// Lexes an identifier delimited by `quote` on both sides; the current position must
/// be on the opening delimiter. A doubled delimiter inside the identifier stands for
/// one literal delimiter character.
///
/// # Errors
///
/// Returns a parse error (reported at end of input) if the closing delimiter is missing.
fn quoted_ident(&mut self, quote: u8) -> Result<Token> {
    let bytes = self.bytes;
    let src = self.src;
    self.pos += 1; // opening delimiter
    let mut name = String::new();
    loop {
        let Some(offset) = bytes[self.pos..].iter().position(|&b| b == quote) else {
            self.pos = bytes.len();
            return Err(self.err("unterminated quoted identifier"));
        };
        let close = self.pos + offset;
        name.push_str(&src[self.pos..close]);
        self.pos = close + 1;
        if self.peek() != Some(quote) {
            return Ok(Token::Ident(name));
        }
        // Doubled delimiter: keep one copy and continue after the pair.
        name.push(quote as char);
        self.pos += 1;
    }
}
/// Lexes a `[bracketed]` identifier; the current position must be on the `[`.
/// Everything up to the first `]` is taken verbatim (there is no escape for `]`).
///
/// # Errors
///
/// Returns a parse error (reported at end of input) if no `]` follows.
fn bracket_ident(&mut self) -> Result<Token> {
    let bytes = self.bytes;
    let begin = self.pos + 1;
    match bytes[begin..].iter().position(|&b| b == b']') {
        Some(length) => {
            let end = begin + length;
            self.pos = end + 1;
            Ok(Token::Ident(String::from(&self.src[begin..end])))
        }
        None => {
            self.pos = bytes.len();
            Err(self.err("unterminated [identifier]"))
        }
    }
}
/// Lexes a single-quoted string literal; the current position must be on the opening
/// quote. `''` inside the literal stands for one literal `'`.
///
/// # Errors
///
/// Returns a parse error (reported at end of input) if the closing quote is missing.
fn string_literal(&mut self) -> Result<Token> {
    let bytes = self.bytes;
    let src = self.src;
    self.pos += 1; // opening quote
    let mut text = String::new();
    loop {
        let Some(offset) = bytes[self.pos..].iter().position(|&b| b == b'\'') else {
            self.pos = bytes.len();
            return Err(self.err("unterminated string literal"));
        };
        let close = self.pos + offset;
        text.push_str(&src[self.pos..close]);
        self.pos = close + 1;
        if self.peek() != Some(b'\'') {
            return Ok(Token::Str(text));
        }
        // Doubled quote: keep one copy and continue after the pair.
        text.push('\'');
        self.pos += 1;
    }
}
/// Lexes an `x'...'` / `X'...'` blob literal into its decoded bytes.
///
/// The current position must be on the `x`/`X` and the next byte must be the opening
/// quote (the caller checks both). On success the position is just past the closing
/// quote.
///
/// # Errors
///
/// Returns a parse error when the closing quote is missing (reported at end of
/// input), when the number of hex digits is odd (reported at the start of the
/// literal), or when a non-hex byte appears (reported at that byte).
fn blob_literal(&mut self) -> Result<Token> {
    let bytes = self.bytes;
    let literal_start = self.pos;
    let digits_start = literal_start + 2; // skip `x'`
    let Some(length) = bytes[digits_start..].iter().position(|&b| b == b'\'') else {
        self.pos = bytes.len();
        return Err(self.err("unterminated blob literal"));
    };
    let digits = &bytes[digits_start..digits_start + length];
    if !length.is_multiple_of(2) {
        // Point at the literal itself rather than at whatever follows it.
        self.pos = literal_start;
        return Err(self.err("blob literal has odd number of hex digits"));
    }
    let mut decoded = Vec::with_capacity(length / 2);
    for (index, pair) in digits.chunks_exact(2).enumerate() {
        let mut byte = 0u8;
        for (k, &digit) in pair.iter().enumerate() {
            let Some(nibble) = hex_val(digit) else {
                // Point at the offending byte so the message is actionable.
                self.pos = digits_start + 2 * index + k;
                return Err(self.err("invalid hex in blob literal"));
            };
            byte = (byte << 4) | nibble;
        }
        decoded.push(byte);
    }
    self.pos = digits_start + length + 1; // past the closing quote
    Ok(Token::Blob(decoded))
}
fn number(&mut self) -> Result<Token> {
let start = self.pos;
if self.peek() == Some(b'0') && matches!(self.peek_at(1), Some(b'x') | Some(b'X')) {
self.pos += 2;
let hstart = self.pos;
while matches!(self.peek(), Some(c) if c.is_ascii_hexdigit()) {
self.pos += 1;
}
let digits = &self.src[hstart..self.pos];
let v = u64::from_str_radix(digits, 16)
.map_err(|_| self.err("invalid hexadecimal integer"))?;
return Ok(Token::Integer(v as i64));
}
let mut is_float = false;
while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
self.pos += 1;
}
if self.peek() == Some(b'.') {
is_float = true;
self.pos += 1;
while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
self.pos += 1;
}
}
if matches!(self.peek(), Some(b'e') | Some(b'E')) {
is_float = true;
self.pos += 1;
if matches!(self.peek(), Some(b'+') | Some(b'-')) {
self.pos += 1;
}
while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
self.pos += 1;
}
}
let text = &self.src[start..self.pos];
if is_float {
text.parse::<f64>()
.map(Token::Float)
.map_err(|_| self.err("invalid floating-point literal"))
} else {
match text.parse::<i64>() {
Ok(i) => Ok(Token::Integer(i)),
Err(_) => text
.parse::<f64>()
.map(Token::Float)
.map_err(|_| self.err("invalid integer literal")),
}
}
}
/// Lexes a bind parameter; the current position must be on `sigil`.
///
/// * `?` alone yields `Param::Anonymous`.
/// * `?` followed by digits yields `Param::Numbered`.
/// * `:`, `@` or `$` followed by identifier bytes yields `Param::Named`, with the
///   sigil kept as the first character of the name.
///
/// # Errors
///
/// Returns a parse error when a numbered parameter does not fit `u32` or is `0`
/// (`?NNN` numbering starts at 1, so `?0` can never be bound), and when a named
/// sigil is not followed by a name.
fn parameter(&mut self, sigil: u8) -> Result<Token> {
    self.pos += 1; // the sigil
    let start = self.pos;
    if sigil == b'?' {
        while matches!(self.peek(), Some(c) if c.is_ascii_digit()) {
            self.pos += 1;
        }
        if self.pos == start {
            return Ok(Token::Param(Param::Anonymous));
        }
        let number = self.src[start..self.pos]
            .parse::<u32>()
            .ok()
            .filter(|&n| n != 0)
            .ok_or_else(|| self.err("invalid parameter number"))?;
        return Ok(Token::Param(Param::Numbered(number)));
    }
    while matches!(self.peek(), Some(c) if is_ident_continue(c)) {
        self.pos += 1;
    }
    if self.pos == start {
        return Err(self.err("named parameter requires a name"));
    }
    let mut name = String::new();
    name.push(sigil as char);
    name.push_str(&self.src[start..self.pos]);
    Ok(Token::Param(Param::Named(name)))
}
}
/// Returns `true` if `c` may begin a bare word: an ASCII letter, `_`, or any
/// non-ASCII byte (`>= 0x80`).
fn is_ident_start(c: u8) -> bool {
    matches!(c, b'A'..=b'Z' | b'a'..=b'z' | b'_' | 0x80..=0xFF)
}
/// Returns `true` if `c` may appear inside a bare word: an ASCII letter or digit,
/// `_`, or any non-ASCII byte (`>= 0x80`).
fn is_ident_continue(c: u8) -> bool {
    matches!(c, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | 0x80..=0xFF)
}
/// Returns the value (0..=15) of the ASCII hex digit `c`, accepting both letter
/// cases, or `None` if `c` is not a hex digit.
fn hex_val(c: u8) -> Option<u8> {
    // Each range maps to its value by subtracting a fixed base.
    let base = match c {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(c - base)
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Lexes `sql` and drops the spans; panics if the input does not tokenize.
    fn toks(sql: &str) -> Vec<Token> {
        let spanned = tokenize(sql).unwrap();
        spanned.into_iter().map(|item| item.token).collect()
    }

    /// Shorthand for a `Token::Word`.
    fn word(text: &str) -> Token {
        Token::Word(String::from(text))
    }

    /// Shorthand for a `Token::Ident`.
    fn ident(text: &str) -> Token {
        Token::Ident(String::from(text))
    }

    /// Shorthand for a `Token::Str`.
    fn string(text: &str) -> Token {
        Token::Str(String::from(text))
    }

    /// Shorthand for a named parameter token (name includes the sigil).
    fn named(text: &str) -> Token {
        Token::Param(Param::Named(String::from(text)))
    }

    #[test]
    fn keywords_and_identifiers_are_words() {
        let expected = vec![word("SELECT"), word("a"), word("FROM"), word("t")];
        assert_eq!(toks("SELECT a FROM t"), expected);
    }

    #[test]
    fn operators() {
        let expected = vec![
            word("a"),
            Token::GtEq,
            Token::Integer(1),
            word("AND"),
            word("b"),
            Token::NotEq,
            Token::Integer(2),
            word("OR"),
            word("c"),
            Token::Concat,
            word("d"),
        ];
        assert_eq!(toks("a >= 1 AND b <> 2 OR c || d"), expected);
    }

    #[test]
    fn numbers() {
        assert_eq!(toks("42"), vec![Token::Integer(42)]);
        assert_eq!(toks("2.75"), vec![Token::Float(2.75)]);
        assert_eq!(toks(".5"), vec![Token::Float(0.5)]);
        assert_eq!(toks("1e3"), vec![Token::Float(1000.0)]);
        assert_eq!(toks("0xff"), vec![Token::Integer(255)]);
    }

    #[test]
    fn strings_and_blobs() {
        assert_eq!(toks("'hi'"), vec![string("hi")]);
        assert_eq!(toks("'it''s'"), vec![string("it's")]);
        assert_eq!(toks("x'01ff'"), vec![Token::Blob(vec![1, 255])]);
    }

    #[test]
    fn quoted_identifiers() {
        assert_eq!(toks("\"select\""), vec![ident("select")]);
        assert_eq!(toks("[a b]"), vec![ident("a b")]);
        assert_eq!(toks("`x`"), vec![ident("x")]);
        assert_eq!(toks("\"a\"\"b\""), vec![ident("a\"b")]);
    }

    #[test]
    fn parameters() {
        assert_eq!(toks("?"), vec![Token::Param(Param::Anonymous)]);
        assert_eq!(toks("?12"), vec![Token::Param(Param::Numbered(12))]);
        assert_eq!(toks(":name"), vec![named(":name")]);
        assert_eq!(toks("$x"), vec![named("$x")]);
    }

    #[test]
    fn comments_are_skipped() {
        let expected = vec![
            word("SELECT"),
            Token::Integer(1),
            Token::Plus,
            Token::Integer(2),
        ];
        assert_eq!(toks("SELECT -- a comment\n 1 /* block */ + 2"), expected);
    }

    #[test]
    fn unterminated_string_errors() {
        for sql in ["'oops", "/* nope"] {
            assert!(tokenize(sql).is_err());
        }
    }

    #[test]
    fn utf8_identifier_preserved() {
        assert_eq!(toks("café"), vec![word("café")]);
    }
}