use reifydb_type::error::{AstErrorKind, Error, TypeError};
pub mod cursor;
pub mod identifier;
pub mod keyword;
pub mod literal;
pub mod operator;
pub mod separator;
#[allow(clippy::module_inception)]
pub mod token;
pub mod variable;
use cursor::Cursor;
use identifier::{is_identifier_char, is_identifier_start};
use reifydb_type::fragment::Fragment;
use token::{Token, TokenKind};
use variable::scan_variable;
use crate::{
Result,
bump::{Bump, BumpVec},
token::{
identifier::{scan_digit_starting_identifier, scan_identifier, scan_quoted_identifier},
keyword::scan_keyword,
literal::scan_literal,
operator::scan_operator,
separator::scan_separator,
},
};
const SYSTEM_COLUMNS: &[&str] = &["rownum", "created_at", "updated_at"];
fn scan_system_column<'b>(cursor: &mut Cursor<'b>) -> Option<Token<'b>> {
if cursor.peek() != Some('#') {
return None;
}
let state = cursor.save_state();
let start_pos = cursor.pos();
let start_line = cursor.line();
let start_column = cursor.column();
cursor.consume();
if let Some(ch) = cursor.peek()
&& is_identifier_start(ch)
{
cursor.consume_while(is_identifier_char);
let fragment = cursor.make_fragment(start_pos, start_line, start_column);
let name = &fragment.text()[1..];
if SYSTEM_COLUMNS.contains(&name) {
return Some(Token {
kind: TokenKind::SystemColumn,
fragment,
});
}
}
cursor.restore_state(state);
None
}
/// Tokenizes `input` into a bump-allocated vector of tokens.
///
/// Dispatch is by first character: each arm tries one or more specialized
/// scanners in a deliberate fallback order. Scanners are expected to leave
/// the cursor untouched when they return `None`, so the next fallback can
/// retry from the same position.
///
/// # Errors
/// Returns a `TokenizeError` when no scanner can consume the character at
/// the current position.
pub fn tokenize<'b>(bump: &'b Bump, input: &'b str) -> Result<BumpVec<'b, Token<'b>>> {
    let mut cursor = Cursor::new(input);
    // Heuristic: roughly one token per 6 input bytes, clamped to avoid both
    // repeated growth for tiny inputs and oversized reservations for huge ones.
    let estimated_tokens = (input.len() / 6).clamp(8, 2048);
    let mut tokens = BumpVec::with_capacity_in(estimated_tokens, bump);
    while !cursor.is_eof() {
        cursor.skip_whitespace();
        if cursor.is_eof() {
            break;
        }
        let token = match cursor.peek() {
            Some(ch) => match ch {
                // `$1`, `$user_id`, ... positional / named variables.
                '$' => scan_variable(&mut cursor),
                // `#rownum` etc. first; otherwise fall back to the literal
                // scanner. NOTE(review): tests show `# comment` sequences
                // produce no tokens at all, so `#` comments are presumably
                // consumed by skip_whitespace or scan_literal — confirm.
                '#' => scan_system_column(&mut cursor).or_else(|| scan_literal(&mut cursor)),
                // `` `quoted identifier` ``.
                '`' => scan_quoted_identifier(&mut cursor),
                // Quoted text literals.
                '\'' | '"' => scan_literal(&mut cursor),
                '0'..='9' => {
                    // Prefer a number, but a digit may also start an
                    // identifier (e.g. `1abc`): if letters directly follow
                    // the scanned number, retry from the start as a
                    // digit-starting identifier, keeping the number token
                    // as the fallback.
                    let state = cursor.save_state();
                    match scan_literal(&mut cursor) {
                        Some(tok) => {
                            if cursor.peek().is_some_and(|c| c.is_ascii_alphabetic()) {
                                // Remember where the number ended so we can
                                // return to it if the identifier retry fails.
                                let num_state = cursor.save_state();
                                cursor.restore_state(state);
                                scan_digit_starting_identifier(&mut cursor).or_else(
                                    || {
                                        cursor.restore_state(num_state);
                                        Some(tok)
                                    },
                                )
                            } else {
                                Some(tok)
                            }
                        }
                        None => {
                            // Not a valid number at all — retry as a
                            // digit-starting identifier.
                            cursor.restore_state(state);
                            scan_digit_starting_identifier(&mut cursor)
                        }
                    }
                }
                '.' => {
                    // `.5` is a number literal; a lone `.` is an operator.
                    // The lookahead picks which scanner gets first try.
                    if cursor.peek_ahead(1).is_some_and(|ch| ch.is_ascii_digit()) {
                        scan_literal(&mut cursor).or_else(|| scan_operator(&mut cursor))
                    } else {
                        scan_operator(&mut cursor).or_else(|| scan_literal(&mut cursor))
                    }
                }
                // Unambiguous single/multi-char operator starters.
                '(' | ')' | '[' | ']' | '{' | '}' | '+' | '*' | '/' | '^' | '%' | '?' => {
                    scan_operator(&mut cursor)
                }
                '<' | '>' | ':' | '&' | '|' | '=' | '!' => scan_operator(&mut cursor),
                // `-` is an operator unless it begins a negative literal.
                '-' => scan_operator(&mut cursor).or_else(|| scan_literal(&mut cursor)),
                ',' | ';' => scan_separator(&mut cursor),
                // Word-shaped input: keyword wins over literal (true/false)
                // over word operator (AND/OR) over plain identifier.
                'a'..='z' | 'A'..='Z' | '_' => {
                    scan_keyword(&mut cursor)
                        .or_else(|| scan_literal(&mut cursor))
                        .or_else(|| scan_operator(&mut cursor))
                        .or_else(|| scan_identifier(&mut cursor))
                }
                // Anything else (e.g. non-ASCII): try every scanner in turn.
                _ => scan_literal(&mut cursor)
                    .or_else(|| scan_operator(&mut cursor))
                    .or_else(|| scan_variable(&mut cursor))
                    .or_else(|| scan_identifier(&mut cursor))
                    .or_else(|| scan_separator(&mut cursor)),
            },
            // peek() is None only at EOF, which the loop guard excludes;
            // kept defensively and reported as an error below.
            None => None,
        };
        match token {
            Some(tok) => tokens.push(tok),
            None => {
                // No scanner accepted the character — report it with the
                // current line/column for diagnostics.
                let ch = cursor.peek().unwrap_or('?');
                let message = format!(
                    "Unexpected character '{}' at line {}, column {}",
                    ch,
                    cursor.line(),
                    cursor.column()
                );
                return Err(Error::from(TypeError::Ast {
                    kind: AstErrorKind::TokenizeError {
                        message: message.clone(),
                    },
                    message,
                    fragment: Fragment::None,
                }));
            }
        }
    }
    Ok(tokens)
}
#[cfg(test)]
pub mod tests {
    //! Tokenizer smoke tests: dispatch, fallback ordering, comments, and
    //! literal/keyword/operator classification.
    use super::{
        keyword::Keyword,
        operator::Operator,
        separator::Separator,
        token::{Literal, TokenKind},
        tokenize,
    };
    use crate::bump::Bump;

    #[test]
    fn test_tokenize_simple() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "MAP * FROM users").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
        assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
        assert_eq!(tokens[3].kind, TokenKind::Identifier);
    }

    #[test]
    fn test_tokenize_with_whitespace() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "  MAP   *  FROM users  ").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
        assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
        assert_eq!(tokens[3].kind, TokenKind::Identifier);
    }

    #[test]
    fn test_tokenize_numbers() {
        // Decimal, float, hex, binary, and octal forms.
        let bump = Bump::new();
        let tokens = tokenize(&bump, "42 3.14 0x2A 0b1010 0o777").unwrap();
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::Number));
        assert_eq!(tokens[0].value(), "42");
        assert_eq!(tokens[1].kind, TokenKind::Literal(Literal::Number));
        assert_eq!(tokens[1].value(), "3.14");
        assert_eq!(tokens[2].kind, TokenKind::Literal(Literal::Number));
        assert_eq!(tokens[2].value(), "0x2A");
        assert_eq!(tokens[3].kind, TokenKind::Literal(Literal::Number));
        assert_eq!(tokens[3].value(), "0b1010");
        assert_eq!(tokens[4].kind, TokenKind::Literal(Literal::Number));
        assert_eq!(tokens[4].value(), "0o777");
    }

    #[test]
    fn test_tokenize_strings() {
        // Both quote styles; value() excludes the quotes.
        let bump = Bump::new();
        let tokens = tokenize(&bump, "'hello' \"world\"").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::Text));
        assert_eq!(tokens[0].value(), "hello");
        assert_eq!(tokens[1].kind, TokenKind::Literal(Literal::Text));
        assert_eq!(tokens[1].value(), "world");
    }

    #[test]
    fn test_tokenize_variables() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "$1 + $user_id").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].kind, TokenKind::Variable);
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Plus));
        assert_eq!(tokens[2].kind, TokenKind::Variable);
    }

    #[test]
    fn test_tokenize_operators() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "a >= b && c != d").unwrap();
        assert_eq!(tokens.len(), 7);
        assert_eq!(tokens[0].kind, TokenKind::Identifier);
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::RightAngleEqual));
        assert_eq!(tokens[2].kind, TokenKind::Identifier);
        assert_eq!(tokens[3].kind, TokenKind::Operator(Operator::DoubleAmpersand));
        assert_eq!(tokens[4].kind, TokenKind::Identifier);
        assert_eq!(tokens[5].kind, TokenKind::Operator(Operator::BangEqual));
        assert_eq!(tokens[6].kind, TokenKind::Identifier);
    }

    #[test]
    fn test_tokenize_keywords_case_insensitive() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "map Map MAP").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::Map));
    }

    #[test]
    fn test_tokenize_complete_query() {
        // Note: WHERE, the filter column, and AND's operands tokenize as
        // identifiers; AND itself is a word operator.
        let bump = Bump::new();
        let query = "MAP name, age FROM users WHERE age > 18 AND status = 'active'";
        let tokens = tokenize(&bump, query).unwrap();
        assert_eq!(tokens.len(), 14);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Identifier);
        assert_eq!(tokens[2].kind, TokenKind::Separator(Separator::Comma));
        assert_eq!(tokens[3].kind, TokenKind::Identifier);
        assert_eq!(tokens[4].kind, TokenKind::Keyword(Keyword::From));
        assert_eq!(tokens[5].kind, TokenKind::Identifier);
        assert_eq!(tokens[6].kind, TokenKind::Identifier);
        assert_eq!(tokens[7].kind, TokenKind::Identifier);
        assert_eq!(tokens[8].kind, TokenKind::Operator(Operator::RightAngle));
        assert_eq!(tokens[9].kind, TokenKind::Literal(Literal::Number));
        assert_eq!(tokens[10].kind, TokenKind::Operator(Operator::And));
        assert_eq!(tokens[11].kind, TokenKind::Identifier);
        assert_eq!(tokens[12].kind, TokenKind::Operator(Operator::Equal));
        assert_eq!(tokens[13].kind, TokenKind::Literal(Literal::Text));
        assert_eq!(tokens[13].value(), "active");
    }

    #[test]
    fn test_tokenize_desc_keyword() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "DESC").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Desc));
    }

    #[test]
    fn test_tokenize_single_char_identifier() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "a").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Identifier);
        assert_eq!(tokens[0].value(), "a");
    }

    #[test]
    fn test_tokenize_boolean_literals() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "true false TRUE FALSE").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::True));
        assert_eq!(tokens[1].kind, TokenKind::Literal(Literal::False));
        assert_eq!(tokens[2].kind, TokenKind::Literal(Literal::True));
        assert_eq!(tokens[3].kind, TokenKind::Literal(Literal::False));
    }

    #[test]
    fn test_tokenize_inline_comment() {
        // A trailing `# comment` produces no tokens.
        let bump = Bump::new();
        let tokens = tokenize(&bump, "MAP * FROM users # comment").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
        assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
        assert_eq!(tokens[3].kind, TokenKind::Identifier);
    }

    #[test]
    fn test_tokenize_comment_only() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "# just a comment").unwrap();
        assert_eq!(tokens.len(), 0);
    }

    #[test]
    fn test_tokenize_hash_in_string_literal() {
        // `#` inside a quoted string must NOT start a comment.
        let bump = Bump::new();
        let tokens = tokenize(&bump, "'hello # world'").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::Text));
        assert_eq!(tokens[0].value(), "hello # world");
    }

    #[test]
    fn test_tokenize_comment_between_lines() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "MAP *\n# comment\nFROM users").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
        assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
        assert_eq!(tokens[3].kind, TokenKind::Identifier);
    }

    #[test]
    fn test_tokenize_empty_comment() {
        let bump = Bump::new();
        let tokens = tokenize(&bump, "#\nMAP *").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
        assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
    }
}