//! reifydb-rql 0.4.12
//!
//! ReifyDB Query Language (RQL) parser and AST — tokenizer module.
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2025 ReifyDB

use reifydb_type::error::{AstErrorKind, Error, TypeError};

pub mod cursor;
pub mod identifier;
pub mod keyword;
pub mod literal;
pub mod operator;
pub mod separator;
#[allow(clippy::module_inception)]
pub mod token;
pub mod variable;

use cursor::Cursor;
use identifier::{is_identifier_char, is_identifier_start};
use reifydb_type::fragment::Fragment;
use token::{Token, TokenKind};
use variable::scan_variable;

use crate::{
	Result,
	bump::{Bump, BumpVec},
	token::{
		identifier::{scan_digit_starting_identifier, scan_identifier, scan_quoted_identifier},
		keyword::scan_keyword,
		literal::scan_literal,
		operator::scan_operator,
		separator::scan_separator,
	},
};

/// Column names (without the leading `#`) that are recognized as system
/// columns by `scan_system_column`. Any other `#`-prefixed word is left
/// for the fallback scanners.
const SYSTEM_COLUMNS: &[&str] = &["rownum", "created_at", "updated_at"];
fn scan_system_column<'b>(cursor: &mut Cursor<'b>) -> Option<Token<'b>> {
	if cursor.peek() != Some('#') {
		return None;
	}

	let state = cursor.save_state();
	let start_pos = cursor.pos();
	let start_line = cursor.line();
	let start_column = cursor.column();

	cursor.consume();

	if let Some(ch) = cursor.peek()
		&& is_identifier_start(ch)
	{
		cursor.consume_while(is_identifier_char);
		let fragment = cursor.make_fragment(start_pos, start_line, start_column);
		let name = &fragment.text()[1..];
		if SYSTEM_COLUMNS.contains(&name) {
			return Some(Token {
				kind: TokenKind::SystemColumn,
				fragment,
			});
		}
	}

	cursor.restore_state(state);
	None
}

/// Tokenize the input string into a vector of tokens.
///
/// The input lifetime is tied to the bump lifetime, enabling zero-copy
/// fragments. Tokens are produced by character-based dispatch to the
/// specialized scanners; the order of `or_else` fallbacks below is
/// significant and must not be reordered. Returns a `TokenizeError`
/// when no scanner accepts the character at the current position.
pub fn tokenize<'b>(bump: &'b Bump, input: &'b str) -> Result<BumpVec<'b, Token<'b>>> {
	let mut cursor = Cursor::new(input);
	// Estimate token count: rough heuristic of 1 token per 6 characters
	// with minimum of 8 and maximum reasonable limit
	let estimated_tokens = (input.len() / 6).clamp(8, 2048);
	let mut tokens = BumpVec::with_capacity_in(estimated_tokens, bump);

	while !cursor.is_eof() {
		// Skip whitespace at the beginning of each token
		cursor.skip_whitespace();

		if cursor.is_eof() {
			break;
		}

		// Character-based dispatch for better performance
		let token = match cursor.peek() {
			Some(ch) => match ch {
				// Variables start with $
				'$' => scan_variable(&mut cursor),

				// System columns like #rownum; fall back to the
				// literal scanner when the name isn't recognized
				'#' => scan_system_column(&mut cursor).or_else(|| scan_literal(&mut cursor)),

				// Backtick-quoted identifiers
				'`' => scan_quoted_identifier(&mut cursor),

				// String literals
				'\'' | '"' => scan_literal(&mut cursor),

				// Numbers or digit-starting identifiers (e.g., 10min, 5sec)
				'0'..='9' => {
					let state = cursor.save_state();
					match scan_literal(&mut cursor) {
						Some(tok) => {
							// If the number is immediately followed by an alpha char,
							// it's likely a digit-starting identifier like "10min"
							if cursor.peek().is_some_and(|c| c.is_ascii_alphabetic()) {
								let num_state = cursor.save_state();
								cursor.restore_state(state);
								scan_digit_starting_identifier(&mut cursor).or_else(
									|| {
										// Fallback: accept the number (e.g.,
										// 3.14px)
										cursor.restore_state(num_state);
										Some(tok)
									},
								)
							} else {
								Some(tok)
							}
						}
						None => {
							// Not a valid number literal; retry as a
							// digit-starting identifier from the start
							cursor.restore_state(state);
							scan_digit_starting_identifier(&mut cursor)
						}
					}
				}

				// Dot could be operator or start of decimal
				// literal
				'.' => {
					// Check if followed by digit - if so,
					// try literal first
					if cursor.peek_ahead(1).is_some_and(|ch| ch.is_ascii_digit()) {
						scan_literal(&mut cursor).or_else(|| scan_operator(&mut cursor))
					} else {
						scan_operator(&mut cursor).or_else(|| scan_literal(&mut cursor))
					}
				}

				// Pure punctuation operators
				'(' | ')' | '[' | ']' | '{' | '}' | '+' | '*' | '/' | '^' | '%' | '?' => {
					scan_operator(&mut cursor)
				}

				// Multi-char operators starting with these
				// chars - try operator first
				'<' | '>' | ':' | '&' | '|' | '=' | '!' => scan_operator(&mut cursor),

				// Minus could be operator or negative number
				'-' => scan_operator(&mut cursor).or_else(|| scan_literal(&mut cursor)),

				// Separators
				',' | ';' => scan_separator(&mut cursor),

				// Letters could be keywords, literals
				// (true/false/none), word operators, or
				// identifiers
				'a'..='z' | 'A'..='Z' | '_' => {
					// Try in order: keyword, literal,
					// operator, identifier
					scan_keyword(&mut cursor)
						.or_else(|| scan_literal(&mut cursor))
						.or_else(|| scan_operator(&mut cursor))
						.or_else(|| scan_identifier(&mut cursor))
				}

				// Everything else - try all scanners in order
				_ => scan_literal(&mut cursor)
					.or_else(|| scan_operator(&mut cursor))
					.or_else(|| scan_variable(&mut cursor))
					.or_else(|| scan_identifier(&mut cursor))
					.or_else(|| scan_separator(&mut cursor)),
			},
			// peek() returning None here should be impossible after the
			// is_eof check above; kept as a defensive fall-through to
			// the error branch below
			None => None,
		};

		match token {
			Some(tok) => tokens.push(tok),
			None => {
				// Unable to tokenize - report error with
				// current character
				let ch = cursor.peek().unwrap_or('?');
				let message = format!(
					"Unexpected character '{}' at line {}, column {}",
					ch,
					cursor.line(),
					cursor.column()
				);
				return Err(Error::from(TypeError::Ast {
					kind: AstErrorKind::TokenizeError {
						message: message.clone(),
					},
					message,
					fragment: Fragment::None,
				}));
			}
		}
	}

	Ok(tokens)
}

#[cfg(test)]
pub mod tests {
	//! Tokenizer unit tests: keyword, literal, operator, variable and
	//! identifier recognition, whitespace handling, and `#` comment
	//! skipping.

	use super::{
		keyword::Keyword,
		operator::Operator,
		separator::Separator,
		token::{Literal, TokenKind},
		tokenize,
	};
	use crate::bump::Bump;

	#[test]
	fn test_tokenize_simple() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "MAP * FROM users").unwrap();
		assert_eq!(tokens.len(), 4);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
		assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
		assert_eq!(tokens[3].kind, TokenKind::Identifier);
	}

	#[test]
	fn test_tokenize_with_whitespace() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "   MAP   *   FROM   users   ").unwrap();
		assert_eq!(tokens.len(), 4);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
		assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
		assert_eq!(tokens[3].kind, TokenKind::Identifier);
	}

	#[test]
	fn test_tokenize_numbers() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "42 3.14 0x2A 0b1010 0o777").unwrap();
		assert_eq!(tokens.len(), 5);
		assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::Number));
		assert_eq!(tokens[0].value(), "42");
		assert_eq!(tokens[1].kind, TokenKind::Literal(Literal::Number));
		assert_eq!(tokens[1].value(), "3.14");
		assert_eq!(tokens[2].kind, TokenKind::Literal(Literal::Number));
		assert_eq!(tokens[2].value(), "0x2A");
		assert_eq!(tokens[3].kind, TokenKind::Literal(Literal::Number));
		assert_eq!(tokens[3].value(), "0b1010");
		assert_eq!(tokens[4].kind, TokenKind::Literal(Literal::Number));
		assert_eq!(tokens[4].value(), "0o777");
	}

	#[test]
	fn test_tokenize_strings() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "'hello' \"world\"").unwrap();
		assert_eq!(tokens.len(), 2);
		assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::Text));
		assert_eq!(tokens[0].value(), "hello");
		assert_eq!(tokens[1].kind, TokenKind::Literal(Literal::Text));
		assert_eq!(tokens[1].value(), "world");
	}

	#[test]
	fn test_tokenize_variables() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "$1 + $user_id").unwrap();
		assert_eq!(tokens.len(), 3);
		assert_eq!(tokens[0].kind, TokenKind::Variable);
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Plus));
		assert_eq!(tokens[2].kind, TokenKind::Variable);
	}

	#[test]
	fn test_tokenize_operators() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "a >= b && c != d").unwrap();
		assert_eq!(tokens.len(), 7);
		assert_eq!(tokens[0].kind, TokenKind::Identifier);
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::RightAngleEqual));
		assert_eq!(tokens[2].kind, TokenKind::Identifier);
		assert_eq!(tokens[3].kind, TokenKind::Operator(Operator::DoubleAmpersand));
		assert_eq!(tokens[4].kind, TokenKind::Identifier);
		assert_eq!(tokens[5].kind, TokenKind::Operator(Operator::BangEqual));
		assert_eq!(tokens[6].kind, TokenKind::Identifier);
	}

	#[test]
	fn test_tokenize_keywords_case_insensitive() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "map Map MAP").unwrap();
		assert_eq!(tokens.len(), 3);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::Map));
	}

	// Renamed from the garbled `test_tokenize_comptokenize_query`.
	#[test]
	fn test_tokenize_complete_query() {
		let bump = Bump::new();
		let query = "MAP name, age FROM users WHERE age > 18 AND status = 'active'";
		let tokens = tokenize(&bump, query).unwrap();

		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Identifier);
		assert_eq!(tokens[2].kind, TokenKind::Separator(Separator::Comma));
		assert_eq!(tokens[3].kind, TokenKind::Identifier);
		assert_eq!(tokens[4].kind, TokenKind::Keyword(Keyword::From));
		assert_eq!(tokens[5].kind, TokenKind::Identifier);
		assert_eq!(tokens[6].kind, TokenKind::Identifier);
		assert_eq!(tokens[7].kind, TokenKind::Identifier);
		assert_eq!(tokens[8].kind, TokenKind::Operator(Operator::RightAngle));
		assert_eq!(tokens[9].kind, TokenKind::Literal(Literal::Number));
		assert_eq!(tokens[10].kind, TokenKind::Operator(Operator::And));
		assert_eq!(tokens[11].kind, TokenKind::Identifier);
		assert_eq!(tokens[12].kind, TokenKind::Operator(Operator::Equal));
		assert_eq!(tokens[13].kind, TokenKind::Literal(Literal::Text));
		assert_eq!(tokens[13].value(), "active");
	}

	#[test]
	fn test_tokenize_desc_keyword() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "DESC").unwrap();
		assert_eq!(tokens.len(), 1);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Desc));
	}

	#[test]
	fn test_tokenize_single_char_identifier() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "a").unwrap();
		assert_eq!(tokens.len(), 1);
		assert_eq!(tokens[0].kind, TokenKind::Identifier);
		assert_eq!(tokens[0].value(), "a");
	}

	#[test]
	fn test_tokenize_boolean_literals() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "true false TRUE FALSE").unwrap();
		assert_eq!(tokens.len(), 4);
		assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::True));
		assert_eq!(tokens[1].kind, TokenKind::Literal(Literal::False));
		assert_eq!(tokens[2].kind, TokenKind::Literal(Literal::True));
		assert_eq!(tokens[3].kind, TokenKind::Literal(Literal::False));
	}

	#[test]
	fn test_tokenize_inline_comment() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "MAP * FROM users # comment").unwrap();
		assert_eq!(tokens.len(), 4);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
		assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
		assert_eq!(tokens[3].kind, TokenKind::Identifier);
	}

	#[test]
	fn test_tokenize_comment_only() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "# just a comment").unwrap();
		assert_eq!(tokens.len(), 0);
	}

	// A '#' inside a string literal must not start a comment.
	#[test]
	fn test_tokenize_hash_in_string_literal() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "'hello # world'").unwrap();
		assert_eq!(tokens.len(), 1);
		assert_eq!(tokens[0].kind, TokenKind::Literal(Literal::Text));
		assert_eq!(tokens[0].value(), "hello # world");
	}

	#[test]
	fn test_tokenize_comment_between_lines() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "MAP *\n# comment\nFROM users").unwrap();
		assert_eq!(tokens.len(), 4);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
		assert_eq!(tokens[2].kind, TokenKind::Keyword(Keyword::From));
		assert_eq!(tokens[3].kind, TokenKind::Identifier);
	}

	#[test]
	fn test_tokenize_empty_comment() {
		let bump = Bump::new();
		let tokens = tokenize(&bump, "#\nMAP *").unwrap();
		assert_eq!(tokens.len(), 2);
		assert_eq!(tokens[0].kind, TokenKind::Keyword(Keyword::Map));
		assert_eq!(tokens[1].kind, TokenKind::Operator(Operator::Asterisk));
	}
}