use crate::diagnostic::Diagnostic;
use crate::token::SourceLocation;
use crate::token::Token;
use crate::token::TokenKind;
use crate::token::GITQL_RESERVED_KEYWORDS;
/// Character-level scanner that turns query text into a list of [`Token`]s.
///
/// The tokenizer borrows the characters it scans and keeps a cursor (`index`)
/// plus the line/column bookkeeping that is copied into every
/// [`SourceLocation`] it produces.
pub struct Tokenizer<'a> {
/// The characters being scanned.
content: &'a [char],
/// Cached `content.len()`, set once in `new`.
content_len: usize,
/// Position in `content` of the character currently being examined.
index: usize,
/// Line on which the token currently being scanned starts (initialised to 1).
line_start: u32,
/// Line of the cursor; incremented each time a newline is consumed.
line_end: u32,
/// Column at which the token currently being scanned starts.
column_start: u32,
/// Column of the cursor; incremented by `advance`/`advance_n` and reset to 0
/// after a newline.
column_end: u32,
}
impl<'a> Tokenizer<'a> {
pub(crate) fn new(chars: &'a [char]) -> Tokenizer<'a> {
Tokenizer {
content: chars,
content_len: chars.len(),
index: 0,
line_start: 1,
line_end: 1,
column_start: 0,
column_end: 0,
}
}
/// Tokenizes a query given as a string slice.
///
/// The text is first expanded into a `Vec<char>` so the scanner can index
/// characters directly, then handed to a fresh tokenizer.
///
/// # Errors
///
/// Returns the first diagnostic reported while scanning.
pub fn tokenize(chars: &'a str) -> Result<Vec<Token>, Box<Diagnostic>> {
    let buffer = chars.chars().collect::<Vec<char>>();
    // `Tokenizer::new` (not `Self::new`) so the borrow of the local buffer
    // is not forced to live as long as `'a`.
    let mut tokenizer = Tokenizer::new(&buffer);
    tokenizer.tokenize_characters()
}
fn current_source_location(&self) -> SourceLocation {
SourceLocation {
line_start: self.line_start,
line_end: self.line_end,
column_start: self.column_start,
column_end: self.column_end,
}
}
fn tokenize_characters(&mut self) -> Result<Vec<Token>, Box<Diagnostic>> {
let mut tokens: Vec<Token> = Vec::new();
let len = self.content_len;
while self.has_next() {
self.column_start = self.column_end;
self.line_start = self.line_end;
let char = self.content[self.index];
if char.is_alphabetic() {
tokens.push(self.consume_identifier());
continue;
}
if char == '@' {
if self.is_next_char('>') {
self.index += 2;
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::AtRightArrow, location));
continue;
}
tokens.push(self.consume_global_variable_name()?);
continue;
}
if char.is_numeric() {
if char == '0' && self.index + 1 < len {
match self.content[self.index + 1] {
'b' | 'B' => {
self.index += 2;
self.column_start += 2;
tokens.push(self.consume_binary_number()?);
continue;
}
'x' | 'X' => {
self.index += 2;
self.column_start += 2;
tokens.push(self.consume_hex_number()?);
continue;
}
'o' | 'O' => {
self.index += 2;
self.column_start += 2;
tokens.push(self.consume_octal_number()?);
continue;
}
_ => {
tokens.push(self.consume_number()?);
continue;
}
}
}
tokens.push(self.consume_number()?);
continue;
}
if char == '\'' {
tokens.push(self.consume_string_in_single_quotes()?);
continue;
}
if char == '"' {
tokens.push(self.consume_string_in_double_quotes()?);
continue;
}
if char == '`' {
tokens.push(self.consume_backticks_identifier()?);
continue;
}
if char == '+' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Plus, location));
self.advance();
continue;
}
if char == '-' {
if self.is_next_char('-') {
self.consume_single_line_comment();
continue;
}
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Minus, location));
self.advance();
continue;
}
if char == '*' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Star, location));
self.advance();
continue;
}
if char == '/' {
if self.is_next_char('*') {
self.consume_c_style_block_comment()?;
continue;
}
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Slash, location));
self.advance();
continue;
}
if char == '%' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Percentage, location));
self.advance();
continue;
}
if char == '^' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Caret, location));
self.advance();
continue;
}
if char == '~' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::BitwiseNot, location));
self.advance();
continue;
}
if char == '|' {
let location = self.current_source_location();
self.advance();
let kind = if self.is_current_char('|') {
self.advance();
TokenKind::OrOr
} else {
TokenKind::BitwiseOr
};
tokens.push(Token::new(kind, location));
continue;
}
if char == '&' {
let location = self.current_source_location();
self.advance();
let kind = if self.is_current_char('&') {
self.advance();
TokenKind::AndAnd
} else {
TokenKind::BitwiseAnd
};
tokens.push(Token::new(kind, location));
continue;
}
if char == '#' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::BitwiseXor, location));
self.advance();
continue;
}
if char == ',' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Comma, location));
self.advance();
continue;
}
if char == '.' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Dot, location));
self.advance();
continue;
}
if char == '>' {
let location = self.current_source_location();
self.advance();
let kind = if self.is_current_char('=') {
self.advance();
TokenKind::GreaterEqual
} else if self.is_current_char('>') {
self.advance();
TokenKind::BitwiseRightShift
} else {
TokenKind::Greater
};
tokens.push(Token::new(kind, location));
continue;
}
if char == '<' {
let location = self.current_source_location();
self.advance();
let kind = if self.is_current_char('=') {
self.advance();
if self.is_current_char('>') {
self.advance();
TokenKind::NullSafeEqual
} else {
TokenKind::LessEqual
}
} else if self.is_current_char('<') {
self.advance();
TokenKind::BitwiseLeftShift
} else if self.is_current_char('>') {
self.advance();
TokenKind::BangEqual
} else if self.is_current_char('@') {
self.advance();
TokenKind::ArrowRightAt
} else {
TokenKind::Less
};
tokens.push(Token::new(kind, location));
continue;
}
if char == '=' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Equal, location));
self.advance();
continue;
}
if char == ':' {
let location = self.current_source_location();
if self.is_next_char('=') {
tokens.push(Token::new(TokenKind::ColonEqual, location));
self.advance_n(2);
continue;
}
if self.is_next_char(':') {
tokens.push(Token::new(TokenKind::ColonColon, location));
self.advance_n(2);
continue;
}
tokens.push(Token::new(TokenKind::Colon, location));
self.advance();
continue;
}
if char == '!' {
let location = self.current_source_location();
self.advance();
let kind = if self.is_current_char('=') {
self.advance();
TokenKind::BangEqual
} else {
TokenKind::Bang
};
tokens.push(Token::new(kind, location));
continue;
}
if char == '(' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::LeftParen, location));
self.advance();
continue;
}
if char == ')' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::RightParen, location));
self.advance();
continue;
}
if char == '[' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::LeftBracket, location));
self.advance();
continue;
}
if char == ']' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::RightBracket, location));
self.advance();
continue;
}
if char == ';' {
let location = self.current_source_location();
tokens.push(Token::new(TokenKind::Semicolon, location));
self.advance();
continue;
}
if char == ' ' || char == '\t' {
self.advance();
continue;
}
if char == '\n' {
self.advance();
self.column_end = 0;
self.line_end += 1;
continue;
}
return Err(Diagnostic::error("Unexpected character")
.with_location(self.current_source_location())
.as_boxed());
}
Ok(tokens)
}
/// Consumes a global variable name such as `@name`.
///
/// The cursor must be on the `@`. The character right after it has to be
/// alphabetic; the rest of the name may contain alphanumeric characters
/// and underscores. The produced [`TokenKind::GlobalVariable`] carries the
/// lower-cased name, including the leading `@`.
///
/// # Errors
///
/// Returns a diagnostic when `@` is not followed by an alphabetic
/// character (this also covers `@` at the end of input).
fn consume_global_variable_name(&mut self) -> Result<Token, Box<Diagnostic>> {
    let start_index = self.index;

    // Skip the `@`.
    self.advance();

    // The first character must be alphabetic, as the diagnostic states;
    // digits are only allowed later in the name.
    if !self.is_current_char_func(|c| c.is_alphabetic()) {
        return Err(Diagnostic::error(
            "Global variable name must start with alphabetic character",
        )
        .add_help("Add at least one alphabetic character after @")
        .with_location(self.current_source_location())
        .as_boxed());
    }

    while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
        self.advance();
    }

    // Names are case-insensitive, so they are stored lower-cased.
    let name = self.content[start_index..self.index]
        .iter()
        .collect::<String>()
        .to_lowercase();

    let location = self.current_source_location();
    Ok(Token::new(TokenKind::GlobalVariable(name), location))
}
/// Consumes a run of alphanumeric characters and underscores.
///
/// The text is lower-cased and looked up in `GITQL_RESERVED_KEYWORDS`:
/// a hit yields the keyword's token kind, a miss yields
/// [`TokenKind::Symbol`] holding the lower-cased text.
fn consume_identifier(&mut self) -> Token {
    let begin = self.index;
    loop {
        if !self.is_current_char_func(|c| c.is_alphanumeric() || c == '_') {
            break;
        }
        self.advance();
    }

    let lowered = self.content[begin..self.index]
        .iter()
        .collect::<String>()
        .to_lowercase();

    let keyword = GITQL_RESERVED_KEYWORDS.get(lowered.as_str()).cloned();
    let kind = match keyword {
        Some(reserved) => reserved,
        None => TokenKind::Symbol(lowered),
    };

    Token::new(kind, self.current_source_location())
}
/// Consumes an identifier wrapped in backticks, e.g. `` `my column` ``.
///
/// The cursor must be on the opening backtick. The text between the
/// backticks is returned verbatim (no lower-casing) as a
/// [`TokenKind::Symbol`].
///
/// # Errors
///
/// Returns `"Unterminated backticks"` when the input ends before a closing
/// backtick is found.
fn consume_backticks_identifier(&mut self) -> Result<Token, Box<Diagnostic>> {
    let start_index = self.index;

    // Skip the opening backtick.
    self.advance();

    // Stop at the closing backtick or at the end of input; without the
    // `has_next` guard an unterminated identifier would never stop scanning.
    while self.has_next() && !self.is_current_char('`') {
        self.advance();
    }

    if !self.has_next() {
        return Err(Diagnostic::error("Unterminated backticks")
            .add_help("Add ` at the end of the identifier")
            .with_location(self.current_source_location())
            .as_boxed());
    }

    // Skip the closing backtick.
    self.advance();

    // Slice out the text strictly between the two backticks.
    let identifier: String = self.content[start_index + 1..self.index - 1]
        .iter()
        .collect();

    let location = self.current_source_location();
    Ok(Token::new(TokenKind::Symbol(identifier), location))
}
fn consume_number(&mut self) -> Result<Token, Box<Diagnostic>> {
let start_index = self.index;
while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
self.advance();
}
let mut is_float_value = false;
if self.is_current_char('.') {
self.advance();
is_float_value = true;
while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
self.advance();
}
}
let literal = &self.content[start_index..self.index];
let string: String = literal.iter().collect();
let literal_num = string.replace('_', "");
let location = self.current_source_location();
if is_float_value {
return match literal_num.parse::<f64>() {
Ok(float) => Ok(Token::new(TokenKind::Float(float), location)),
Err(parse_float_error) => Err(Diagnostic::error(&parse_float_error.to_string())
.add_note(&format!(
"Value must be between {} and {}",
f64::MIN,
f64::MAX
))
.with_location(self.current_source_location())
.as_boxed()),
};
}
match literal_num.parse::<i64>() {
Ok(integer) => Ok(Token::new(TokenKind::Integer(integer), location)),
Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
.add_note(&format!(
"Value must be between {} and {}",
i64::MIN,
i64::MAX
))
.with_location(self.current_source_location())
.as_boxed()),
}
}
/// Consumes the digits of a binary literal; the `0b` prefix has already
/// been skipped by the caller.
///
/// Only `0`, `1` and the `_` separator belong to the literal, so scanning
/// stops at the first other character (for example the `;` in `0b101;`).
/// Underscores are removed before the value is parsed as an `i64`.
///
/// # Errors
///
/// Returns a diagnostic when no digit follows the prefix, or when the
/// collected digits cannot be parsed as an `i64` in base 2.
fn consume_binary_number(&mut self) -> Result<Token, Box<Diagnostic>> {
    const BINARY_RADIX: u32 = 2;

    let start_index = self.index;
    while self.is_current_char_func(|c| c == '_' || c == '0' || c == '1') {
        self.advance();
    }

    if start_index == self.index {
        return Err(
            Diagnostic::error("Missing digits after the integer base prefix")
                .add_help("Expect at least one binary digits after the prefix 0b")
                .add_help("Binary digit mean 0 or 1")
                .with_location(self.current_source_location())
                .as_boxed(),
        );
    }

    let literal = &self.content[start_index..self.index];
    let string: String = literal.iter().collect();
    let literal_num = string.replace('_', "");

    match i64::from_str_radix(&literal_num, BINARY_RADIX) {
        Ok(integer) => {
            let location = self.current_source_location();
            Ok(Token::new(TokenKind::Integer(integer), location))
        }
        Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
            .add_note(&format!(
                "Value must be between {} and {}",
                i64::MIN,
                i64::MAX
            ))
            .with_location(self.current_source_location())
            .as_boxed()),
    }
}
/// Consumes the digits of an octal literal; the `0o` prefix has already
/// been skipped by the caller.
///
/// Only the digits `0` to `7` and the `_` separator belong to the literal.
/// Underscores are removed before the value is parsed as an `i64`.
///
/// # Errors
///
/// Returns a diagnostic when no digit follows the prefix, or when the
/// collected digits cannot be parsed as an `i64` in base 8.
fn consume_octal_number(&mut self) -> Result<Token, Box<Diagnostic>> {
    const OCTAL_RADIX: u32 = 8;

    let start_index = self.index;
    // `8` and `9` are not octal digits, so the range ends at `7`.
    while self.is_current_char_func(|c| c == '_' || ('0'..='7').contains(&c)) {
        self.advance();
    }

    if start_index == self.index {
        return Err(
            Diagnostic::error("Missing digits after the integer base prefix")
                .add_help("Expect at least one octal digits after the prefix 0o")
                .add_help("Octal digit mean 0 to 7 number")
                .with_location(self.current_source_location())
                .as_boxed(),
        );
    }

    let literal = &self.content[start_index..self.index];
    let string: String = literal.iter().collect();
    let literal_num = string.replace('_', "");

    match i64::from_str_radix(&literal_num, OCTAL_RADIX) {
        Ok(integer) => {
            let location = self.current_source_location();
            Ok(Token::new(TokenKind::Integer(integer), location))
        }
        Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
            .add_note(&format!(
                "Value must be between {} and {}",
                i64::MIN,
                i64::MAX
            ))
            .with_location(self.current_source_location())
            .as_boxed()),
    }
}
fn consume_hex_number(&mut self) -> Result<Token, Box<Diagnostic>> {
let start_index = self.index;
while self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
self.advance();
}
if start_index == self.index {
return Err(
Diagnostic::error("Missing digits after the integer base prefix")
.add_help("Expect at least one hex digits after the prefix 0x")
.add_help("Hex digit mean 0 to 9 and a to f")
.with_location(self.current_source_location())
.as_boxed(),
);
}
let literal = &self.content[start_index..self.index];
let string: String = literal.iter().collect();
let literal_num = string.replace('_', "");
const HEX_RADIX: u32 = 16;
match i64::from_str_radix(&literal_num, HEX_RADIX) {
Ok(integer) => {
let location = self.current_source_location();
Ok(Token::new(TokenKind::Integer(integer), location))
}
Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
.add_note(&format!(
"Value must be between {} and {}",
i64::MIN,
i64::MAX
))
.with_location(self.current_source_location())
.as_boxed()),
}
}
/// Consumes a `'...'` string literal and returns it as a
/// [`TokenKind::String`] token.
///
/// # Errors
///
/// Returns `"Unterminated single quote string"` when the cursor is at the
/// end of input after the string body has been read.
fn consume_string_in_single_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
    let text = self.consume_string_with_around('\'')?;

    if !self.has_next() {
        return Err(Diagnostic::error("Unterminated single quote string")
            .add_help("Add \' at the end of the String literal")
            .with_location(self.current_source_location())
            .as_boxed());
    }

    // Step over the closing quote.
    self.advance();
    Ok(Token::new(
        TokenKind::String(text),
        self.current_source_location(),
    ))
}
/// Consumes a `"..."` string literal and returns it as a
/// [`TokenKind::String`] token.
///
/// # Errors
///
/// Returns `"Unterminated double quote string"` when the cursor is at the
/// end of input after the string body has been read.
fn consume_string_in_double_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
    let text = self.consume_string_with_around('"')?;

    if !self.has_next() {
        return Err(Diagnostic::error("Unterminated double quote string")
            .add_help("Add \" at the end of the String literal")
            .with_location(self.current_source_location())
            .as_boxed());
    }

    // Step over the closing quote.
    self.advance();
    Ok(Token::new(
        TokenKind::String(text),
        self.current_source_location(),
    ))
}
/// Reads the body of a string literal delimited by `around`.
///
/// The cursor must be on the opening delimiter. On return it sits on the
/// closing delimiter, or at the end of input when the literal is
/// unterminated; the caller checks which and consumes the delimiter.
///
/// Recognised escapes are `\'`, `\"`, `\\`, `\n`, `\r` and `\t`. Any other
/// backslash is kept literally, followed by the character after it.
///
/// # Errors
///
/// This function currently never returns `Err`; the `Result` is part of
/// the signature used by its callers.
fn consume_string_with_around(&mut self, around: char) -> Result<String, Box<Diagnostic>> {
    // Skip the opening delimiter.
    self.advance();

    let mut buffer = String::new();

    // The `has_next` guard stops the scan at the end of input so an
    // unterminated literal is reported by the caller instead of indexing
    // past the end of `content`.
    while self.has_next() && !self.is_current_char(around) {
        let current = self.content[self.index];

        // Ordinary character, or a backslash that is the very last
        // character of the input: copy it as is.
        if current != '\\' || self.is_last() {
            buffer.push(current);
            self.advance();
            continue;
        }

        // A character follows the backslash (guaranteed by `is_last` above).
        let escaped = match self.content[self.index + 1] {
            '\'' => Some('\''),
            '"' => Some('"'),
            '\\' => Some('\\'),
            'n' => Some('\n'),
            'r' => Some('\r'),
            't' => Some('\t'),
            _ => None,
        };

        match escaped {
            Some(character) => {
                buffer.push(character);
                // Skip both the backslash and the escape character.
                self.advance_n(2);
            }
            None => {
                // Unknown escape: keep the backslash; the following
                // character is copied by the next iteration.
                buffer.push('\\');
                self.advance();
            }
        }
    }

    Ok(buffer)
}
/// Skips a `-- ...` comment up to and including the end of its line.
///
/// The cursor must be on the first `-`. A comment on the last line of the
/// input, with no trailing newline, simply ends at the end of input.
fn consume_single_line_comment(&mut self) {
    // Skip the `--`.
    self.advance_n(2);

    // Stop at the newline or at the end of input, whichever comes first.
    while self.has_next() && !self.is_current_char('\n') {
        self.advance();
    }

    // Consume the newline, if there is one, and start a new line.
    if self.is_current_char('\n') {
        self.advance();
        self.line_end += 1;
        self.column_end = 0;
    }
}
/// Skips a `/* ... */` comment, including nested `/* ... */` pairs.
///
/// The cursor must be on the opening `/`. Every `/*` met inside the
/// comment raises the nesting depth and every `*/` lowers it; the `*/`
/// seen at depth zero closes the comment. The closer is checked before the
/// first advance, so an empty comment `/**/` is recognised.
///
/// # Errors
///
/// Returns `"C Style comment must end with */"` when the input ends before
/// the closing `*/`.
fn consume_c_style_block_comment(&mut self) -> Result<(), Box<Diagnostic>> {
    // Skip the opening `/*`.
    self.advance_n(2);

    let mut nesting_depth: usize = 0;

    // Bounded by the input length so an unterminated comment falls through
    // to the diagnostic below instead of scanning forever.
    while self.has_next() {
        if self.is_current_char('*') && self.is_next_char('/') {
            if nesting_depth == 0 {
                break;
            }
            nesting_depth -= 1;
        } else if self.is_current_char('/') && self.is_next_char('*') {
            nesting_depth += 1;
        }
        self.advance();
    }

    // Fewer than two characters left means no closing `*/` was found.
    if self.index + 2 > self.content_len {
        return Err(Diagnostic::error("C Style comment must end with */")
            .add_help("Add */ at the end of C Style comments")
            .with_location(self.current_source_location())
            .as_boxed());
    }

    // Skip the closing `*/`.
    self.advance_n(2);
    Ok(())
}
/// Moves the cursor one character forward and bumps the end column by one.
fn advance(&mut self) {
    self.column_end += 1;
    self.index += 1;
}
/// Moves the cursor `n` characters forward and bumps the end column by `n`.
fn advance_n(&mut self, n: usize) {
    let columns = n as u32;
    self.column_end += columns;
    self.index += n;
}
/// Returns `true` when the cursor is inside the input and the character
/// under it equals `ch`.
fn is_current_char(&self, ch: char) -> bool {
    if !self.has_next() {
        return false;
    }
    self.content[self.index] == ch
}
/// Returns `true` when a character exists right after the cursor and it
/// equals `ch`.
fn is_next_char(&self, ch: char) -> bool {
    let next_index = self.index + 1;
    if next_index >= self.content_len {
        return false;
    }
    self.content[next_index] == ch
}
/// Returns `true` when the cursor is inside the input and `func` accepts
/// the character under it.
fn is_current_char_func(&self, func: fn(char) -> bool) -> bool {
    if self.index >= self.content_len {
        return false;
    }
    let current = self.content[self.index];
    func(current)
}
/// Returns `true` while the cursor has not run past the last character.
fn has_next(&self) -> bool {
    self.content_len > self.index
}
/// Returns `true` when the cursor is on the last character of the input.
///
/// Written as `index + 1 == content_len` so that empty content yields
/// `false` instead of underflowing `content_len - 1`.
fn is_last(&self) -> bool {
    self.index + 1 == self.content_len
}
}