use crate::util::compare_strs;
use crate::common::{ c_char, Number, NumberFormat, Identifier, MAX_IDENTIFIER_LENGTH, MAX_NUMBER_LENGTH, InternalIdentifierType, INTERNAL_IDENTIFIERS, OPERATOR_TYPE_SYMS };
use crate::tracking::SourceRegion;
use super::iterator::LexerIterator;
use super::token::{ Token, TokenData };
/// Outcome of running a single lexlet against the current lexer position.
pub enum LexerResult {
/// The lexlet did not recognise the input at the current position
/// (the lexlets in this file return this without consuming anything).
None,
/// Input was consumed but produces no token; returned by the whitespace
/// and comment lexlets.
Ignore,
/// Input was consumed and produced the contained token.
Some(Token)
}
/// Capacity, in characters, of the buffer an `OpSym` accumulates into.
const MAX_OP_SYM_LENGTH: usize = 16;

/// Fixed-capacity character buffer used to accumulate the characters of an
/// operator symbol while it is being matched.
pub struct OpSym {
    /// Backing storage; only the first `length` entries are meaningful.
    pub chars: [ char; MAX_OP_SYM_LENGTH ],
    /// Number of characters currently stored in `chars`.
    pub length: usize
}

impl OpSym {
    /// Creates an empty symbol buffer (all slots NUL, length zero).
    pub fn new () -> Self {
        let chars = [ '\0'; MAX_OP_SYM_LENGTH ];
        Self { chars, length: 0 }
    }
}

impl std::fmt::Display for OpSym {
    /// Writes the first `length` characters of the buffer, in order.
    fn fmt (&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        (0..self.length).try_for_each(|i| write!(f, "{}", self.chars[i]))
    }
}
/// Skips a run of whitespace characters.
///
/// Returns `LexerResult::None` when the current character is not whitespace,
/// otherwise consumes every consecutive whitespace character and returns
/// `LexerResult::Ignore`.
fn lex_whitespace (it: &mut LexerIterator) -> LexerResult {
    if !it.curr.is_whitespace() {
        return LexerResult::None;
    }

    // The first character is known to be whitespace; consume it and then
    // everything after it that still is.
    it.advance();
    while it.curr.is_whitespace() {
        it.advance();
    }

    LexerResult::Ignore
}
/// Skips a `//` line comment or a `/* ... */` block comment.
///
/// Block comments nest: an inner `/*` is handled by a recursive call. A block
/// comment that reaches the end of input without `*/` is reported through
/// `simple_error`. Returns `LexerResult::Ignore` when a comment was consumed
/// and `LexerResult::None` when the input does not start a comment.
fn lex_comment (it: &mut LexerIterator) -> LexerResult {
    let lookahead = it.peek_next();
    let is_line = lookahead == '/';
    let is_block = lookahead == '*';

    if it.curr != '/' || !(is_line || is_block) {
        return LexerResult::None;
    }

    // Step over the two-character opening tag.
    it.advance_n(2);

    if is_line {
        // The terminating newline itself is left for the next lexlet.
        while it.curr != '\n' && it.valid() {
            it.advance();
        }
        return LexerResult::Ignore;
    }

    let mut closed = false;
    while it.valid() {
        match (it.curr, it.peek_next()) {
            // Nested block comment: let the recursive call consume it whole.
            ('/', '*') => {
                lex_comment(it);
            },
            ('*', '/') => {
                it.advance_n(2);
                closed = true;
                break
            },
            _ => {
                it.advance();
            }
        }
    }

    if !closed {
        it.simple_error("Unexpected end of input, multi-line comment has no closing tag".to_string());
    }

    LexerResult::Ignore
}
/// Returns `true` when `c` may begin an identifier: an ASCII letter or `_`.
fn match_ident_first_char (c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' => true,
        _ => false
    }
}
/// Returns `true` when `c` may continue an identifier: an ASCII letter, an
/// ASCII digit, or `_`.
fn match_ident_nth_char (c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
        _ => false
    }
}
fn lex_identifier (it: &mut LexerIterator) -> LexerResult {
if match_ident_first_char(it.curr) {
let start = it.index;
let mut ident = Identifier {
value: [ '\0' as c_char; MAX_IDENTIFIER_LENGTH ],
length: 1
};
ident.value[0] = it.curr as c_char;
it.advance();
let mut err = false;
while match_ident_nth_char(it.curr) {
if ident.length < MAX_IDENTIFIER_LENGTH {
ident.value[ident.length] = it.curr as c_char;
ident.length += 1;
} else {
err = true;
}
it.advance();
}
let source = SourceRegion { start, end: it.index };
if err {
it.error(source, format!("Identifier is too long ({}), max length is {}", source.len(), MAX_IDENTIFIER_LENGTH));
} else {
for internal in INTERNAL_IDENTIFIERS.iter() {
let in_ident = internal.0;
let in_len = in_ident.len();
if in_len == ident.length
&& compare_strs(&ident.value, in_ident, in_len) {
return LexerResult::Some(Token {
data: match internal.1 {
InternalIdentifierType::Nil => TokenData::Nil,
InternalIdentifierType::Ignore => TokenData::Ignore,
InternalIdentifierType::Boolean => TokenData::Boolean(internal.2 != 0),
InternalIdentifierType::ConstantNumber => TokenData::Number(Number::Constant(num::FromPrimitive::from_u8(internal.2).expect("Internal error converting INTERNAL_IDENTIFIER to ConstantNumber"))),
InternalIdentifierType::Operator => TokenData::Operator(num::FromPrimitive::from_u8(internal.2).expect("Internal error converting INTERNAL_IDENTIFIER to OperatorType")),
InternalIdentifierType::Keyword => TokenData::Keyword(num::FromPrimitive::from_u8(internal.2).expect("Internal error converting INTERNAL_IDENTIFIER to KeywordType"))
},
source
})
}
}
}
LexerResult::Some(Token {
data: TokenData::Identifier(ident),
source
})
} else {
LexerResult::None
}
}
/// Wrapper around a `LexerIterator` that also collects the characters making
/// up a number literal into a fixed-size buffer.
struct NumberIterator<'a, 'b> {
    /// Collected literal characters; only the first `length` are meaningful.
    digits: [ char; MAX_NUMBER_LENGTH ],
    /// Number of characters stored in `digits`.
    length: usize,
    /// The underlying lexer cursor that all navigation is delegated to.
    lexer_it: &'a mut LexerIterator<'b>,
    /// Set once a character had to be dropped because `digits` was full.
    len_err: bool,
}

impl<'a, 'b> NumberIterator<'a, 'b> {
    /// Wraps `lexer_it` with an empty digit buffer and no recorded error.
    fn new (lexer_it: &'a mut LexerIterator<'b>) -> Self {
        Self {
            lexer_it,
            digits: [ '\0'; MAX_NUMBER_LENGTH ],
            length: 0,
            len_err: false
        }
    }

    /// Advances the underlying iterator without storing anything.
    fn advance (&mut self) -> char {
        self.lexer_it.advance()
    }

    /// Delegates to `LexerIterator::valid`.
    fn valid (&self) -> bool {
        self.lexer_it.valid()
    }

    /// Delegates to `LexerIterator::peek_prev`.
    fn peek_prev (&self) -> char {
        self.lexer_it.peek_prev()
    }

    /// Delegates to `LexerIterator::peek_next`.
    fn peek_next (&self) -> char {
        self.lexer_it.peek_next()
    }

    /// Delegates to `LexerIterator::get_simple_err_region`.
    fn get_simple_err_region (&self) -> SourceRegion {
        self.lexer_it.get_simple_err_region()
    }

    /// Current index of the underlying iterator.
    fn index (&self) -> usize {
        self.lexer_it.index
    }

    /// Current character of the underlying iterator.
    fn curr (&self) -> char {
        self.lexer_it.curr
    }

    /// Stores the current character in the digit buffer and advances.
    ///
    /// When the buffer is already full the character is dropped, `len_err`
    /// is set, and the iterator still advances.
    fn add_curr (&mut self) {
        match self.digits.get_mut(self.length) {
            Some(slot) => {
                *slot = self.lexer_it.curr;
                self.length += 1;
            },
            None => self.len_err = true
        }
        self.lexer_it.advance();
    }

    /// Forwards a warning for `region` to the underlying iterator.
    pub fn warning (&mut self, region: SourceRegion, msg: String) {
        self.lexer_it.warning(region, msg);
    }

    /// Forwards an error for `region` to the underlying iterator.
    pub fn error (&mut self, region: SourceRegion, msg: String) {
        self.lexer_it.error(region, msg);
    }

    /// Reports a warning at the iterator's simple error region.
    pub fn simple_warning (&mut self, msg: String) {
        let region = self.lexer_it.get_simple_err_region();
        self.lexer_it.warning(region, msg);
    }

    /// Reports an error at the iterator's simple error region.
    pub fn simple_error (&mut self, msg: String) {
        let region = self.lexer_it.get_simple_err_region();
        self.lexer_it.error(region, msg);
    }
}
/// Lexes a number literal.
///
/// A literal starts with an ASCII digit, or with `.` immediately followed by
/// an ASCII digit. Recognised forms:
///
/// * decimal integers (`123`),
/// * decimal floating point with an optional fraction and an optional
///   exponent introduced by `e` with an optional sign (`1.5`, `.5`, `2e-3`),
/// * hexadecimal (`0x...`) and binary (`0b...`) integers.
///
/// `_` is accepted as a digit separator and is not stored. A `.` that is
/// followed by another `.` is never consumed.
///
/// The exponent marker `e` is only honoured for base-10 literals. Previously
/// a binary literal such as `0b1e5` silently turned into the float `1e5`; it
/// is now reported as containing invalid digits, like any other stray letter.
///
/// Problems (no digits after `0x`/`0b`, missing exponent digits, invalid
/// digits, over-long literal, parse failure) are reported through the
/// iterator and the returned token then carries a zero value.
///
/// Returns `LexerResult::None` when the input does not start a number.
fn lex_number (it: &mut LexerIterator) -> LexerResult {
    let first_is_digit = it.curr.is_ascii_digit();

    if first_is_digit
    || (it.curr == '.' && it.peek_next().is_ascii_digit()) {
        let start = it.index;

        let mut num_it = NumberIterator::new(it);

        num_it.add_curr();

        let mut fmt = NumberFormat::DecimalInteger;
        let mut base: u32 = 10;

        if num_it.digits[0] == '0' {
            // `0x` / `0b` opening tags: drop the stored `0` so the buffer
            // only holds the digits that follow the tag.
            if num_it.curr() == 'x' {
                fmt = NumberFormat::Hexadecimal;
                base = 16;
                num_it.advance();
                num_it.length = 0;
            } else if num_it.curr() == 'b' {
                fmt = NumberFormat::Binary;
                base = 2;
                num_it.advance();
                num_it.length = 0;
            }
        } else if num_it.digits[0] == '.' {
            fmt = NumberFormat::DecimalFloatingPoint;
        }

        let mut digit_err = false;
        // `exp`: an exponent marker was seen; `pm`: its sign was seen;
        // `cexp`: at least one character followed the exponent marker/sign.
        let mut exp = false;
        let mut pm = false;
        let mut cexp = false;
        // Underscore separators are only allowed directly after a digit.
        let mut allow_u = first_is_digit;

        while num_it.valid() {
            if num_it.curr().is_digit(base) {
                allow_u = true;
                num_it.add_curr();
                if exp { cexp = true; }
            } else if allow_u && num_it.curr() == '_' {
                num_it.advance();
            } else if num_it.curr() == '.' && num_it.peek_next() != '.' {
                if fmt == NumberFormat::DecimalInteger {
                    num_it.add_curr();
                    fmt = NumberFormat::DecimalFloatingPoint;
                    allow_u = false;
                } else {
                    break
                }
            } else if base == 10 && !exp && num_it.curr() == 'e' {
                // Exponents only exist for decimal literals. In a binary
                // literal `e` falls through to the invalid-digit branch
                // below (in hexadecimal it is an ordinary digit).
                fmt = NumberFormat::DecimalFloatingPoint;
                num_it.add_curr();
                exp = true;
                allow_u = false;
            } else if num_it.curr() == '-' || num_it.curr() == '+' {
                // A sign belongs to the literal only directly after `e`.
                if exp && !pm && !cexp {
                    num_it.add_curr();
                    pm = true;
                } else {
                    break
                }
            } else if num_it.curr().is_ascii_alphabetic() {
                // Keep consuming so the whole malformed literal is covered
                // by a single error region.
                num_it.add_curr();
                if exp { cexp = true; }
                digit_err = true;
            } else {
                break;
            }
        }

        let source = SourceRegion { start, end: num_it.index() };

        let mut err = false;

        if num_it.length == 0 {
            num_it.error(
                source,
                format!(
                    "Expected at least one digit after opening {} literal tag",
                    if fmt == NumberFormat::Hexadecimal { "hexadecimal" }
                    else if fmt == NumberFormat::Binary { "binary" }
                    else { "number" }
                )
            );
            err = true;
        }

        if exp && !cexp {
            num_it.error(source, "Expected an exponent number to follow 'e'".to_string());
            err = true;
        }

        if digit_err {
            num_it.error(source, "Number literal contains invalid digits".to_string());
            err = true;
        }

        if num_it.len_err {
            num_it.error(
                source,
                format!(
                    "Number literal is too long ({}), max length is {} digits (Following any opening tag such as 0x)",
                    source.len(), MAX_NUMBER_LENGTH
                )
            );
            err = true;
        }

        let num = if err {
            Number::Integer(0)
        } else {
            // Re-encode the collected chars as UTF-8 so the standard
            // parsers can be used; 4 bytes per char is the UTF-8 maximum.
            let mut digits_u8 = [ 0u8; MAX_NUMBER_LENGTH * 4 ];
            let mut offset = 0usize;

            for i in 0..num_it.length {
                let digit = num_it.digits[i];
                digit.encode_utf8(&mut digits_u8[offset..]);
                offset += digit.len_utf8();
            }

            // SAFETY: `digits_u8[..offset]` was filled exclusively by
            // `char::encode_utf8`, so it is valid UTF-8.
            let digits_str = unsafe { std::str::from_utf8_unchecked(&digits_u8[..offset]) };

            match fmt {
                NumberFormat::Binary => match u64::from_str_radix(digits_str, 2) {
                    Ok(n) => Number::Integer(n),
                    Err(e) => {
                        it.error(source, format!("Failed to parse binary integer literal with body '{}', internal error: {}", digits_str, e));
                        Number::Integer(0)
                    }
                },
                NumberFormat::DecimalInteger => match digits_str.parse::<u64>() {
                    Ok(n) => Number::Integer(n),
                    Err(e) => {
                        it.error(source, format!("Failed to parse decimal integer literal with body '{}', internal error: {}", digits_str, e));
                        Number::Integer(0)
                    }
                },
                NumberFormat::DecimalFloatingPoint => match digits_str.parse::<f64>() {
                    Ok(n) => Number::FloatingPoint(n),
                    Err(e) => {
                        it.error(source, format!("Failed to parse decimal floating point literal with body '{}', internal error: {}", digits_str, e));
                        Number::FloatingPoint(0.0)
                    }
                },
                NumberFormat::Hexadecimal => match u64::from_str_radix(digits_str, 16) {
                    Ok(n) => Number::Integer(n),
                    Err(e) => {
                        it.error(source, format!("Failed to parse hexadecimal integer literal with body '{}', internal error: {}", digits_str, e));
                        Number::Integer(0)
                    }
                }
            }
        };

        LexerResult::Some(Token {
            data: TokenData::Number(num),
            source
        })
    } else {
        LexerResult::None
    }
}
/// Parses the hexadecimal body of an `\a{..}` escape into a character.
///
/// `code` must be a hexadecimal number that fits in a single byte
/// (`0x00..=0xFF`); the resulting character is the code point with that
/// value. The upper bound is inclusive: the previous `<` comparison rejected
/// `FF` while accepting `FE`.
///
/// # Errors
///
/// Returns a human-readable message when `code` is not valid hexadecimal or
/// when its value does not fit in one byte.
fn parse_ascii_hex_code (code: &str) -> Result<char, String> {
    match u32::from_str_radix(code, 16) {
        // The guard guarantees the narrowing cast is lossless, and every u8
        // maps to a valid `char`, so no `unsafe` is needed.
        Ok(n) if n <= std::u8::MAX as u32 => Ok(char::from(n as u8)),
        Ok(n) => Err(format!("Failed to parse ASCII escape sequence '{}', the resulting integer {} is not a valid ASCII codepoint", code, n)),
        Err(e) => Err(format!("Failed to parse ASCII escape sequence '{}', internal error: {}", code, e))
    }
}
/// Parses the hexadecimal body of a `\u{..}` escape into a character.
///
/// # Errors
///
/// Returns a human-readable message when `code` is not valid hexadecimal or
/// when its value is not a valid Unicode scalar value (for example a
/// surrogate, or a value above `10FFFF`).
fn parse_unicode_hex_code (code: &str) -> Result<char, String> {
    let value = u32::from_str_radix(code, 16)
        .map_err(|e| format!("Failed to parse Unicode escape sequence '{}', internal error: {}", code, e))?;

    std::char::from_u32(value)
        .ok_or_else(|| format!("Failed to parse Unicode escape sequence '{}', the resulting integer {} is not a valid Unicode codepoint", code, value))
}
fn get_hex_escape_body (start: usize, it: &mut LexerIterator, is_ascii: bool) -> char {
if it.curr != '{' {
it.error(SourceRegion { start, end: it.index }, format!("Invalid escape sequence: Expected {{ hex_code }} to follow '\\{}'", it.peek_prev()));
'\0'
} else {
it.advance();
let max_len = if is_ascii { 2 } else { 6 };
let name = if is_ascii { "ASCII" } else { "Unicode" };
let parser = if is_ascii { parse_ascii_hex_code } else { parse_unicode_hex_code };
let mut seq = [ 0u8; 4 * 6 ];
let mut seq_len = 0usize;
let mut seq_offset = 0usize;
let mut closed = false;
let mut len_err = false;
let mut digit_err = false;
let mut started = false;
let mut ended = false;
while it.valid() {
if (!started || ended || seq_len > 0) && it.curr.is_whitespace() {
if started { ended = true; }
it.advance();
continue
} else if !ended && it.curr.is_digit(16) {
started = true;
if seq_len < max_len {
it.curr.encode_utf8(&mut seq[seq_offset..]);
seq_len += 1;
seq_offset += it.curr.len_utf8();
} else {
len_err = true;
}
it.advance();
} else if it.curr == '}' {
closed = true;
it.advance();
break
} else if it.curr == '\'' || it.curr == '"' {
break;
} else {
digit_err = true;
it.advance();
}
}
let source = SourceRegion { start, end: it.index };
let mut err = false;
if digit_err {
it.error(source, format!("{} escape sequence contains invalid characters", name));
err = true;
}
if len_err {
it.error(source, format!("{} escape sequence contains too many digits ({}), max is {}", name, source.len(), max_len));
err = true;
}
if !closed {
it.error(source, format!("Unexpected end of input, expected }} to close {} escape sequence", name));
err = true;
}
if err {
'\0'
} else {
match parser(unsafe { std::str::from_utf8_unchecked(&seq[..seq_offset]) }) {
Ok(c) => c,
Err(e) => {
it.error(source, e);
'\0'
}
}
}
}
}
fn get_escape_sequence (it: &mut LexerIterator) -> char {
assert!(it.curr == '\\', "Internal error: improper usage of get_escape_sequence");
let start = it.index;
it.advance();
match it.curr {
ch @ '\\' | ch @ '\'' | ch @ '"' => {
it.advance();
ch
},
ch @ 't' | ch @ 'r' | ch @ 'n' | ch @ '0' => {
it.advance();
match ch {
't' => '\t',
'r' => '\r',
'n' => '\n',
_ => '\0'
}
}
'a' => {
it.advance();
get_hex_escape_body(start, it, true)
},
'u' => {
it.advance();
get_hex_escape_body(start, it, false)
}
ch @ _ => {
it.advance();
it.error(SourceRegion { start, end: it.index }, format!("'\\{}' is not a valid escape sequence", ch));
'\0'
}
}
}
/// Lexes a single-quoted character literal into a `Number::Character` token.
///
/// Returns `LexerResult::None` when the current character is not `'`.
/// Otherwise a token is always produced. An empty literal, an unterminated
/// literal, an unescaped control character or a missing closing quote is
/// reported as an error and the token then carries `'\0'`.
fn lex_character (it: &mut LexerIterator) -> LexerResult {
    if it.curr != '\'' {
        return LexerResult::None;
    }

    let start = it.index;
    it.advance();

    let value = match it.curr {
        '\'' => {
            it.advance();
            it.error(SourceRegion { start, end: it.index }, "Character literal has no value".to_string());
            '\0'
        },
        '\0' => {
            it.error(SourceRegion { start, end: it.index }, "Unexpected end of input, character literal has no closing tag".to_string());
            '\0'
        },
        first => {
            let decoded = if first == '\\' {
                get_escape_sequence(it)
            } else {
                // Report before advancing so the error region is taken at
                // the offending character.
                let plain = if first.is_control() {
                    it.simple_error(format!("Control character '{}' should be escaped", first.escape_default()));
                    '\0'
                } else {
                    first
                };
                it.advance();
                plain
            };

            if it.curr == '\'' {
                it.advance();
                decoded
            } else {
                it.simple_error("Expected ' to close character literal".to_string());
                '\0'
            }
        }
    };

    // Taken after the literal was consumed so it spans the whole literal.
    let source = SourceRegion { start, end: it.index };

    LexerResult::Some(Token {
        data: TokenData::Number(Number::Character(value)),
        source
    })
}
/// Lexes a double-quoted string literal into a `TokenData::String` token.
///
/// Returns `LexerResult::None` when the current character is not `"`.
/// Escape sequences are decoded through `get_escape_sequence`. Unescaped
/// control characters are reported as errors and left out of the string. A
/// literal that reaches the end of input without a closing quote is
/// reported, and the text collected so far is still returned.
fn lex_string (it: &mut LexerIterator) -> LexerResult {
    if it.curr != '"' {
        return LexerResult::None;
    }

    let start = it.index;
    it.advance();

    let mut string = String::new();
    let mut closed = false;

    while it.valid() {
        let ch = it.curr;

        if ch == '\\' {
            // The escape helper consumes the whole sequence itself.
            string.push(get_escape_sequence(it));
            continue
        }

        it.advance();

        if ch == '"' {
            closed = true;
            break
        }

        if ch.is_control() {
            // NOTE(review): this is reported after advancing, whereas
            // lex_character reports before advancing — confirm the intended
            // error region.
            it.simple_error(format!("Control character '{}' should be escaped", ch.escape_default()));
        } else {
            string.push(ch);
        }
    }

    let source = SourceRegion { start, end: it.index };

    if !closed {
        it.error(source, "Unexpected end of input, string literal has no closing tag".to_string());
    }

    LexerResult::Some(Token {
        data: TokenData::String(string),
        source
    })
}
/// Lexes an operator symbol listed in `OPERATOR_TYPE_SYMS`.
///
/// Characters are consumed greedily for as long as the text collected so far,
/// extended by the current character, is still a prefix of some operator
/// symbol. Returns `LexerResult::None` when no character could be consumed.
///
/// # Panics
///
/// Panics when the consumed characters are a prefix of some operator symbol
/// but do not exactly equal any symbol in the table.
fn lex_operator (it: &mut LexerIterator) -> LexerResult {
    if !it.curr.is_ascii_graphic() {
        return LexerResult::None;
    }

    let start = it.index;
    let mut op_sym = OpSym::new();

    while it.valid() && it.curr.is_ascii_graphic() {
        let pos = op_sym.length;
        let candidate_char = it.curr;

        // Does any symbol continue with this character after the prefix
        // collected so far?
        let extends = OPERATOR_TYPE_SYMS.iter().any(|entry| {
            let sym_chars = entry.0;
            sym_chars.len() > pos
            && sym_chars[pos] == candidate_char
            && (pos == 0 || compare_strs(&op_sym.chars, sym_chars, pos))
        });

        if !extends {
            break
        }

        op_sym.chars[pos] = candidate_char;
        op_sym.length += 1;
        it.advance();
    }

    if op_sym.length == 0 {
        return LexerResult::None;
    }

    // The collected text must now equal one complete symbol.
    let matched = OPERATOR_TYPE_SYMS.iter()
        .find(|entry| {
            let sym_chars = entry.0;
            let sym_len = sym_chars.len();
            sym_len == op_sym.length && compare_strs(&op_sym.chars, sym_chars, sym_len)
        })
        .map(|entry| entry.1);

    let region = SourceRegion { start, end: it.index };

    match matched {
        Some(op_type) => LexerResult::Some(Token {
            data: TokenData::Operator(op_type),
            source: region
        }),
        None => {
            let source = it.source_ref.get_source().expect("Failed to get Source for match error");
            let (sl, sc) = source.get_line_and_column(region.start).expect("Failed to get start line and column for operator match error");
            let (el, ec) = source.get_line_and_column(region.end).expect("Failed to get end line and column for operator match error");
            panic!("Internal Error: Failed to find matching operator symbol for character series '{}' (at [{}:{}:{} to {}:{}])", op_sym, source.origin, sl + 1, sc + 1, el + 1, ec + 1);
        }
    }
}
/// Table of every lexlet defined in this module.
///
/// Each entry takes the lexer cursor and returns a `LexerResult`.
/// NOTE(review): the consumer of this table is not in this file; presumably
/// the entries are tried in order until one returns something other than
/// `LexerResult::None`, in which case the order matters (e.g. comments and
/// numbers are listed before operators) — confirm against the caller.
pub static LEXLETS: &[fn(&mut LexerIterator) -> LexerResult] = &[
lex_whitespace,
lex_comment,
lex_identifier,
lex_number,
lex_character,
lex_string,
lex_operator,
];