use std::cell::RefCell;
use std::rc::Rc;
use std::str::FromStr;
use conv::ValueFrom;
use crate::*;
use crate::util::CodePointsReader;
/// On-demand tokenizer over the text of a [`Source`].
///
/// Besides producing tokens, scanning records diagnostics, comments and
/// line start offsets on the shared `source`.
pub struct Tokenizer<'input> {
/// Source being tokenized; diagnostics, comments and line offsets are recorded on it.
pub source: Rc<Source>,
/// Line number at the cursor; starts at 1 and grows by one per consumed line terminator.
current_line_number: usize,
/// Reader over the code points of the source text.
code_points: CodePointsReader<'input>,
}
impl<'input> Tokenizer<'input> {
/// Creates a tokenizer positioned at the start of `source`, on line 1.
///
/// The source is flagged as tokenized so that it cannot be tokenized twice.
///
/// # Panics
///
/// Panics if `source` has already been handed to a tokenizer.
pub fn new(source: &'input Rc<Source>) -> Self {
    assert!(!source.already_tokenized.get(), "A Source must only be tokenized once.");
    source.already_tokenized.set(true);
    let input: &'input str = source.text.as_ref();
    Self {
        source: Rc::clone(source),
        current_line_number: 1,
        code_points: CodePointsReader::from(input),
    }
}
/// Scans the next token in a context where `/` starts a division operator
/// (`/` or `/=`) rather than a regular expression literal.
///
/// Whitespace, line terminators and comments are skipped first. Identifiers
/// (and reserved words), numeric literals (and `.`, `..`, `...`) and string
/// literals are delegated to their dedicated scanners; everything else is a
/// punctuator resolved by longest match.
///
/// At the end of input, `Token::Eof` is returned together with the
/// zero-width location of the cursor.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when a nested
/// scanner fails or when the next character cannot start any token.
pub fn scan_ie_div(&mut self) -> Result<(Token, Location), ParserFailure> {
    // Skip everything that does not contribute a token.
    loop {
        let ch = self.code_points.peek_or_zero();
        if character_validation::is_whitespace(ch) {
            self.code_points.next();
        } else if !(self.consume_line_terminator() || self.consume_comment()?) {
            break;
        }
    }

    if let Some(result) = self.scan_identifier()? {
        return Ok(result);
    }
    if let Some(result) = self.scan_dot_or_numeric_literal()? {
        return Ok(result);
    }
    if let Some(result) = self.scan_string_literal()? {
        return Ok(result);
    }

    let start = self.current_cursor_location();

    // No punctuator is longer than four code points (`>>>=`), so four code
    // points of lookahead are enough. Positions past the end read as '\0',
    // which never matches a punctuator character.
    let lookahead = (
        self.code_points.peek_or_zero(),
        self.code_points.peek_at_or_zero(1),
        self.code_points.peek_at_or_zero(2),
        self.code_points.peek_at_or_zero(3),
    );

    // Within each group, longer punctuators are listed before their prefixes.
    let (token, length) = match lookahead {
        (',', ..) => (Token::Comma, 1),
        ('(', ..) => (Token::LeftParen, 1),
        (')', ..) => (Token::RightParen, 1),
        ('[', ..) => (Token::LeftBracket, 1),
        (']', ..) => (Token::RightBracket, 1),
        ('{', ..) => (Token::LeftBrace, 1),
        ('}', ..) => (Token::RightBrace, 1),
        (';', ..) => (Token::Semicolon, 1),
        ('@', ..) => (Token::Attribute, 1),
        ('~', ..) => (Token::BitwiseNot, 1),

        (':', ':', ..) => (Token::ColonColon, 2),
        (':', ..) => (Token::Colon, 1),

        ('=', '>', ..) => (Token::FatArrow, 2),
        ('=', '=', '=', _) => (Token::StrictEquals, 3),
        ('=', '=', ..) => (Token::Equals, 2),
        ('=', ..) => (Token::Assign, 1),

        ('!', '=', '=', _) => (Token::StrictNotEquals, 3),
        ('!', '=', ..) => (Token::NotEquals, 2),
        ('!', ..) => (Token::Exclamation, 1),

        ('?', '.', ..) => (Token::OptionalChaining, 2),
        ('?', '?', '=', _) => (Token::NullCoalescingAssign, 3),
        ('?', '?', ..) => (Token::NullCoalescing, 2),
        ('?', ..) => (Token::Question, 1),

        ('<', '=', ..) => (Token::Le, 2),
        ('<', '<', '=', _) => (Token::LeftShiftAssign, 3),
        ('<', '<', ..) => (Token::LeftShift, 2),
        ('<', ..) => (Token::Lt, 1),

        ('>', '=', ..) => (Token::Ge, 2),
        ('>', '>', '=', _) => (Token::RightShiftAssign, 3),
        ('>', '>', '>', '=') => (Token::UnsignedRightShiftAssign, 4),
        ('>', '>', '>', _) => (Token::UnsignedRightShift, 3),
        // `>>` is the right shift operator; `><` must lex as `>` then `<`.
        ('>', '>', ..) => (Token::RightShift, 2),
        ('>', ..) => (Token::Gt, 1),

        ('+', '+', ..) => (Token::Increment, 2),
        ('+', '=', ..) => (Token::AddAssign, 2),
        ('+', ..) => (Token::Plus, 1),

        ('-', '-', ..) => (Token::Decrement, 2),
        ('-', '=', ..) => (Token::SubtractAssign, 2),
        ('-', ..) => (Token::Minus, 1),

        ('*', '*', '=', _) => (Token::PowerAssign, 3),
        ('*', '*', ..) => (Token::Power, 2),
        ('*', '=', ..) => (Token::MultiplyAssign, 2),
        ('*', ..) => (Token::Times, 1),

        ('/', '=', ..) => (Token::DivideAssign, 2),
        ('/', ..) => (Token::Div, 1),

        ('%', '=', ..) => (Token::RemainderAssign, 2),
        ('%', ..) => (Token::Remainder, 1),

        ('&', '&', '=', _) => (Token::LogicalAndAssign, 3),
        ('&', '&', ..) => (Token::LogicalAnd, 2),
        ('&', '=', ..) => (Token::BitwiseAndAssign, 2),
        ('&', ..) => (Token::BitwiseAnd, 1),

        ('^', '^', '=', _) => (Token::LogicalXorAssign, 3),
        ('^', '^', ..) => (Token::LogicalXor, 2),
        ('^', '=', ..) => (Token::BitwiseXorAssign, 2),
        ('^', ..) => (Token::BitwiseXor, 1),

        ('|', '|', '=', _) => (Token::LogicalOrAssign, 3),
        ('|', '|', ..) => (Token::LogicalOr, 2),
        ('|', '=', ..) => (Token::BitwiseOrAssign, 2),
        ('|', ..) => (Token::BitwiseOr, 1),

        _ => {
            // Either a character that cannot start a token, or end of input.
            if self.code_points.has_remaining() {
                self.add_unexpected_error();
                return Err(ParserFailure);
            }
            return Ok((Token::Eof, start));
        },
    };

    self.code_points.skip_count_in_place(length);
    let location = start.combine_with(self.current_cursor_location());
    Ok((token, location))
}
/// Scans the body and flags of a regular expression literal.
///
/// `start` is the location at which the literal begins; scanning reads the
/// body from the current cursor up to the closing `/`, so the opening
/// delimiter is presumably already consumed by the caller (verify against
/// the call site). Identifier-part characters directly after the closing
/// `/` are collected as flags.
///
/// A backslash and the character after it are copied verbatim into the
/// body, so an escaped `/` does not terminate the literal. An unescaped
/// line terminator is stored as `\n`; an escaped one is reported as an
/// error but scanning continues.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) if the input ends
/// before the closing `/`, or if scanning the flags fails.
pub fn scan_regexp_literal(&mut self, start: Location) -> Result<(Token, Location), ParserFailure> {
    let mut body = String::new();
    loop {
        let ch = self.code_points.peek_or_zero();
        if ch == '/' {
            self.code_points.next();
            break;
        } else if ch == '\\' {
            self.code_points.next();
            body.push('\\');
            let escaped = self.code_points.peek_or_zero();
            if self.code_points.reached_end() {
                self.add_unexpected_error();
                return Err(ParserFailure);
            }
            if character_validation::is_line_terminator(escaped) {
                self.add_unexpected_error();
                self.consume_line_terminator();
            } else {
                // The escaped character must be consumed here; otherwise the
                // next iteration sees it again, duplicating it in the body or,
                // for `\/`, ending the literal prematurely.
                self.code_points.next();
            }
            body.push(escaped);
        } else if character_validation::is_line_terminator(ch) {
            body.push('\n');
            self.consume_line_terminator();
        } else if self.code_points.reached_end() {
            self.add_unexpected_error();
            return Err(ParserFailure);
        } else {
            body.push(ch);
            self.code_points.next();
        }
    }

    let mut flags = String::new();
    while let Some((ch, _)) = self.consume_identifier_part()? {
        flags.push(ch);
    }

    let location = start.combine_with(self.current_cursor_location());
    Ok((Token::RegExpLiteral { body, flags }, location))
}
/// Returns the line number at the cursor (the first line is line 1).
pub fn current_line_number(&self) -> usize {
self.current_line_number
}
/// Builds the location used to point at the code point under the cursor.
///
/// The range starts at the cursor offset and ends at one past the reader
/// index obtained after advancing a copy of the reader by one code point.
fn current_character_ahead_location(&self) -> Location {
    let mut lookahead = self.code_points.clone();
    lookahead.next();
    let first_offset = self.code_points.index();
    let last_offset = lookahead.index() + 1;
    Location::with_line_and_offsets(&self.source, self.current_line_number, first_offset, last_offset)
}
/// Builds a location for the current cursor offset on the current line.
fn current_cursor_location(&self) -> Location {
    Location::with_line_and_offset(&self.source, self.current_line_number, self.code_points.index())
}
/// Reports a syntax error at the cursor.
///
/// While input remains, the diagnostic is `UnexpectedOrInvalidToken` and
/// points at the next code point; at the end of input it is `UnexpectedEnd`
/// at the cursor position.
fn add_unexpected_error(&self) {
    let (location, kind) = if self.code_points.has_remaining() {
        (self.current_character_ahead_location(), DiagnosticKind::UnexpectedOrInvalidToken)
    } else {
        (self.current_cursor_location(), DiagnosticKind::UnexpectedEnd)
    };
    self.source.add_diagnostic(Diagnostic::new_syntax_error(&location, kind, vec![]));
}
/// Consumes one line terminator at the cursor, if there is one.
///
/// A CR LF pair counts as a single terminator. On success the offset right
/// after the terminator is appended to the source's line offsets, the line
/// counter is incremented and `true` is returned; otherwise nothing is
/// consumed and `false` is returned.
fn consume_line_terminator(&mut self) -> bool {
    let ch = self.code_points.peek_or_zero();
    let is_crlf = ch == '\r' && self.code_points.peek_at_or_zero(1) == '\n';
    if is_crlf {
        self.code_points.skip_count_in_place(2);
    } else if character_validation::is_line_terminator(ch) {
        self.code_points.next();
    } else {
        return false;
    }
    self.source.line_number_offsets.borrow_mut().push(self.code_points.index());
    self.current_line_number += 1;
    true
}
/// Consumes a `//` or `/* */` comment at the cursor and records it on the source.
///
/// Returns `Ok(true)` when a comment was consumed and `Ok(false)` when the
/// cursor is not at a comment (nothing is consumed in that case). For a
/// single-line comment the line terminator that ends it is consumed as
/// well, but is not part of the comment's location. The stored content
/// excludes the comment delimiters.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when a multi-line
/// comment is not closed before the end of input.
fn consume_comment(&mut self) -> Result<bool, ParserFailure> {
    if self.code_points.peek_or_zero() != '/' {
        return Ok(false);
    }
    match self.code_points.peek_at_or_zero(1) {
        '/' => {
            let start = self.current_cursor_location();
            self.code_points.skip_count_in_place(2);
            // Run up to, but not over, the end of the line.
            while self.code_points.has_remaining()
                && !character_validation::is_line_terminator(self.code_points.peek_or_zero())
            {
                self.code_points.skip_in_place();
            }
            let location = start.combine_with(self.current_cursor_location());
            self.consume_line_terminator();
            let content = self.source.text[(location.first_offset() + 2)..location.last_offset()].to_owned();
            self.source.comments.borrow_mut().push(Rc::new(Comment {
                multiline: false,
                content: RefCell::new(content),
                location: RefCell::new(location),
            }));
            Ok(true)
        },
        '*' => {
            let start = self.current_cursor_location();
            self.code_points.skip_count_in_place(2);
            loop {
                let at_close = self.code_points.peek_or_zero() == '*'
                    && self.code_points.peek_at_or_zero(1) == '/';
                if at_close {
                    self.code_points.skip_count_in_place(2);
                    break;
                }
                if self.consume_line_terminator() {
                    continue;
                }
                if !self.code_points.has_remaining() {
                    self.add_unexpected_error();
                    return Err(ParserFailure);
                }
                self.code_points.skip_in_place();
            }
            let location = start.combine_with(self.current_cursor_location());
            let content = self.source.text[(location.first_offset() + 2)..(location.last_offset() - 2)].to_owned();
            self.source.comments.borrow_mut().push(Rc::new(Comment {
                multiline: true,
                content: RefCell::new(content),
                location: RefCell::new(location),
            }));
            Ok(true)
        },
        _ => Ok(false),
    }
}
/// Scans an identifier or reserved word starting at the cursor.
///
/// Returns `Ok(None)` without consuming anything when the cursor is not at
/// an identifier start. A name is only looked up as a reserved word when
/// none of its characters were written as escape sequences; otherwise it
/// is always an identifier.
fn scan_identifier(&mut self) -> Result<Option<(Token, Location)>, ParserFailure> {
    let start = self.current_cursor_location();
    let Some((first, mut has_escape)) = self.consume_identifier_start()? else {
        return Ok(None);
    };

    let mut name = String::from(first);
    while let Some((part, part_escaped)) = self.consume_identifier_part()? {
        has_escape |= part_escaped;
        name.push(part);
    }
    let location = start.combine_with(self.current_cursor_location());

    let reserved_word = if has_escape {
        None
    } else {
        keywords::reserved_word_token(name.as_ref())
    };
    match reserved_word {
        Some(token) => Ok(Some((token, location))),
        None => Ok(Some((Token::Identifier(name), location))),
    }
}
/// Consumes one identifier-start character, given literally or as a
/// backslash-introduced Unicode escape.
///
/// Returns the character and whether it was escaped, or `Ok(None)` when the
/// cursor is at neither.
fn consume_identifier_start(&mut self) -> Result<Option<(char, bool)>, ParserFailure> {
    match self.code_points.peek_or_zero() {
        ch if character_validation::is_identifier_start(ch) => {
            self.code_points.next();
            Ok(Some((ch, false)))
        },
        '\\' => {
            self.code_points.next();
            let decoded = self.expect_unicode_escape_sequence()?;
            Ok(Some((decoded, true)))
        },
        _ => Ok(None),
    }
}
/// Consumes one identifier-part character, given literally or as a
/// backslash-introduced Unicode escape.
///
/// Returns the character and whether it was escaped, or `Ok(None)` when the
/// cursor is at neither.
fn consume_identifier_part(&mut self) -> Result<Option<(char, bool)>, ParserFailure> {
    match self.code_points.peek_or_zero() {
        ch if character_validation::is_identifier_part(ch) => {
            self.code_points.next();
            Ok(Some((ch, false)))
        },
        '\\' => {
            self.code_points.next();
            let decoded = self.expect_unicode_escape_sequence()?;
            Ok(Some((decoded, true)))
        },
        _ => Ok(None),
    }
}
/// Parses a Unicode escape whose `u` is at the cursor (any introducing
/// backslash has already been consumed by the caller).
///
/// Two forms are accepted: `u` followed by exactly four hex digits, and
/// `u{…}` with hex digits between the braces.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the sequence
/// is malformed or does not denote a valid `char`.
fn expect_unicode_escape_sequence(&mut self) -> Result<char, ParserFailure> {
    let start = self.current_cursor_location();
    if self.code_points.peek_or_zero() != 'u' {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    self.code_points.next();

    // Fixed-width form: four hex digits, most significant first.
    if character_validation::is_hex_digit(self.code_points.peek_or_zero()) {
        let mut code: u32 = 0;
        for _ in 0..4 {
            code = (code << 4) | self.expect_hex_digit()?;
        }
        return match char::from_u32(code) {
            Some(decoded) => Ok(decoded),
            None => {
                let location = start.combine_with(self.current_cursor_location());
                self.source.add_diagnostic(Diagnostic::new_syntax_error(&location, DiagnosticKind::UnexpectedOrInvalidToken, vec![]));
                Err(ParserFailure)
            },
        };
    }

    // Braced form.
    if self.code_points.peek_or_zero() != '{' {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    self.code_points.next();
    while character_validation::is_hex_digit(self.code_points.peek_or_zero()) {
        self.code_points.next();
    }
    if self.code_points.peek_or_zero() != '}' {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    self.code_points.next();

    let location = start.combine_with(self.current_cursor_location());
    // Skip the leading `u{` and drop the trailing `}`.
    let digits = &self.source.text[(start.first_offset + 2)..(location.last_offset - 1)];
    let decoded = u32::from_str_radix(digits, 16).ok().and_then(char::from_u32);
    match decoded {
        Some(decoded) => Ok(decoded),
        None => {
            self.source.add_diagnostic(Diagnostic::new_syntax_error(&location, DiagnosticKind::UnexpectedOrInvalidToken, vec![]));
            Err(ParserFailure)
        },
    }
}
/// Consumes one hexadecimal digit and returns its numeric value.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the cursor
/// is not at a hex digit; nothing is consumed in that case.
fn expect_hex_digit(&mut self) -> Result<u32, ParserFailure> {
    let ch = self.code_points.peek_or_zero();
    if !character_validation::is_hex_digit(ch) {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    self.code_points.next();
    let value = match ch {
        'A'..='F' => (ch as u32) - ('A' as u32) + 10,
        'a'..='f' => (ch as u32) - ('a' as u32) + 10,
        _ => (ch as u32) - ('0' as u32),
    };
    Ok(value)
}
/// Scans a token that begins with `.` or with a decimal digit.
///
/// Produces `Token::Ellipsis` (`...`), `Token::Descendants` (`..`),
/// `Token::Dot`, or a `Token::NumericLiteral`. Literals starting with `0x`/`0X`
/// or `0b`/`0B` are delegated to the hex and binary scanners. Returns
/// `Ok(None)` without consuming anything when the cursor is at neither a `.`
/// nor a decimal digit.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) for a malformed
/// fraction, exponent or digit separator, or when the literal text cannot be
/// converted to `f64`.
fn scan_dot_or_numeric_literal(&mut self) -> Result<Option<(Token, Location)>, ParserFailure> {
let start = self.current_cursor_location();
let ch = self.code_points.peek_or_zero();
let mut initial_dot = false;
if ch == '.' {
initial_dot = true;
self.code_points.next();
// Two more dots after the consumed one make `...`.
let seq = self.code_points.peek_seq(2);
if seq == ".." {
self.code_points.skip_count_in_place(2);
return Ok(Some((Token::Ellipsis, start.combine_with(self.current_cursor_location()))));
}
// First character after the dot, or NUL when the one-byte prefix of `seq` is unavailable.
let ch = seq.get(..1).map(|ch| ch.chars().next().unwrap()).unwrap_or('\x00');
if ch == '.' {
self.code_points.next();
return Ok(Some((Token::Descendants, start.combine_with(self.current_cursor_location()))));
}
if !character_validation::is_dec_digit(ch) {
return Ok(Some((Token::Dot, start.combine_with(self.current_cursor_location()))));
}
// `.` directly followed by a digit: the fraction digits of a literal without an integer part.
while character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
self.code_points.next();
self.consume_underscore_followed_by_dec_digit()?;
}
} else if ch == '0' {
self.code_points.next();
let ch_2 = self.code_points.peek_or_zero();
if ['X', 'x'].contains(&ch_2) {
self.code_points.next();
return self.scan_hex_literal(start.clone());
}
if ['B', 'b'].contains(&ch_2) {
self.code_points.next();
return self.scan_bin_literal(start.clone());
}
// A plain leading `0`: no further integer digits are consumed here.
} else if character_validation::is_dec_digit(ch) {
while character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
self.code_points.next();
self.consume_underscore_followed_by_dec_digit()?;
}
} else {
return Ok(None);
}
// Optional fraction; skipped when the literal itself started with the dot.
if !initial_dot && self.code_points.peek_or_zero() == '.' {
self.code_points.next();
if !character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
self.add_unexpected_error();
return Err(ParserFailure);
}
while character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
self.code_points.next();
self.consume_underscore_followed_by_dec_digit()?;
}
}
// Optional exponent: `e`/`E`, an optional sign, then at least one digit.
if ['E', 'e'].contains(&self.code_points.peek_or_zero()) {
self.code_points.next();
if ['+', '-'].contains(&self.code_points.peek_or_zero()) {
self.code_points.next();
}
if !character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
self.add_unexpected_error();
return Err(ParserFailure);
}
while character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
self.code_points.next();
self.consume_underscore_followed_by_dec_digit()?;
}
}
// Reports (without failing) an identifier start directly after the literal.
self.unallow_numeric_suffix();
let location = start.combine_with(self.current_cursor_location());
// Digit separators are removed before handing the text to the float parser.
let string = self.source.text[location.first_offset..location.last_offset].to_owned().replace('_', "");
let Ok(v) = f64::from_str(&string) else {
self.source.add_diagnostic(Diagnostic::new_syntax_error(&location, DiagnosticKind::FailedProcessingNumericLiteral, vec![]));
return Err(ParserFailure);
};
Ok(Some((Token::NumericLiteral(v), location)))
}
/// Scans the digits of a hexadecimal literal whose two-character prefix has
/// already been consumed; `start` is the location of the literal's first
/// character.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when no hex digit
/// follows the prefix, a digit separator is malformed, or the value cannot
/// be converted to `f64`.
fn scan_hex_literal(&mut self, start: Location) -> Result<Option<(Token, Location)>, ParserFailure> {
    if !character_validation::is_hex_digit(self.code_points.peek_or_zero()) {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    while character_validation::is_hex_digit(self.code_points.peek_or_zero()) {
        self.code_points.next();
        self.consume_underscore_followed_by_hex_digit()?;
    }
    self.unallow_numeric_suffix();

    let location = start.combine_with(self.current_cursor_location());
    // Drop the two-character prefix and the digit separators.
    let digits = self.source.text[(location.first_offset + 2)..location.last_offset].replace('_', "");
    let value = u64::from_str_radix(&digits, 16)
        .map_err(|_| NumericRangeError)
        .and_then(|integer| f64::value_from(integer).map_err(|_| NumericRangeError));
    match value {
        Ok(value) => Ok(Some((Token::NumericLiteral(value), location))),
        Err(_) => {
            self.source.add_diagnostic(Diagnostic::new_syntax_error(&location, DiagnosticKind::FailedProcessingNumericLiteral, vec![]));
            Err(ParserFailure)
        },
    }
}
/// Scans the digits of a binary literal whose two-character prefix has
/// already been consumed; `start` is the location of the literal's first
/// character.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when no binary
/// digit follows the prefix, a digit separator is malformed, or the value
/// cannot be converted to `f64`.
fn scan_bin_literal(&mut self, start: Location) -> Result<Option<(Token, Location)>, ParserFailure> {
    if !character_validation::is_bin_digit(self.code_points.peek_or_zero()) {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    while character_validation::is_bin_digit(self.code_points.peek_or_zero()) {
        self.code_points.next();
        self.consume_underscore_followed_by_bin_digit()?;
    }
    self.unallow_numeric_suffix();

    let location = start.combine_with(self.current_cursor_location());
    // Drop the two-character prefix and the digit separators.
    let digits = self.source.text[(location.first_offset + 2)..location.last_offset].replace('_', "");
    let value = u64::from_str_radix(&digits, 2)
        .map_err(|_| NumericRangeError)
        .and_then(|integer| f64::value_from(integer).map_err(|_| NumericRangeError));
    match value {
        Ok(value) => Ok(Some((Token::NumericLiteral(value), location))),
        Err(_) => {
            self.source.add_diagnostic(Diagnostic::new_syntax_error(&location, DiagnosticKind::FailedProcessingNumericLiteral, vec![]));
            Err(ParserFailure)
        },
    }
}
/// Consumes a `_` digit separator at the cursor, requiring that a decimal
/// digit follows it. Does nothing when the cursor is not at `_`.
///
/// Only the separator is consumed. The digit after it is left for the
/// caller's digit loop, so that a separator may follow that digit directly
/// (as in `1_2_3`); consuming the digit here would end the caller's loop at
/// the second `_`.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the `_` is
/// not followed by a decimal digit.
fn consume_underscore_followed_by_dec_digit(&mut self) -> Result<(), ParserFailure> {
    if self.code_points.peek_or_zero() != '_' {
        return Ok(());
    }
    self.code_points.next();
    if !character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    Ok(())
}
/// Consumes a `_` digit separator at the cursor, requiring that a
/// hexadecimal digit follows it. Does nothing when the cursor is not at `_`.
///
/// Only the separator is consumed. The digit after it is left for the
/// caller's digit loop, so that a separator may follow that digit directly
/// (as in `0xA_B_C`); consuming the digit here would end the caller's loop
/// at the second `_`.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the `_` is
/// not followed by a hexadecimal digit.
fn consume_underscore_followed_by_hex_digit(&mut self) -> Result<(), ParserFailure> {
    if self.code_points.peek_or_zero() != '_' {
        return Ok(());
    }
    self.code_points.next();
    if !character_validation::is_hex_digit(self.code_points.peek_or_zero()) {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    Ok(())
}
/// Consumes a `_` digit separator at the cursor, requiring that a binary
/// digit follows it. Does nothing when the cursor is not at `_`.
///
/// Only the separator is consumed. The digit after it is left for the
/// caller's digit loop, so that a separator may follow that digit directly
/// (as in `0b1_0_1`); consuming the digit here would end the caller's loop
/// at the second `_`.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the `_` is
/// not followed by a binary digit.
fn consume_underscore_followed_by_bin_digit(&mut self) -> Result<(), ParserFailure> {
    if self.code_points.peek_or_zero() != '_' {
        return Ok(());
    }
    self.code_points.next();
    if !character_validation::is_bin_digit(self.code_points.peek_or_zero()) {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    Ok(())
}
/// Reports an error when an identifier-start character sits at the cursor,
/// i.e. directly after a numeric literal. Nothing is consumed and scanning
/// is not aborted.
fn unallow_numeric_suffix(&self) {
    let next = self.code_points.peek_or_zero();
    if !character_validation::is_identifier_start(next) {
        return;
    }
    self.add_unexpected_error();
}
/// Scans a string literal delimited by `"` or `'`.
///
/// Returns `Ok(None)` without consuming anything when the cursor is not at
/// a quote. Three consecutive delimiters open a triple string literal,
/// which is handed to `scan_triple_string_literal`. A raw line break inside
/// a regular literal is reported as `UnallowedLineBreak`, skipped, and not
/// added to the value.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the input
/// ends before the closing delimiter or an escape sequence is invalid.
fn scan_string_literal(&mut self) -> Result<Option<(Token, Location)>, ParserFailure> {
    let delim = self.code_points.peek_or_zero();
    if delim != '"' && delim != '\'' {
        return Ok(None);
    }
    let start = self.current_cursor_location();
    self.code_points.next();

    let opens_triple = self.code_points.peek_or_zero() == delim
        && self.code_points.peek_at_or_zero(1) == delim;
    if opens_triple {
        self.code_points.skip_count_in_place(2);
        return self.scan_triple_string_literal(delim, start);
    }

    let mut value = String::new();
    loop {
        if let Some(escaped) = self.consume_escape_sequence()? {
            value.push_str(&escaped);
            continue;
        }
        let ch = self.code_points.peek_or_zero();
        if ch == delim {
            self.code_points.next();
            break;
        }
        if character_validation::is_line_terminator(ch) {
            self.source.add_diagnostic(Diagnostic::new_syntax_error(&self.current_character_ahead_location(), DiagnosticKind::UnallowedLineBreak, vec![]));
            self.consume_line_terminator();
            continue;
        }
        if !self.code_points.has_remaining() {
            self.add_unexpected_error();
            return Err(ParserFailure);
        }
        value.push(ch);
        self.code_points.next();
    }

    let location = start.combine_with(self.current_cursor_location());
    Ok(Some((Token::StringLiteral(value), location)))
}
/// Scans the remainder of a triple-delimited string literal; the three opening
/// delimiters have already been consumed by `scan_string_literal`.
///
/// The content is split into lines. When the literal starts with a line break
/// right after the opening delimiter and spans more than one line, the final
/// line (the one holding the closing delimiter) provides a base indentation
/// that is stripped from every line. Lines are joined with `\n`.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the input ends
/// before the closing delimiter or an escape sequence is invalid.
fn scan_triple_string_literal(&mut self, delim: char, start: Location) -> Result<Option<(Token, Location)>, ParserFailure> {
let mut lines: Vec<String> = vec![];
let mut builder = String::new();
// A line break directly after the opening delimiter is not part of the value.
let initial_line_break = self.consume_line_terminator();
loop {
if let Some(s) = self.consume_escape_sequence()? {
builder.push_str(&s);
} else {
let ch = self.code_points.peek_or_zero();
if ch == delim && self.code_points.peek_at_or_zero(1) == delim && self.code_points.peek_at_or_zero(2) == delim {
self.code_points.skip_count_in_place(3);
lines.push(builder.clone());
break;
} else if character_validation::is_line_terminator(ch) {
// End of a content line: store it and start collecting the next one.
lines.push(builder.clone());
builder.clear();
self.consume_line_terminator();
} else if !self.code_points.has_remaining() {
self.add_unexpected_error();
return Err(ParserFailure);
} else {
builder.push(ch);
self.code_points.next();
}
}
}
let location = start.combine_with(self.current_cursor_location());
// The last line only sets the base indentation in the multi-line form
// that starts with a line break; otherwise there is no base indentation.
let last_line = if initial_line_break && lines.len() > 1 {
lines.pop().unwrap()
} else {
"".to_owned()
};
let base_indent = character_validation::indent_count(&last_line);
// Strip at most `base_indent` leading indentation from each line.
let mut lines: Vec<String> = lines.iter().map(|line| {
let indent = character_validation::indent_count(line);
line[usize::min(base_indent, indent)..].to_owned()
}).collect();
// Text on the last line after its indentation still belongs to the value.
let last_line = last_line[base_indent..].to_owned();
if !last_line.is_empty() {
lines.push(last_line);
}
let value = lines.join("\n");
Ok(Some((Token::StringLiteral(value), location)))
}
/// Consumes a backslash escape inside a string literal and returns the text
/// it stands for.
///
/// Returns `Ok(None)` without consuming anything when the cursor is not at
/// a backslash. A backslash followed by a line terminator is a line
/// continuation and yields an empty string. Any character without a
/// special meaning stands for itself; decimal digits in that position
/// (and a digit after `\0`) are reported as errors without aborting.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when the input
/// ends after the backslash or a `\x` / `\u` sequence is malformed.
fn consume_escape_sequence(&mut self) -> Result<Option<String>, ParserFailure> {
    if self.code_points.peek_or_zero() != '\\' {
        return Ok(None);
    }
    self.code_points.next();
    if !self.code_points.has_remaining() {
        self.add_unexpected_error();
        return Err(ParserFailure);
    }
    if self.consume_line_terminator() {
        return Ok(Some(String::new()));
    }

    let ch = self.code_points.peek_or_zero();
    let decoded = match ch {
        '\'' | '"' | '\\' => {
            self.code_points.next();
            ch
        },
        // The `u` itself is consumed by the Unicode escape parser.
        'u' => self.expect_unicode_escape_sequence()?,
        'x' => {
            self.code_points.next();
            let high = self.expect_hex_digit()?;
            let low = self.expect_hex_digit()?;
            char::from_u32((high << 4) | low).unwrap()
        },
        'b' | 'f' | 'n' | 'r' | 't' | 'v' => {
            self.code_points.next();
            match ch {
                'b' => '\x08',
                'f' => '\x0C',
                'n' => '\x0A',
                'r' => '\x0D',
                't' => '\x09',
                _ => '\x0B',
            }
        },
        '0' => {
            self.code_points.next();
            if character_validation::is_dec_digit(self.code_points.peek_or_zero()) {
                self.add_unexpected_error();
            }
            '\x00'
        },
        other => {
            if character_validation::is_dec_digit(other) {
                self.add_unexpected_error();
            }
            self.code_points.next();
            other
        },
    };
    Ok(Some(String::from(decoded)))
}
/// Scans one token inside an XML tag.
///
/// Produces an XML name, a run of XML whitespace, `=`, `>`, `/>`, a quoted
/// attribute value (returned without its quotes) or `{`.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) for any other
/// character, for a `/` not followed by `>`, and for an attribute value
/// that is not closed before the end of input.
pub fn scan_ie_xml_tag(&mut self) -> Result<(Token, Location), ParserFailure> {
    let start = self.current_cursor_location();
    let ch = self.code_points.peek_or_zero();

    if character_validation::is_xml_name_start(ch) {
        self.code_points.next();
        while character_validation::is_xml_name_part(self.code_points.peek_or_zero()) {
            self.code_points.next();
        }
        let location = start.combine_with(self.current_cursor_location());
        let name = self.source.text[location.first_offset..location.last_offset].to_owned();
        return Ok((Token::XmlName(name), location));
    }

    if character_validation::is_xml_whitespace(ch) {
        while character_validation::is_xml_whitespace(self.code_points.peek_or_zero()) {
            // Line terminators go through the dedicated path so that line
            // bookkeeping stays correct.
            if !self.consume_line_terminator() {
                self.code_points.next();
            }
        }
        let location = start.combine_with(self.current_cursor_location());
        return Ok((Token::XmlWhitespace, location));
    }

    let token = match ch {
        '=' => {
            self.code_points.next();
            Token::Assign
        },
        '>' => {
            self.code_points.next();
            Token::Gt
        },
        '{' => {
            self.code_points.next();
            Token::LeftBrace
        },
        '/' => {
            self.code_points.next();
            if self.code_points.peek_or_zero() != '>' {
                self.add_unexpected_error();
                return Err(ParserFailure);
            }
            self.code_points.next();
            Token::XmlSlashGt
        },
        '"' | '\'' => {
            self.code_points.next();
            while self.code_points.peek_or_zero() != ch && self.code_points.has_remaining() {
                if !self.consume_line_terminator() {
                    self.code_points.next();
                }
            }
            if self.code_points.reached_end() {
                self.add_unexpected_error();
                return Err(ParserFailure);
            }
            // Text between the quotes; the cursor is still on the closing quote.
            let value = self.source.text[(start.first_offset + 1)..self.current_cursor_location().first_offset].to_owned();
            self.code_points.next();
            Token::XmlAttributeValue(value)
        },
        _ => {
            self.add_unexpected_error();
            return Err(ParserFailure);
        },
    };

    let location = start.combine_with(self.current_cursor_location());
    Ok((token, location))
}
/// Scans one token of XML element content.
///
/// At `<` this yields XML markup (when `scan_xml_markup` recognizes it),
/// `</`, or a plain `<`. At `{` it yields a left brace. Anything else is
/// collected as XML text up to the next `<` or `{`.
///
/// # Errors
///
/// Returns `ParserFailure` (after reporting a diagnostic) when markup
/// scanning fails or the input ends while collecting text.
pub fn scan_ie_xml_content(&mut self) -> Result<(Token, Location), ParserFailure> {
    let start = self.current_cursor_location();
    match self.code_points.peek_or_zero() {
        '<' => {
            self.code_points.next();
            if let Some(markup) = self.scan_xml_markup(start.clone())? {
                return Ok(markup);
            }
            let token = if self.code_points.peek_or_zero() == '/' {
                self.code_points.next();
                Token::XmlLtSlash
            } else {
                Token::Lt
            };
            let location = start.combine_with(self.current_cursor_location());
            Ok((token, location))
        },
        '{' => {
            self.code_points.next();
            let location = start.combine_with(self.current_cursor_location());
            Ok((Token::LeftBrace, location))
        },
        _ => {
            loop {
                let ch = self.code_points.peek_or_zero();
                if ch == '<' || ch == '{' {
                    break;
                }
                if character_validation::is_line_terminator(ch) {
                    self.consume_line_terminator();
                } else if self.code_points.reached_end() {
                    self.add_unexpected_error();
                    return Err(ParserFailure);
                } else {
                    self.code_points.next();
                }
            }
            let location = start.combine_with(self.current_cursor_location());
            let text = self.source.text[location.first_offset..location.last_offset].to_owned();
            Ok((Token::XmlText(text), location))
        },
    }
}
/// Tries to scan an XML comment, CDATA section, or processing instruction.
///
/// The cursor is expected to sit just past the opening `<` (that is how
/// `scan_ie_xml_content` invokes it); `start` is the location at which the
/// resulting token should begin.
///
/// Recognised forms, judged by what follows the cursor:
/// - `!--` … `-->`
/// - `![CDATA[` … `]]>`
/// - `?` … `?>`
///
/// Returns `Ok(Some((Token::XmlMarkup(text), location)))` on a match, where
/// `text` is the source slice covered by `location`. Returns `Ok(None)` without
/// consuming anything when none of the openers is present.
///
/// # Errors
///
/// Calls `add_unexpected_error` and returns `ParserFailure` if the input ends
/// before the matching closing delimiter is found.
pub fn scan_xml_markup(&mut self, start: Location) -> Result<Option<(Token, Location)>, ParserFailure> {
    if self.code_points.peek_seq(3) == "!--" {
        // Comment body: runs until `-->`.
        self.code_points.skip_count_in_place(3);
        loop {
            match self.code_points.peek_or_zero() {
                '-' if self.code_points.peek_seq(3) == "-->" => {
                    self.code_points.skip_count_in_place(3);
                    break;
                }
                c if character_validation::is_line_terminator(c) => {
                    self.consume_line_terminator();
                }
                _ if self.code_points.reached_end() => {
                    self.add_unexpected_error();
                    return Err(ParserFailure);
                }
                _ => {
                    self.code_points.next();
                }
            }
        }
    } else if self.code_points.peek_seq(8) == "![CDATA[" {
        // CDATA body: runs until `]]>`.
        self.code_points.skip_count_in_place(8);
        loop {
            match self.code_points.peek_or_zero() {
                ']' if self.code_points.peek_seq(3) == "]]>" => {
                    self.code_points.skip_count_in_place(3);
                    break;
                }
                c if character_validation::is_line_terminator(c) => {
                    self.consume_line_terminator();
                }
                _ if self.code_points.reached_end() => {
                    self.add_unexpected_error();
                    return Err(ParserFailure);
                }
                _ => {
                    self.code_points.next();
                }
            }
        }
    } else if self.code_points.peek_or_zero() == '?' {
        // Processing instruction body: runs until `?>`.
        self.code_points.next();
        loop {
            match self.code_points.peek_or_zero() {
                '?' if self.code_points.peek_at_or_zero(1) == '>' => {
                    self.code_points.skip_count_in_place(2);
                    break;
                }
                c if character_validation::is_line_terminator(c) => {
                    self.consume_line_terminator();
                }
                _ if self.code_points.reached_end() => {
                    self.add_unexpected_error();
                    return Err(ParserFailure);
                }
                _ => {
                    self.code_points.next();
                }
            }
        }
    } else {
        // Not markup; leave the cursor untouched for the caller.
        return Ok(None);
    }

    // Shared tail: every recognised form produces the same token kind.
    let location = start.combine_with(self.current_cursor_location());
    let content = self.source.text[location.first_offset..location.last_offset].to_owned();
    Ok(Some((Token::XmlMarkup(content), location)))
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: continuation lines of the multi-line string literals below start at
    // column 0 on purpose; any leading whitespace would become part of the
    // tokenizer input.

    /// `n * n` tokenizes as identifier, `*`, identifier.
    #[test]
    fn tokenize_n_per_n() {
        let source = Source::new(None, "n * n".into(), &CompilerOptions::new());
        let mut tokenizer = Tokenizer::new(&source);
        let Ok((Token::Identifier(name), _)) = tokenizer.scan_ie_div() else {
            panic!("expected an identifier as the first token")
        };
        assert_eq!(name, "n");
        assert!(matches!(tokenizer.scan_ie_div(), Ok((Token::Times, _))));
        let Ok((Token::Identifier(name), _)) = tokenizer.scan_ie_div() else {
            panic!("expected an identifier as the third token")
        };
        assert_eq!(name, "n");
    }

    /// Comments produce no tokens of their own: the first token is `Eof`, and
    /// both comments are recorded on the source with their delimiters removed.
    #[test]
    fn tokenize_comments() {
        let source = Source::new(None, "
// Single-line comment
/* Multi-line comment */
".into(), &CompilerOptions::new());
        let mut tokenizer = Tokenizer::new(&source);
        assert!(matches!(tokenizer.scan_ie_div(), Ok((Token::Eof, _))));
        assert_eq!(source.comments()[0].content(), " Single-line comment");
        assert_eq!(source.comments()[1].content(), " Multi-line comment ");
    }

    /// The escapes `\u{41}`, `\u0041` and `\x41` each decode to `A`, and a
    /// triple-quoted literal spanning several lines scans as one string token.
    #[test]
    fn tokenize_strings() {
        let source = Source::new(None, r###"
"Some \u{41}\u0041\x41 content"
"""
Another
common
content
"""
"###.into(), &CompilerOptions::new());
        let mut tokenizer = Tokenizer::new(&source);
        let Ok((Token::StringLiteral(s), _)) = tokenizer.scan_ie_div() else {
            panic!("expected a string literal for the escaped string")
        };
        assert_eq!(s, "Some AAA content");
        let Ok((Token::StringLiteral(s), _)) = tokenizer.scan_ie_div() else {
            panic!("expected a string literal for the triple-quoted string")
        };
        assert_eq!(s, "Another\n common\ncontent");
    }

    /// Numeric literals in decimal, underscore-separated, fractional,
    /// leading-dot, exponent, hexadecimal and binary notation scan to the
    /// expected `f64` values, in source order.
    #[test]
    fn tokenize_numbers() {
        let numbers: Vec<f64> = vec![
            0.0,
            50.0,
            1_000.0,
            0.5,
            0.5,
            1_000.0,
            1_000.0,
            0.001,
            0.0,
            0.0,
        ];
        let source = Source::new(None, r###"
0
50
1_000
0.5
.5
1e3
1e+3
1e-3
0x00_00
0b0000_0000
"###.into(), &CompilerOptions::new());
        let mut tokenizer = Tokenizer::new(&source);
        for expected in numbers {
            let Ok((Token::NumericLiteral(actual), _)) = tokenizer.scan_ie_div() else {
                panic!("expected a numeric literal for {expected}")
            };
            assert_eq!(expected, actual);
        }
    }

    /// A leading `/` is first scanned as `Div`; passing that token's location to
    /// `scan_regexp_literal` then yields the regular expression body and flags.
    #[test]
    fn tokenize_regexp() {
        let source = Source::new(None, r###"
/(?:)/
/(?:)/gi
"###.into(), &CompilerOptions::new());
        let mut tokenizer = Tokenizer::new(&source);

        let Ok((Token::Div, start)) = tokenizer.scan_ie_div() else {
            panic!("expected `/` to open the first regular expression")
        };
        let Ok((Token::RegExpLiteral { body, flags }, _)) = tokenizer.scan_regexp_literal(start) else {
            panic!("expected a regular expression literal without flags")
        };
        assert_eq!(body, "(?:)");
        assert_eq!(flags, "");

        let Ok((Token::Div, start)) = tokenizer.scan_ie_div() else {
            panic!("expected `/` to open the second regular expression")
        };
        let Ok((Token::RegExpLiteral { body, flags }, _)) = tokenizer.scan_regexp_literal(start) else {
            panic!("expected a regular expression literal with flags")
        };
        assert_eq!(body, "(?:)");
        assert_eq!(flags, "gi");
    }
}