use crate::char::CharFilter;
use crate::char::DIGIT;
use crate::char::DIGIT_BIN;
use crate::char::DIGIT_HEX;
use crate::char::DIGIT_OCT;
use crate::char::ID_CONTINUE;
use crate::char::ID_START_CHARSTR;
use crate::char::WHITESPACE;
use crate::error::SyntaxError;
use crate::error::SyntaxErrorType;
use crate::error::SyntaxResult;
use crate::source::SourceRange;
use crate::token::Token;
use crate::token::TokenType;
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use aho_corasick::MatchKind;
use core::ops::Index;
use lazy_static::lazy_static;
use memchr::memchr;
use memchr::memchr2;
use memchr::memchr3;
use std::collections::HashMap;
/// A saved lexer position; restore it later with `Lexer::apply_checkpoint`.
#[derive(Copy, Clone)]
pub struct LexerCheckpoint {
// Byte offset of the next unconsumed byte at the time of the checkpoint.
next: usize,
}
/// A span of `len` bytes located at the lexer's current position
/// (or, after `Lexer::consume`, the bytes just before the current position).
#[derive(Copy, Clone)]
struct Match {
// Number of bytes covered by the match; may be zero.
len: usize,
}
impl Match {
  /// Number of bytes covered by this match.
  pub fn len(&self) -> usize {
    self.len
  }

  /// True when the match covers zero bytes.
  pub fn is_empty(&self) -> bool {
    self.len() == 0
  }
}
/// Result of an Aho-Corasick search: which pattern matched, and the match span.
#[derive(Copy, Clone)]
struct AhoCorasickMatch {
// Index of the matched pattern in the list the automaton was built from.
id: usize,
// Span from the search start through the end of the found pattern.
mat: Match,
}
/// Streaming lexer over a byte slice of source code.
pub struct Lexer<'a> {
// Entire source being lexed.
source: &'a [u8],
// Byte offset of the next unconsumed byte.
next: usize,
}
#[allow(dead_code)]
impl<'a> Lexer<'a> {
  /// Creates a lexer positioned at the first byte of `code`.
  pub fn new(code: &'a [u8]) -> Lexer<'a> {
    Lexer {
      source: code,
      next: 0,
    }
  }

  /// Exclusive end offset, i.e. the total source length.
  fn end(&self) -> usize {
    self.source.len()
  }

  /// Number of bytes left to consume.
  fn remaining(&self) -> usize {
    self.end() - self.next
  }

  /// Range covering the whole source.
  pub fn source_range(&self) -> SourceRange {
    SourceRange::new(0, self.end())
  }

  /// Empty range located at end-of-input.
  fn eof_range(&self) -> SourceRange {
    SourceRange::new(self.end(), self.end())
  }

  /// Builds a syntax error of type `typ` spanning from the current
  /// position to the end of the source.
  fn error(&self, typ: SyntaxErrorType) -> SyntaxError {
    SyntaxError::new(typ, SourceRange::new(self.next, self.end()), None)
  }

  /// True once every byte has been consumed.
  fn at_end(&self) -> bool {
    self.next >= self.end()
  }

  /// Byte at offset `n` from the current position, or an `UnexpectedEnd`
  /// error if that offset is past the end.
  fn peek(&self, n: usize) -> SyntaxResult<u8> {
    self
      .peek_or_eof(n)
      .ok_or_else(|| self.error(SyntaxErrorType::UnexpectedEnd))
  }

  /// Byte at offset `n` from the current position, or `None` past the end.
  fn peek_or_eof(&self, n: usize) -> Option<u8> {
    // `.copied()` is the idiomatic form of `.map(|&c| c)`.
    self.source.get(self.next + n).copied()
  }

  /// Captures the current position so it can be restored later.
  pub fn checkpoint(&self) -> LexerCheckpoint {
    LexerCheckpoint { next: self.next }
  }

  /// Range from `checkpoint` (inclusive) to the current position (exclusive).
  pub fn since_checkpoint(&self, checkpoint: LexerCheckpoint) -> SourceRange {
    SourceRange::new(checkpoint.next, self.next)
  }

  /// Rewinds (or advances) to a previously captured position.
  pub fn apply_checkpoint(&mut self, checkpoint: LexerCheckpoint) {
    self.next = checkpoint.next;
  }

  /// A match of exactly `n` bytes at the current position, or an
  /// `UnexpectedEnd` error if fewer than `n` bytes remain.
  fn n(&self, n: usize) -> SyntaxResult<Match> {
    if self.next + n > self.end() {
      return Err(self.error(SyntaxErrorType::UnexpectedEnd));
    }
    Ok(Match { len: n })
  }

  /// A 1-byte match if the next byte is `c`, otherwise an empty match.
  fn if_char(&self, c: u8) -> Match {
    Match {
      len: (!self.at_end() && self.source[self.next] == c) as usize,
    }
  }

  /// Match up to and including the next occurrence of `c`, or all
  /// remaining bytes if `c` does not occur again.
  fn through_char_or_end(&self, c: u8) -> Match {
    memchr(c, &self.source[self.next..])
      .map(|pos| Match { len: pos + 1 })
      .unwrap_or_else(|| Match {
        len: self.remaining(),
      })
  }

  /// Match up to and including the next occurrence of `c`, or an
  /// `UnexpectedEnd` error if `c` does not occur again.
  fn through_char(&self, c: u8) -> SyntaxResult<Match> {
    memchr(c, &self.source[self.next..])
      .map(|pos| Match { len: pos + 1 })
      .ok_or_else(|| self.error(SyntaxErrorType::UnexpectedEnd))
  }

  /// Match all bytes up to (but excluding) the next `a`, or to the end.
  fn while_not_char(&self, a: u8) -> Match {
    Match {
      len: memchr(a, &self.source[self.next..]).unwrap_or(self.remaining()),
    }
  }

  /// Match all bytes up to (but excluding) the next `a` or `b`, or to the end.
  fn while_not_2_chars(&self, a: u8, b: u8) -> Match {
    Match {
      len: memchr2(a, b, &self.source[self.next..]).unwrap_or(self.remaining()),
    }
  }

  /// Match all bytes up to (but excluding) the next `a`, `b`, or `c`, or to the end.
  fn while_not_3_chars(&self, a: u8, b: u8, c: u8) -> Match {
    Match {
      len: memchr3(a, b, c, &self.source[self.next..]).unwrap_or(self.remaining()),
    }
  }

  /// Match the longest run of bytes accepted by `chars`.
  fn while_chars(&self, chars: &CharFilter) -> Match {
    let mut len = 0;
    while len < self.remaining() && chars.has(self.source[self.next + len]) {
      len += 1;
    }
    Match { len }
  }

  /// Searches the remaining source with `ac`; the returned match spans from
  /// the current position through the end of the found pattern. Errors with
  /// `ExpectedNotFound` when no pattern matches.
  fn aho_corasick(&self, ac: &AhoCorasick) -> SyntaxResult<AhoCorasickMatch> {
    ac.find(&self.source[self.next..])
      .map(|m| AhoCorasickMatch {
        id: m.pattern(),
        mat: Match { len: m.end() },
      })
      .ok_or_else(|| self.error(SyntaxErrorType::ExpectedNotFound))
  }

  /// Source range that `m` would cover if consumed now.
  fn range(&self, m: Match) -> SourceRange {
    SourceRange::new(self.next, self.next + m.len)
  }

  /// Advances past `m`, returning it for chaining convenience.
  fn consume(&mut self, m: Match) -> Match {
    self.next += m.len;
    m
  }

  /// Consumes and returns the next byte, or `UnexpectedEnd` at EOF.
  fn consume_next(&mut self) -> SyntaxResult<u8> {
    let c = self.peek(0)?;
    self.next += 1;
    Ok(c)
  }

  /// Advances `n` bytes that the caller already knows exist (debug-checked).
  fn skip_expect(&mut self, n: usize) {
    debug_assert!(self.next + n <= self.end());
    self.next += n;
  }
}
/// Indexing a `Lexer` by a `SourceRange` yields the source bytes it spans.
impl<'a> Index<SourceRange> for Lexer<'a> {
type Output = [u8];
fn index(&self, index: SourceRange) -> &Self::Output {
&self.source[index.start()..index.end()]
}
}
/// Indexing a `Lexer` by a `Match` yields the `len` bytes immediately before
/// the current position, i.e. the bytes of a match that was just consumed.
impl<'a> Index<Match> for Lexer<'a> {
type Output = [u8];
fn index(&self, index: Match) -> &Self::Output {
&self.source[self.next - index.len..self.next]
}
}
lazy_static! {
  /// Canonical source text for every operator / punctuation token type.
  pub static ref OPERATORS_MAPPING: HashMap<TokenType, &'static [u8]> = {
    let mut map = HashMap::<TokenType, &'static [u8]>::new();
    map.insert(TokenType::Ampersand, b"&");
    map.insert(TokenType::AmpersandAmpersand, b"&&");
    map.insert(TokenType::AmpersandAmpersandEquals, b"&&=");
    map.insert(TokenType::AmpersandEquals, b"&=");
    map.insert(TokenType::Asterisk, b"*");
    map.insert(TokenType::AsteriskAsterisk, b"**");
    map.insert(TokenType::AsteriskAsteriskEquals, b"**=");
    map.insert(TokenType::AsteriskEquals, b"*=");
    map.insert(TokenType::Bar, b"|");
    map.insert(TokenType::BarBar, b"||");
    map.insert(TokenType::BarBarEquals, b"||=");
    map.insert(TokenType::BarEquals, b"|=");
    map.insert(TokenType::BraceClose, b"}");
    map.insert(TokenType::BraceOpen, b"{");
    map.insert(TokenType::BracketClose, b"]");
    map.insert(TokenType::BracketOpen, b"[");
    map.insert(TokenType::Caret, b"^");
    map.insert(TokenType::CaretEquals, b"^=");
    map.insert(TokenType::ChevronLeft, b"<");
    map.insert(TokenType::ChevronLeftChevronLeft, b"<<");
    map.insert(TokenType::ChevronLeftChevronLeftEquals, b"<<=");
    map.insert(TokenType::ChevronLeftEquals, b"<=");
    map.insert(TokenType::ChevronRight, b">");
    map.insert(TokenType::ChevronRightChevronRight, b">>");
    map.insert(TokenType::ChevronRightChevronRightChevronRight, b">>>");
    map.insert(TokenType::ChevronRightChevronRightChevronRightEquals, b">>>=");
    map.insert(TokenType::ChevronRightChevronRightEquals, b">>=");
    map.insert(TokenType::ChevronRightEquals, b">=");
    map.insert(TokenType::Colon, b":");
    map.insert(TokenType::ColonColon, b"::");
    map.insert(TokenType::Comma, b",");
    map.insert(TokenType::Dot, b".");
    map.insert(TokenType::DotDotDot, b"...");
    map.insert(TokenType::Equals, b"=");
    map.insert(TokenType::EqualsEquals, b"==");
    map.insert(TokenType::Exclamation, b"!");
    map.insert(TokenType::ExclamationEquals, b"!=");
    map.insert(TokenType::Hyphen, b"-");
    map.insert(TokenType::HyphenChevronRight, b"->");
    map.insert(TokenType::HyphenEquals, b"-=");
    map.insert(TokenType::ParenthesisClose, b")");
    map.insert(TokenType::ParenthesisOpen, b"(");
    map.insert(TokenType::Percent, b"%");
    map.insert(TokenType::PercentEquals, b"%=");
    map.insert(TokenType::Plus, b"+");
    map.insert(TokenType::PlusEquals, b"+=");
    map.insert(TokenType::QuestionDot, b"?.");
    map.insert(TokenType::QuestionBracketOpen, b"?[");
    map.insert(TokenType::QuestionParenthesisOpen, b"?(");
    map.insert(TokenType::QuestionQuestion, b"??");
    map.insert(TokenType::QuestionQuestionEquals, b"??=");
    map.insert(TokenType::Semicolon, b";");
    map.insert(TokenType::Slash, b"/");
    map.insert(TokenType::SlashEquals, b"/=");
    map
  };
  /// Canonical source text for every keyword and word-like literal token type.
  pub static ref KEYWORDS_MAPPING: HashMap<TokenType, &'static [u8]> = {
    let mut map = HashMap::<TokenType, &'static [u8]>::new();
    map.insert(TokenType::KeywordAs, b"as");
    map.insert(TokenType::KeywordBreak, b"break");
    map.insert(TokenType::KeywordContinue, b"continue");
    map.insert(TokenType::KeywordCrate, b"crate");
    map.insert(TokenType::KeywordElse, b"else");
    map.insert(TokenType::KeywordFor, b"for");
    map.insert(TokenType::KeywordIf, b"if");
    map.insert(TokenType::KeywordImpl, b"impl");
    map.insert(TokenType::KeywordIn, b"in");
    map.insert(TokenType::KeywordLet, b"let");
    map.insert(TokenType::KeywordLoop, b"loop");
    map.insert(TokenType::KeywordMut, b"mut");
    map.insert(TokenType::KeywordPub, b"pub");
    map.insert(TokenType::KeywordReturn, b"return");
    map.insert(TokenType::KeywordStruct, b"struct");
    map.insert(TokenType::LiteralFalse, b"false");
    map.insert(TokenType::LiteralNone, b"None");
    map.insert(TokenType::LiteralTrue, b"true");
    map
  };
  /// Keyword strings mapped to an index.
  /// NOTE(review): the `usize` values come from enumerating a `HashMap`'s
  /// values, whose iteration order is unspecified, so they are not stable
  /// across runs or builds. Presumably only key membership is relied on;
  /// confirm against callers before using the indices for anything.
  pub static ref KEYWORD_STRS: HashMap<&'static [u8], usize> = {
    HashMap::<&'static [u8], usize>::from_iter(KEYWORDS_MAPPING.values().enumerate().map(|(i, v)| (*v, i)))
  };
  /// Every lexable pattern; `MATCHER` pattern ids are indices into this Vec.
  static ref PATTERNS: Vec<(TokenType, &'static [u8])> = {
    let mut patterns: Vec<(TokenType, &'static [u8])> = Vec::new();
    for (&k, &v) in OPERATORS_MAPPING.iter() {
      patterns.push((k, v));
    }
    for (&k, &v) in KEYWORDS_MAPPING.iter() {
      // Push `v` directly (was `&v`, which only compiled via deref coercion
      // and was inconsistent with the operators loop above).
      patterns.push((k, v));
    }
    patterns.push((TokenType::ChevronLeftSlash, b"</"));
    patterns.push((TokenType::CommentMultiple, b"/*"));
    patterns.push((TokenType::CommentSingle, b"//"));
    // Each identifier-start byte becomes its own 1-byte pattern; the actual
    // identifier is then lexed by `lex_identifier`.
    for c in ID_START_CHARSTR.chunks(1) {
      patterns.push((TokenType::Identifier, c));
    }
    // Likewise each decimal digit starts a number lexed by `lex_number_dec`.
    for c in b"0123456789".chunks(1) {
      patterns.push((TokenType::_LiteralNumberDec, c));
    }
    // Radix prefixes; leftmost-longest matching prefers these over a bare `0`.
    patterns.push((TokenType::_LiteralNumberBin, b"0b"));
    patterns.push((TokenType::_LiteralNumberBin, b"0B"));
    patterns.push((TokenType::_LiteralNumberHex, b"0x"));
    patterns.push((TokenType::_LiteralNumberHex, b"0X"));
    patterns.push((TokenType::_LiteralNumberOct, b"0o"));
    patterns.push((TokenType::_LiteralNumberOct, b"0O"));
    patterns.push((TokenType::LiteralTemplatePartString, b"`"));
    patterns
  };
  /// Anchored, leftmost-longest DFA over all of `PATTERNS`; reported pattern
  /// ids align with `PATTERNS` indices.
  static ref MATCHER: AhoCorasick = AhoCorasickBuilder::new()
    .anchored(true)
    .dfa(true)
    .match_kind(MatchKind::LeftmostLongest)
    .build(PATTERNS.iter().map(|(_, pat)| pat));
  /// Finds the `*/` terminator of multi-line comments.
  static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&[b"*/"]);
}
/// Skips a `/* ... */` comment; errors if the terminator is missing.
fn lex_multiple_comment<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<()> {
  // Step over the opening `/*`.
  lexer.skip_expect(2);
  // Advance through (and past) the terminating `*/`.
  let end = lexer.aho_corasick(&COMMENT_END)?;
  lexer.consume(end.mat);
  Ok(())
}
/// Skips a `// ...` comment through the end of the line (or of the input).
fn lex_single_comment<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<()> {
  // Step over the opening `//`.
  lexer.skip_expect(2);
  // Consume through the newline, or everything left if there is none.
  let rest = lexer.through_char_or_end(b'\n');
  lexer.consume(rest);
  Ok(())
}
/// Lexes an identifier starting at the current position.
///
/// The first byte is assumed to already be a valid identifier start (the
/// matcher guarantees this before dispatching here). Any non-ASCII byte is
/// accepted as an identifier-continue byte.
fn lex_identifier<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // Accept the (already validated) first byte unconditionally.
  lexer.skip_expect(1);
  loop {
    lexer.consume(lexer.while_chars(&ID_CONTINUE));
    match lexer.peek_or_eof(0) {
      // Non-ASCII bytes continue the identifier.
      Some(c) if !c.is_ascii() => lexer.skip_expect(1),
      // EOF or an ASCII byte outside ID_CONTINUE ends it.
      _ => break,
    }
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::Identifier,
  ))
}
/// Lexes a decimal number: digits, optional `n` big-int suffix, or optional
/// fractional part and `e`/`E` exponent with optional sign.
fn lex_number_dec<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // Integer part.
  lexer.consume(lexer.while_chars(&DIGIT));
  // A `n` suffix makes this a big-integer literal and ends it immediately.
  if !lexer.consume(lexer.if_char(b'n')).is_empty() {
    return Ok(Token::new(
      lexer.since_checkpoint(cp),
      TokenType::LiteralBigInt,
    ));
  }
  // Optional fractional part.
  lexer.consume(lexer.if_char(b'.'));
  lexer.consume(lexer.while_chars(&DIGIT));
  // Optional exponent: `e`/`E`, an optional sign, then digits.
  if matches!(lexer.peek_or_eof(0), Some(b'e') | Some(b'E')) {
    lexer.skip_expect(1);
    // NOTE: errors with UnexpectedEnd if the source ends right after the `e`.
    if let b'+' | b'-' = lexer.peek(0)? {
      lexer.skip_expect(1);
    }
    lexer.consume(lexer.while_chars(&DIGIT));
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::LiteralInt,
  ))
}
/// Lexes a binary literal: `0b`/`0B` prefix, binary digits, optional `n` suffix.
fn lex_number_bin<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // Step over the `0b`/`0B` prefix already matched by the dispatcher.
  lexer.skip_expect(2);
  lexer.consume(lexer.while_chars(&DIGIT_BIN));
  // A `n` suffix makes this a big-integer literal.
  let typ = if lexer.consume(lexer.if_char(b'n')).is_empty() {
    TokenType::LiteralInt
  } else {
    TokenType::LiteralBigInt
  };
  Ok(Token::new(lexer.since_checkpoint(cp), typ))
}
/// Lexes a hexadecimal literal: `0x`/`0X` prefix, hex digits, optional `n` suffix.
fn lex_number_hex<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // Step over the `0x`/`0X` prefix already matched by the dispatcher.
  lexer.skip_expect(2);
  lexer.consume(lexer.while_chars(&DIGIT_HEX));
  // A `n` suffix makes this a big-integer literal.
  let typ = if lexer.consume(lexer.if_char(b'n')).is_empty() {
    TokenType::LiteralInt
  } else {
    TokenType::LiteralBigInt
  };
  Ok(Token::new(lexer.since_checkpoint(cp), typ))
}
/// Lexes an octal literal: `0o`/`0O` prefix, octal digits, optional `n` suffix.
fn lex_number_oct<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // Step over the `0o`/`0O` prefix already matched by the dispatcher.
  lexer.skip_expect(2);
  lexer.consume(lexer.while_chars(&DIGIT_OCT));
  // A `n` suffix makes this a big-integer literal.
  let typ = if lexer.consume(lexer.if_char(b'n')).is_empty() {
    TokenType::LiteralInt
  } else {
    TokenType::LiteralBigInt
  };
  Ok(Token::new(lexer.since_checkpoint(cp), typ))
}
/// Lexes one string part of a template literal, starting just inside it.
///
/// Stops at a closing backtick (consumed, not included in the token range;
/// returns `LiteralTemplatePartStringEnd`) or at a `${` substitution opener
/// (consumed, not included; returns `LiteralTemplatePartString`). `\` escapes
/// the following byte. Errors with `UnexpectedEnd` on an unterminated part.
pub fn lex_template_string_continue<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  loop {
    // Skip everything that cannot end or alter the string part.
    lexer.consume(lexer.while_not_3_chars(b'\\', b'`', b'$'));
    match lexer.peek(0)? {
      b'\\' => {
        // Backslash escapes the next byte; take both (errors if `\` is last).
        lexer.consume(lexer.n(2)?);
      }
      b'`' => {
        // Closing backtick: record the range before consuming it.
        let loc = lexer.since_checkpoint(cp);
        lexer.skip_expect(1);
        return Ok(Token::new(loc, TokenType::LiteralTemplatePartStringEnd));
      }
      b'$' => {
        if lexer.peek(1)? == b'{' {
          // `${` begins a substitution: record the range, then skip both bytes.
          let loc = lexer.since_checkpoint(cp);
          lexer.skip_expect(2);
          return Ok(Token::new(loc, TokenType::LiteralTemplatePartString));
        }
        // A lone `$` is ordinary content.
        lexer.skip_expect(1);
      }
      // while_not_3_chars only stops at the three bytes handled above.
      _ => unreachable!(),
    };
  }
}
/// Lexes a template literal from its opening backtick.
fn lex_template<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
// Skip the opening backtick; the rest is shared with substitution-continuation lexing.
lexer.skip_expect(1);
lex_template_string_continue(lexer)
}
/// Lexes and returns the next token, skipping whitespace and comments.
///
/// Returns an `EOF` token at end of input. Errors propagate from the
/// per-kind lexers, or from the matcher when no pattern matches.
pub fn lex_next<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
loop {
// Skip any leading whitespace.
let ws = lexer.while_chars(&WHITESPACE);
lexer.consume(ws);
if lexer.at_end() {
return Ok(Token::new(lexer.eof_range(), TokenType::EOF));
};
// Anchored leftmost-longest match over all token patterns; `id` indexes
// into PATTERNS to recover the token type.
let AhoCorasickMatch { id, mat } = lexer.aho_corasick(&MATCHER)?;
match PATTERNS[id].0 {
// Comments produce no token: skip them and loop to lex what follows.
TokenType::CommentMultiple => lex_multiple_comment(lexer)?,
TokenType::CommentSingle => lex_single_comment(lexer)?,
pat => {
return match pat {
// These pattern matches only identify the token's start; the
// dedicated lexers consume the full token.
TokenType::Identifier => lex_identifier(lexer),
TokenType::_LiteralNumberDec => lex_number_dec(lexer),
TokenType::_LiteralNumberBin => lex_number_bin(lexer),
TokenType::_LiteralNumberHex => lex_number_hex(lexer),
TokenType::_LiteralNumberOct => lex_number_oct(lexer),
TokenType::LiteralTemplatePartString => lex_template(lexer),
typ => {
// A keyword immediately followed by an identifier-continue byte
// is really a longer identifier (e.g. `iffy`), so re-lex it as one.
if KEYWORDS_MAPPING.contains_key(&typ)
&& lexer
.peek_or_eof(mat.len())
.filter(|c| ID_CONTINUE.has(*c))
.is_some()
{
return lex_identifier(lexer);
};
// Fixed-text token (operator or keyword): the match is the token.
let loc = lexer.range(mat);
lexer.consume(mat);
Ok(Token::new(loc, typ))
}
};
}
};
}
}