//! A rule-driven lexer: runs a prioritized set of [`LexerRule`]s in parallel
//! over a character stream and yields [`TokenMeta`] values.

use std::io;
use core::iter::Peekable;

use crate::language;
use crate::debug::{DebugSymbol, TokenIndex, TokenLength};

mod token;
mod errors;
mod tests;

pub mod rules;

pub use rules::MatchResult;
use rules::LexerRule;
use rules::comments::{LineCommentRule, BlockCommentRule};

pub use token::*;
pub use errors::*;
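/// Options controlling lexer behavior, set through [`LexerBuilder`].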
#[derive(Clone)]
pub struct LexerOptions {
    skip_comments: bool,
}
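/// Builder for [`Lexer`] instances. Rule order doubles as priority: when two
/// rules produce complete matches of the same length, the one added first
/// wins. A minimal usage sketch (the rule types named here are illustrative,
/// not part of this module):
///
/// ```ignore
/// let builder = LexerBuilder::new()
///     .set_skip_comments(true)
///     .add_rule(KeywordRule::new())     // hypothetical rule
///     .add_rule(IdentifierRule::new()); // hypothetical rule
/// let mut lexer = builder.build(source_chars);
/// let first = lexer.next_token();
/// ```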
pub struct LexerBuilder {
    rules: Vec<Box<dyn LexerRule>>,
    options: LexerOptions,
}

impl Default for LexerBuilder {
    fn default() -> Self { Self::new() }
}
impl LexerBuilder {
    /// Creates a builder with default options (comments are skipped).
    pub fn new() -> Self {
        LexerBuilder {
            rules: Vec::new(),
            options: LexerOptions {
                skip_comments: true,
            },
        }
    }

    fn set_options(mut self, options: LexerOptions) -> Self {
        self.options = options;
        self
    }

    pub fn set_skip_comments(mut self, skip_comments: bool) -> Self {
        self.options.skip_comments = skip_comments;
        self
    }

    /// Appends a rule after all existing rules (lowest priority).
    pub fn add_rule<R>(mut self, rule: R) -> Self
    where R: LexerRule + 'static {
        self.rules.push(Box::new(rule));
        self
    }

    /// Inserts a rule at `index`, giving it priority over the rules after it.
    pub fn insert_rule(mut self, index: usize, rule: impl LexerRule + 'static) -> Self {
        self.rules.insert(index, Box::new(rule));
        self
    }

    /// Appends every rule yielded by `rules`, in order.
    pub fn extend_rules(mut self, rules: impl Iterator<Item=impl LexerRule + 'static>) -> Self {
        for rule in rules {
            self.rules.push(Box::new(rule));
        }
        self
    }

    /// Builds a lexer, consuming the builder; does not require the rules to
    /// be cloneable.
    pub fn build_once<S>(self, source: S) -> Lexer<S> where S: Iterator<Item=io::Result<char>> {
        Lexer::new(source, self.options, self.rules.into_iter())
    }

    /// Builds a lexer while keeping the builder reusable; relies on
    /// `Box<dyn LexerRule>` being cloneable.
    pub fn build<S>(&self, source: S) -> Lexer<S> where S: Iterator<Item=io::Result<char>> {
        Lexer::new(source, self.options.clone(), self.rules.clone().into_iter())
    }
}
/// Splits a mutable two-element array into two independent mutable references.
fn split_array_pair_mut<T>(pair: &mut [T; 2]) -> (&mut T, &mut T) {
    let [first, second] = pair;
    (first, second)
}
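/// Index of a rule within the lexer's rule vector.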
type RuleID = usize;
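/// A lexer that matches all of its rules in parallel against the character
/// stream, emitting the longest match (without backtracking) and breaking
/// ties in favor of the earliest-added rule.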
pub struct Lexer<S> where S: Iterator<Item=io::Result<char>> {
    source: Peekable<S>,
    options: LexerOptions,
    rules: Vec<Box<dyn LexerRule>>,
    current: TokenIndex,  // index of the next character to consume
    last: Option<char>,   // most recently consumed character
    newline: bool,        // whether the current token is the first on its line
    // Double-buffered match state, indexed by THIS_CYCLE/NEXT_CYCLE:
    // `active` holds rules that are still matching, `complete` holds rules
    // whose match so far forms a complete token.
    active: [Vec<RuleID>; 2],
    complete: [Vec<RuleID>; 2],
}
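// Indices into the lexer's double-buffered `active`/`complete` state: the
// cycle currently being matched, and the cycle for the upcoming character.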
const THIS_CYCLE: usize = 0;
const NEXT_CYCLE: usize = 1;
impl<S> Iterator for Lexer<S> where S: Iterator<Item=io::Result<char>> {
    type Item = Result<TokenMeta, LexerError>;

    // Note: never yields `None`; once the source is exhausted, the lexer
    // keeps producing `Token::EOF` items.
    fn next(&mut self) -> Option<Self::Item> { Some(self.next_token()) }
}
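/// The pair (most recently consumed character, upcoming character).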
type PrevNextChars = (Option<char>, Option<char>);
impl<S> Lexer<S> where S: Iterator<Item=io::Result<char>> {
    /// Creates a lexer from a character source, options, and boxed rules.
    pub fn new(source: S, options: LexerOptions, rules: impl Iterator<Item=Box<dyn LexerRule>>) -> Self {
        Lexer {
            options,
            source: source.peekable(),
            rules: rules.collect(),
            current: 0,
            last: None,
            newline: true,
            active: [Vec::new(), Vec::new()],
            complete: [Vec::new(), Vec::new()],
        }
    }
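    /// Consumes and returns the next character, converting IO errors from the
    /// source into [`LexerError`]s.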
    fn get_next(&mut self) -> Result<Option<char>, LexerError> {
        match self.source.next() {
            None => Ok(None),
            Some(Ok(c)) => Ok(Some(c)),
            Some(Err(error)) => Err(self.error(ErrorKind::IOError, self.current).caused_by(Box::new(error))),
        }
    }
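    /// Peeks at the upcoming character without consuming it. An IO error is
    /// taken out of the source so that it is reported exactly once.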
    fn peek_next(&mut self) -> Result<Option<char>, LexerError> {
        match self.source.peek() {
            None => Ok(None),
            Some(Ok(c)) => Ok(Some(*c)),
            Some(Err(..)) => {
                // Consume the error item before wrapping it.
                let ioerror = self.source.next().unwrap().unwrap_err();
                Err(self.error(ErrorKind::IOError, self.current).caused_by(Box::new(ioerror)))
            },
        }
    }
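    /// Consumes one character and returns `(previous, consumed)`, bumping the
    /// position counter and remembering the consumed character.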
    fn advance(&mut self) -> Result<PrevNextChars, LexerError> {
        let prev = self.last;
        let next = self.get_next()?;
        if next.is_some() {
            if self.current == TokenIndex::MAX {
                return Err(self.error(ErrorKind::SourceTooLong, self.current));
            }
            self.current += 1;
        }
        self.last = next;
        Ok((prev, next))
    }
    /// Returns `(previous, upcoming)` without consuming anything.
    fn peek(&mut self) -> Result<PrevNextChars, LexerError> {
        Ok((self.last, self.peek_next()?))
    }

    /// True when the underlying source has no more characters.
    pub fn at_eof(&mut self) -> bool {
        self.source.peek().is_none()
    }
    /// Consumes whitespace, recording whether a newline was crossed.
    fn skip_whitespace(&mut self) -> Result<(), LexerError> {
        while let Some(c) = self.peek_next()? {
            if !c.is_whitespace() {
                break;
            }
            if let (_, Some('\n')) = self.advance()? {
                self.newline = true;
            }
        }
        Ok(())
    }
    /// Attempts to skip one line or block comment at the current position.
    /// Returns true if input was consumed and more remains, in which case the
    /// caller should check for further comments.
    fn skip_comments(&mut self) -> Result<bool, LexerError> {
        let line_rule = LineCommentRule::new(language::COMMENT_CHAR);
        let block_rule = BlockCommentRule::new(language::NESTED_COMMENT_START, language::NESTED_COMMENT_END);

        // Each rule stays `Some` for as long as it is still matching.
        let mut line = Some(line_rule);
        let mut block = Some(block_rule);
        let start_pos = self.current;

        loop {
            let (prev, next) = self.peek()?;
            let next = match next {
                Some(ch) => ch,
                None => break,
            };

            if let Some(rule) = line.as_mut() {
                if !rule.try_match(prev, next).is_match() {
                    line = None;
                }
            }
            if let Some(rule) = block.as_mut() {
                if !rule.try_match(prev, next).is_match() {
                    block = None;
                }
            }

            // Neither rule can consume the next character: the comment
            // (if any) has ended.
            if line.is_none() && block.is_none() {
                break;
            }

            if let (_, Some('\n')) = self.advance()? {
                self.newline = true;
            }
        }

        let made_progress = !self.at_eof() && self.current > start_pos;
        Ok(made_progress)
    }
    /// Resets every rule and clears both match cycles before scanning a token.
    fn reset_rules(&mut self) {
        for rule in self.rules.iter_mut() {
            rule.reset();
        }
        for idx in 0..2 {
            self.active[idx].clear();
            self.complete[idx].clear();
        }
    }
    /// Scans and returns the next token, first skipping whitespace (and
    /// comments, when `skip_comments` is enabled).
    pub fn next_token(&mut self) -> Result<TokenMeta, LexerError> {
        self.skip_whitespace()?;
        if self.options.skip_comments {
            while self.skip_comments()? {
                self.skip_whitespace()?;
            }
        }

        let result = self.scan_token();
        // Prime the newline flag for the next token.
        self.newline = matches!(self.last, Some('\n'));
        result
    }
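    /// Core matching loop. Every rule starts as a candidate; each character
    /// is offered to all still-active rules. When no rule can consume the
    /// next character, the longest match seen so far wins, with ties broken
    /// by the lowest rule index (i.e. insertion order into the builder).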
    fn scan_token(&mut self) -> Result<TokenMeta, LexerError> {
        let token_start = self.current;
        self.reset_rules();

        let (mut prev, next) = self.peek()?;
        let mut next = match next {
            Some(ch) => ch,
            None => {
                return self.token_data(Token::EOF, token_start);
            },
        };

        // Start the first cycle with every rule as a candidate.
        self.active[THIS_CYCLE].extend(0..self.rules.len());

        loop {
            {
                let (active, next_active) = split_array_pair_mut(&mut self.active);
                let (complete, next_complete) = split_array_pair_mut(&mut self.complete);

                next_active.clear();
                next_complete.clear();

                // Offer the next character to every rule still in the running.
                for &rule_id in active.iter() {
                    let rule = &mut self.rules[rule_id];
                    let match_result = rule.try_match(prev, next);
                    if match_result.is_match() {
                        next_active.push(rule_id);
                        if match_result.is_complete_match() {
                            next_complete.push(rule_id);
                        }
                    }
                }

                // No rule can continue: emit the best complete match,
                // preferring the earliest-added rule.
                if next_active.is_empty() && !complete.is_empty() {
                    let rule_id = *complete.iter().min().unwrap();
                    let matching_rule = &mut self.rules[rule_id];
                    let token = matching_rule.get_token()
                        .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;
                    return self.token_data(token, token_start);
                }
            }

            self.advance()?;

            {
                let next_active = &self.active[NEXT_CYCLE];
                if next_active.is_empty() {
                    return Err(self.error(ErrorKind::NoMatchingRule, token_start));
                }

                // Only one candidate left: hand it the rest of the input.
                if next_active.len() == 1 {
                    let rule_id = next_active[0];
                    return self.exhaust_rule(rule_id, token_start);
                }

                prev = Some(next);
                next = match self.peek_next()? {
                    Some(ch) => ch,
                    None => break,
                };

                // Roll the next cycle over into the current one.
                self.active.swap(0, 1);
                self.complete.swap(0, 1);
            }
        }

        // EOF was reached while rules were still active: fall back to the
        // best complete match from the final cycle.
        let next_complete = &self.complete[NEXT_CYCLE];
        if !next_complete.is_empty() {
            let rule_id = *next_complete.iter().min().unwrap();
            let matching_rule = &mut self.rules[rule_id];
            let token = matching_rule.get_token()
                .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;
            return self.token_data(token, token_start);
        }
        Err(self.error(ErrorKind::UnexpectedEOF, token_start))
    }
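    /// Fast path used when only one rule remains: feed it characters until it
    /// stops matching, then emit its token if the match is complete.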
    fn exhaust_rule(&mut self, rule_id: RuleID, token_start: TokenIndex) -> Result<TokenMeta, LexerError> {
        {
            let rule = &mut self.rules[rule_id];
            debug_assert!(!matches!(rule.current_state(), MatchResult::NoMatch));
        }

        loop {
            let (prev, next) = self.peek()?;
            let next = match next {
                Some(ch) => ch,
                None => break,
            };

            let rule = &mut self.rules[rule_id];
            match rule.try_match(prev, next) {
                MatchResult::NoMatch => break,
                _ => { self.advance()?; },
            }
        }

        let rule = &mut self.rules[rule_id];
        if matches!(rule.current_state(), MatchResult::CompleteMatch) {
            let token = rule.get_token()
                .map_err(|err| self.error(ErrorKind::CouldNotReadToken, token_start).caused_by(err))?;
            return self.token_data(token, token_start);
        }

        if self.at_eof() {
            Err(self.error(ErrorKind::UnexpectedEOF, token_start))
        } else {
            Err(self.error(ErrorKind::NoMatchingRule, token_start))
        }
    }
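    /// Builds the debug symbol for the span `[start_idx, end_idx)`, failing
    /// if the span is too long to represent as a `TokenLength`.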
    fn get_symbol(start_idx: TokenIndex, end_idx: TokenIndex) -> Result<DebugSymbol, LexerError> {
        match TokenLength::try_from(end_idx.saturating_sub(start_idx)) {
            Ok(length) => Ok(DebugSymbol::new(start_idx, length)),
            // Report an oversized token with a zero-length symbol at its start.
            Err(..) => Err(LexerError::new(ErrorKind::MaxTokenLengthExceeded, DebugSymbol::new(start_idx, 0))),
        }
    }
    /// Packages a token with its debug symbol and newline flag.
    fn token_data(&self, token: Token, token_start: TokenIndex) -> Result<TokenMeta, LexerError> {
        let symbol = Self::get_symbol(token_start, self.current)?;
        Ok(TokenMeta { token, symbol, newline: self.newline })
    }

    /// Builds a [`LexerError`] covering the span from `token_start` to the
    /// current position (clamped to zero length if it cannot be represented).
    fn error(&self, kind: ErrorKind, token_start: TokenIndex) -> LexerError {
        let length = TokenLength::try_from(self.current.saturating_sub(token_start));
        let symbol = DebugSymbol::new(token_start, length.unwrap_or(0));
        LexerError::new(kind, symbol)
    }
}