use crate::lexer::Lexer;
use crate::lexer::LexerError;
use crate::lexer::Loc;
use crate::lexer::ParserLanguage;
use crate::lexer::StrLit;
use crate::lexer::StrLitDecodeError;
use crate::lexer::Token;
use crate::lexer::TokenWithLocation;
use std::fmt;
/// Errors reported by [`Tokenizer`] and its helper methods.
#[derive(Debug)]
pub enum TokenizerError {
    /// Wraps an error coming from the lexer; displayed using the lexer error's own message.
    LexerError(LexerError),
    /// Wraps a [`StrLitDecodeError`]; displayed using that error's own message.
    StrLitDecodeError(StrLitDecodeError),
    /// Internal inconsistency, e.g. `Tokenizer::advance` called with no buffered token.
    InternalError,
    /// The input is not acceptable at this point (see `Tokenizer::next_ident_if_eq_error`).
    IncorrectInput,
    /// A token was required but the input had no more tokens.
    UnexpectedEof,
    /// A string literal was expected.
    ExpectStrLit,
    /// An int literal was expected.
    ExpectIntLit,
    /// A float literal was expected.
    ExpectFloatLit,
    /// An identifier was expected.
    ExpectIdent,
    /// The identifier with the given name was expected.
    ExpectNamedIdent(String),
    /// The given character was expected.
    ExpectChar(char),
    /// One of the given characters was expected.
    ExpectAnyChar(Vec<char>),
}
impl fmt::Display for TokenizerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TokenizerError::LexerError(e) => write!(f, "{}", e),
TokenizerError::StrLitDecodeError(e) => write!(f, "{}", e),
TokenizerError::InternalError => write!(f, "Internal tokenizer error"),
TokenizerError::IncorrectInput => write!(f, "Incorrect input"),
TokenizerError::UnexpectedEof => write!(f, "Unexpected EOF"),
TokenizerError::ExpectStrLit => write!(f, "Expecting string literal"),
TokenizerError::ExpectIntLit => write!(f, "Expecting int literal"),
TokenizerError::ExpectFloatLit => write!(f, "Expecting float literal"),
TokenizerError::ExpectIdent => write!(f, "Expecting identifier"),
TokenizerError::ExpectNamedIdent(n) => write!(f, "Expecting identifier {}", n),
TokenizerError::ExpectChar(c) => write!(f, "Expecting char {}", c),
TokenizerError::ExpectAnyChar(c) => write!(
f,
"Expecting one of: {}",
c.iter()
.map(|c| format!("{}", c))
.collect::<Vec<String>>()
.join(", ")
),
}
}
}
// Empty impl: every `std::error::Error` method keeps its default
// implementation; the message shown to users comes from the `Display` impl.
impl std::error::Error for TokenizerError {}
/// Shorthand for a `Result` whose error type is [`TokenizerError`].
pub type TokenizerResult<R> = Result<R, TokenizerError>;
impl From<LexerError> for TokenizerError {
fn from(e: LexerError) -> Self {
TokenizerError::LexerError(e)
}
}
impl From<StrLitDecodeError> for TokenizerError {
fn from(e: StrLitDecodeError) -> Self {
TokenizerError::StrLitDecodeError(e)
}
}
/// Token reader over a [`Lexer`] that buffers at most one lookahead token.
///
/// Cloning copies the lexer and the buffered token, so a clone can be
/// advanced without affecting the original.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// Source of tokens.
    lexer: Lexer<'a>,
    /// Token already read from the lexer but not yet consumed, if any.
    next_token: Option<TokenWithLocation>,
    /// Location of the token most recently read from the lexer; set back to
    /// `None` when a read yields no token.
    last_token_loc: Option<Loc>,
}
#[allow(dead_code)]
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `input`; `comment_style` is handed to the lexer.
    ///
    /// Nothing is read from the lexer until the first lookahead.
    pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
        let lexer = Lexer::new(input, comment_style);
        Tokenizer {
            lexer,
            next_token: None,
            last_token_loc: None,
        }
    }

    /// Returns the current location.
    ///
    /// This is the location of the buffered lookahead token when there is
    /// one, otherwise the location recorded by the most recent lexer read,
    /// otherwise the lexer's own location.
    pub fn loc(&self) -> Loc {
        if let Some(buffered) = &self.next_token {
            return buffered.loc;
        }
        match self.last_token_loc {
            Some(recorded) => recorded,
            None => self.lexer.loc,
        }
    }

    /// Tries to buffer the next token and then returns [`Tokenizer::loc`].
    ///
    /// A lookahead failure is deliberately ignored here; the location is
    /// reported either way.
    pub fn lookahead_loc(&mut self) -> Loc {
        let _ = self.lookahead();
        self.loc()
    }

    /// Peeks at the next token without consuming it.
    ///
    /// Reads one token from the lexer when nothing is buffered and records
    /// its location. `Ok(None)` means the lexer produced no token.
    fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
        if self.next_token.is_none() {
            self.next_token = self.lexer.next_token()?;
            self.last_token_loc = self.next_token.as_ref().map(|read| read.loc);
        }
        Ok(self.next_token.as_ref().map(|buffered| &buffered.token))
    }

    /// Peeks at the next token, failing with
    /// [`TokenizerError::UnexpectedEof`] when there is none.
    pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
        self.lookahead()?.ok_or(TokenizerError::UnexpectedEof)
    }

    /// Consumes and returns the buffered lookahead token.
    ///
    /// # Errors
    ///
    /// [`TokenizerError::InternalError`] if no token is buffered, i.e. no
    /// successful lookahead preceded this call.
    pub fn advance(&mut self) -> TokenizerResult<Token> {
        match self.next_token.take() {
            Some(TokenWithLocation { token, .. }) => Ok(token),
            None => Err(TokenizerError::InternalError),
        }
    }

    /// Returns `true` when there are no more tokens.
    pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
        self.lookahead().map(|next| next.is_none())
    }

    /// Applies `p` to the next token and consumes that token only when `p`
    /// returns `Some`.
    ///
    /// Returns `Ok(None)`, leaving the tokenizer where it was, when there is
    /// no next token or `p` returns `None`.
    pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
    where
        P: FnOnce(&Token) -> Option<R>,
    {
        let mapped = self.lookahead()?.and_then(p);
        if mapped.is_some() {
            // The predicate accepted the token: drop it from the buffer.
            self.next_token = None;
        }
        Ok(mapped)
    }

    /// Applies `p` to the next token and consumes that token when `p`
    /// succeeds.
    ///
    /// # Errors
    ///
    /// * a lookahead failure, converted into `E`;
    /// * [`TokenizerError::UnexpectedEof`] (converted into `E`) when there is
    ///   no next token;
    /// * whatever `p` returns as an error, in which case the token stays
    ///   buffered.
    pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
    where
        P: FnOnce(&Token) -> Result<R, E>,
        E: From<TokenizerError>,
    {
        let token = self.lookahead_some()?;
        let value = p(token)?;
        self.next_token = None;
        Ok(value)
    }

    /// Consumes and returns (a clone of) the next token when `p` accepts it.
    fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
    where
        P: FnOnce(&Token) -> bool,
    {
        self.next_token_if_map(|candidate| match p(candidate) {
            true => Some(candidate.clone()),
            false => None,
        })
    }

    /// Consumes the next token when it is an identifier equal to one of
    /// `idents`, returning that identifier; otherwise returns `Ok(None)` and
    /// consumes nothing.
    pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
        let matched = match self.lookahead()? {
            Some(Token::Ident(name))
                if idents.iter().any(|&wanted| wanted == name.as_str()) =>
            {
                name.clone()
            }
            _ => return Ok(None),
        };
        self.advance()?;
        Ok(Some(matched))
    }

    /// Consumes the next token when it is the identifier `word`; the result
    /// tells whether it was consumed.
    pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
        let consumed = self.next_ident_if_in(&[word])?;
        Ok(consumed.is_some())
    }

    /// Consumes the identifier `word`, or fails with
    /// [`TokenizerError::ExpectNamedIdent`] when the next token is something
    /// else (nothing is consumed in that case).
    pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
        match self.next_ident_if_eq(word)? {
            true => Ok(()),
            false => Err(TokenizerError::ExpectNamedIdent(word.to_owned())),
        }
    }

    /// Fails with [`TokenizerError::IncorrectInput`] when the next token is
    /// the identifier `word`.
    ///
    /// The check runs on a clone of the tokenizer, so `self` is left
    /// completely untouched whatever the outcome.
    pub fn next_ident_if_eq_error(&mut self, word: &str) -> TokenizerResult<()> {
        let mut probe = self.clone();
        if probe.next_ident_if_eq(word)? {
            Err(TokenizerError::IncorrectInput)
        } else {
            Ok(())
        }
    }

    /// Consumes the next token when it is the symbol `symbol`; the result
    /// tells whether it was consumed.
    pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
        let consumed =
            self.next_token_if(|token| matches!(token, Token::Symbol(c) if *c == symbol))?;
        Ok(consumed.is_some())
    }

    /// Consumes the symbol `symbol`, or fails with
    /// [`TokenizerError::ExpectChar`] when the next token is something else.
    pub fn next_symbol_expect_eq(&mut self, symbol: char) -> TokenizerResult<()> {
        if !self.lookahead_is_symbol(symbol)? {
            return Err(TokenizerError::ExpectChar(symbol));
        }
        self.advance().map(|_| ())
    }

    /// Returns the character of the next token when that token is a symbol,
    /// without consuming it; `Ok(None)` otherwise.
    pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
        match self.lookahead()? {
            Some(Token::Symbol(c)) => Ok(Some(*c)),
            _ => Ok(None),
        }
    }

    /// Tells whether the next token is the symbol `symbol`, without
    /// consuming it.
    pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
        Ok(matches!(self.lookahead_if_symbol()?, Some(c) if c == symbol))
    }

    /// Tells whether the next token is the identifier `ident`, without
    /// consuming it.
    pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
        let next = self.lookahead()?;
        Ok(matches!(next, Some(Token::Ident(name)) if name == ident))
    }

    /// Consumes the next token and returns its name when it is an
    /// identifier.
    ///
    /// # Errors
    ///
    /// [`TokenizerError::ExpectIdent`] for any other token (which stays
    /// buffered), [`TokenizerError::UnexpectedEof`] when there is no token.
    pub fn next_ident(&mut self) -> TokenizerResult<String> {
        self.next_token_check_map(|token| {
            if let Token::Ident(name) = token {
                Ok(name.clone())
            } else {
                Err(TokenizerError::ExpectIdent)
            }
        })
    }

    /// Consumes the next token and returns it when it is a string literal.
    ///
    /// # Errors
    ///
    /// [`TokenizerError::ExpectStrLit`] for any other token (which stays
    /// buffered), [`TokenizerError::UnexpectedEof`] when there is no token.
    pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
        self.next_token_check_map(|token| {
            if let Token::StrLit(lit) = token {
                Ok(lit.clone())
            } else {
                Err(TokenizerError::ExpectStrLit)
            }
        })
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `what` on a fresh tokenizer over `input` (using
    /// `ParserLanguage::Proto`) and returns its result.
    ///
    /// Panics if `what` fails, if the end-of-input check fails, or if `what`
    /// left tokens unconsumed. Failure messages are built lazily, only on
    /// the failing path.
    fn tokenize<P, R>(input: &str, what: P) -> R
    where
        P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
    {
        let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
        let r = what(&mut tokenizer)
            .unwrap_or_else(|e| panic!("parse failed at {}: {:?}", tokenizer.loc(), e));
        let eof = tokenizer
            .syntax_eof()
            .unwrap_or_else(|e| panic!("check eof failed at {}: {:?}", tokenizer.loc(), e));
        assert!(eof, "{}", tokenizer.loc());
        r
    }

    /// An identifier surrounded by whitespace is returned verbatim.
    #[test]
    fn test_ident() {
        let msg = r#" aabb_c "#;
        // `next_ident` already returns an owned `String`; no extra copy needed.
        let mess = tokenize(msg, |p| p.next_ident());
        assert_eq!("aabb_c", mess);
    }

    /// A string literal keeps its escape sequences in `escaped`.
    #[test]
    fn test_str_lit() {
        let msg = r#" "a\nb" "#;
        let mess = tokenize(msg, |p| p.next_str_lit());
        assert_eq!(
            StrLit {
                escaped: r#"a\nb"#.to_owned()
            },
            mess
        );
    }
}