use crate::ast::{VarRef, VarType};
use crate::reader::CharReader;
use std::io;
use std::iter::Peekable;
/// Tokens that the lexer in this file can produce.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Token {
/// End of input: the character source has nothing left to yield.
Eof,
/// End of line: a `\n` character, or the newline that terminates a remark.
Eol,
/// A malformed token; the payload is a human-readable description of the problem.
Bad(String),
/// The `TRUE` or `FALSE` literal (matched case-insensitively).
Boolean(bool),
/// A decimal integer literal that fits in an `i32`.
Integer(i32),
/// The contents of a double-quoted string literal, with backslash escapes resolved.
Text(String),
/// An identifier that is not a reserved word, along with its type annotation.
Symbol(VarRef),
/// The `,` separator.
Comma,
/// The `;` separator.
Semicolon,
/// The `(` delimiter.
LeftParen,
/// The `)` delimiter.
RightParen,
/// The `+` operator.
Plus,
/// The `-` operator.
Minus,
/// The `*` operator.
Multiply,
/// The `/` operator.
Divide,
/// The `=` operator.
Equal,
/// The `<>` operator.
NotEqual,
/// The `<` operator.
Less,
/// The `<=` operator.
LessEqual,
/// The `>` operator.
Greater,
/// The `>=` operator.
GreaterEqual,
/// The `AND` reserved word.
And,
/// The `NOT` reserved word.
Not,
/// The `OR` reserved word.
Or,
/// The `XOR` reserved word.
Xor,
/// The `IF` reserved word.
If,
/// The `ELSE` reserved word.
Else,
/// The `ELSEIF` reserved word.
Elseif,
/// The `THEN` reserved word.
Then,
/// The `END` reserved word.
End,
/// The `WHILE` reserved word.
While,
}
/// Character classification helpers used by the lexer.
trait CharOps {
    /// Returns true if the character ends the token that precedes it.
    fn is_separator(&self) -> bool;

    /// Returns true if the character is horizontal whitespace that can be skipped.
    fn is_space(&self) -> bool;
}

impl CharOps for char {
    /// A separator is a newline, one of the punctuation or operator characters below, or any
    /// character accepted by `is_space`.
    fn is_separator(&self) -> bool {
        const PUNCTUATION: &str = "\n()'=<>;,+-*/";
        PUNCTUATION.contains(*self) || self.is_space()
    }

    /// Only blanks, tabs and carriage returns count as space; `\n` is deliberately excluded
    /// because it is a token of its own.
    fn is_space(&self) -> bool {
        let ch = *self;
        ch == ' ' || ch == '\t' || ch == '\r'
    }
}
/// Tokenizer that turns a stream of characters into `Token`s, one `read` call at a time.
pub struct Lexer<'a> {
/// Character source wrapped in `Peekable` so the lexer can look one character ahead.
input: Peekable<CharReader<'a>>,
}
impl<'a> Lexer<'a> {
/// Creates a lexer that pulls its characters from `input` through a `CharReader`.
pub fn from(input: &'a mut dyn io::Read) -> Self {
    let input = CharReader::from(input).peekable();
    Self { input }
}
/// Recovers from a malformed token whose offending character has already been consumed.
///
/// Discards characters up to, but not including, the next separator (or the end of input) so
/// that lexing can resume on a clean boundary, and then returns `Token::Bad` carrying `msg`.
/// An I/O error found while skipping is returned as-is.
fn handle_bad_read<S: Into<String>>(&mut self, msg: S) -> io::Result<Token> {
    loop {
        let at_boundary = match self.input.peek() {
            None => true,
            Some(Ok(ch)) => ch.is_separator(),
            // The error is only borrowed by `peek`; pull it out of the stream to own it.
            Some(Err(_)) => return Err(self.input.next().unwrap().unwrap_err()),
        };
        if at_boundary {
            return Ok(Token::Bad(msg.into()));
        }
        self.input.next().unwrap()?;
    }
}
/// Recovers from a malformed token whose offending character has only been peeked.
///
/// Drops that pending character first and then defers to `handle_bad_read` for the rest of
/// the recovery.
fn handle_bad_peek<S: Into<String>>(&mut self, msg: S) -> io::Result<Token> {
    // The value is intentionally ignored: the caller has already inspected it through `peek`.
    let _ = self.input.next();
    self.handle_bad_read(msg)
}
/// Lexes the remainder of an integer literal whose first digit, `first`, was already read.
///
/// Digits are accumulated until a separator or the end of input.  Any other character turns
/// the whole literal into a `Token::Bad`, as does a digit sequence that does not parse as an
/// `i32`.  I/O errors are propagated.
fn consume_integer(&mut self, first: char) -> io::Result<Token> {
    let mut digits = first.to_string();
    loop {
        let ch = match self.input.peek() {
            None => break,
            Some(Err(_)) => return Err(self.input.next().unwrap().unwrap_err()),
            Some(Ok(ch)) => *ch,
        };
        if ch.is_digit(10) {
            self.input.next().unwrap()?;
            digits.push(ch);
        } else if ch.is_separator() {
            // Leave the separator in the stream; it starts the next token.
            break;
        } else {
            return self.handle_bad_peek(format!("Unexpected character in integer: {}", ch));
        }
    }

    match digits.parse::<i32>() {
        Ok(value) => Ok(Token::Integer(value)),
        Err(e) => {
            let msg = format!("Bad integer {}: {}", digits, e);
            self.handle_bad_read(msg)
        }
    }
}
/// Lexes a relational operator that starts with `first`, which must be `<` or `>`.
///
/// Looks one character ahead to tell `<>`, `<=` and `>=` apart from plain `<` and `>`; the
/// second character is consumed only when it is part of the operator.  An I/O error on the
/// lookahead is returned instead of a token.
///
/// Panics if `first` is neither `<` nor `>` (and the lookahead did not fail).
fn consume_operator(&mut self, first: char) -> io::Result<Token> {
    let second = match self.input.peek() {
        Some(Err(_)) => return Err(self.input.next().unwrap().unwrap_err()),
        Some(Ok(ch)) => Some(*ch),
        None => None,
    };

    let two_char_token = match (first, second) {
        ('<', Some('>')) => Token::NotEqual,
        ('<', Some('=')) => Token::LessEqual,
        ('>', Some('=')) => Token::GreaterEqual,
        // Single-character operators leave the lookahead untouched.
        ('<', _) => return Ok(Token::Less),
        ('>', _) => return Ok(Token::Greater),
        _ => panic!("Should not have been called"),
    };
    self.input.next().unwrap()?;
    Ok(two_char_token)
}
/// Lexes an identifier or reserved word whose first character, `first`, was already read.
///
/// Alphanumeric characters extend the name.  A separator or the end of input finishes it, and
/// so does one of the type annotations `?` (boolean), `%` (integer) or `$` (text), which is
/// consumed as part of the symbol.  Any other character yields a `Token::Bad`.
///
/// The collected name is compared case-insensitively against the reserved words.  `REM`
/// discards the rest of the line and returns whatever `consume_rest_of_line` returns.
/// Everything else becomes a `Token::Symbol`.  The type annotation is used only for that
/// symbol case, so an annotation attached to a reserved word is silently dropped.
fn consume_symbol(&mut self, first: char) -> io::Result<Token> {
    let mut name = first.to_string();
    let mut vtype = VarType::Auto;
    loop {
        let ch = match self.input.peek() {
            None => break,
            Some(Err(_)) => return Err(self.input.next().unwrap().unwrap_err()),
            Some(Ok(ch)) => *ch,
        };

        if ch.is_alphanumeric() {
            self.input.next().unwrap()?;
            name.push(ch);
            continue;
        }
        if ch.is_separator() {
            break;
        }

        let annotation = match ch {
            '?' => VarType::Boolean,
            '%' => VarType::Integer,
            '$' => VarType::Text,
            _ => {
                let msg = format!("Unexpected character in symbol: {}", ch);
                return self.handle_bad_peek(msg);
            }
        };
        vtype = annotation;
        self.input.next().unwrap()?;
        break;
    }

    let token = match name.to_uppercase().as_str() {
        "AND" => Token::And,
        "ELSE" => Token::Else,
        "ELSEIF" => Token::Elseif,
        "END" => Token::End,
        "FALSE" => Token::Boolean(false),
        "IF" => Token::If,
        "NOT" => Token::Not,
        "OR" => Token::Or,
        "REM" => return self.consume_rest_of_line(),
        "THEN" => Token::Then,
        "TRUE" => Token::Boolean(true),
        "WHILE" => Token::While,
        "XOR" => Token::Xor,
        _ => Token::Symbol(VarRef::new(name, vtype)),
    };
    Ok(token)
}
/// Lexes a string literal whose opening `delim` was already read.
///
/// Characters are collected until an unescaped `delim`, which is consumed but not stored.  A
/// backslash makes the following character literal (so `\"` yields `"`, `\\` yields `\` and
/// `\a` yields `a`); no other escape sequences are interpreted.
///
/// Returns `Token::Text` with the collected contents, or `Token::Bad` if the input ends
/// before the closing delimiter.  I/O errors are propagated.
fn consume_text(&mut self, delim: char) -> io::Result<Token> {
    let mut s = String::new();
    let mut escaping = false;
    loop {
        match self.input.peek() {
            Some(Ok(_)) => {
                let ch = self.input.next().unwrap()?;
                if escaping {
                    s.push(ch);
                    escaping = false;
                } else if ch == '\\' {
                    escaping = true;
                } else if ch == delim {
                    break;
                } else {
                    s.push(ch);
                }
            }
            Some(Err(_)) => return Err(self.input.next().unwrap().unwrap_err()),
            None => {
                // There is no pending character to drop here.  `peek` has cached the
                // end-of-input marker, so going through `handle_bad_read` (rather than
                // `handle_bad_peek`) keeps that marker in place and avoids polling the
                // exhausted reader a second time.
                return self.handle_bad_read(format!("Incomplete string due to EOF: {}", s));
            }
        }
    }
    Ok(Token::Text(s))
}
/// Discards everything up to and including the next `\n`.
///
/// Returns `Token::Eol` if a newline was found and `Token::Eof` if the input ran out first.
/// I/O errors are propagated.
fn consume_rest_of_line(&mut self) -> io::Result<Token> {
    for result in self.input.by_ref() {
        if result? == '\n' {
            return Ok(Token::Eol);
        }
    }
    Ok(Token::Eof)
}
/// Skips horizontal whitespace and returns the first character that follows it.
///
/// Returns `Ok(None)` when the input is exhausted.  Newlines are not whitespace for this
/// purpose, so they are returned to the caller.  I/O errors are propagated.
fn advance_and_read_next(&mut self) -> io::Result<Option<char>> {
    for result in self.input.by_ref() {
        let ch = result?;
        if !ch.is_space() {
            return Ok(Some(ch));
        }
    }
    Ok(None)
}
/// Reads the next token from the input.
///
/// Leading blanks, tabs and carriage returns are skipped.  Returns `Token::Eof` once the input
/// is exhausted.  Malformed input is reported as `Token::Bad` so that the caller can keep
/// reading; only I/O failures are returned as errors.
pub fn read(&mut self) -> io::Result<Token> {
    let ch = match self.advance_and_read_next()? {
        Some(ch) => ch,
        None => return Ok(Token::Eof),
    };

    let token = match ch {
        '\n' => Token::Eol,
        // A single quote starts a remark that runs to the end of the line.
        '\'' => return self.consume_rest_of_line(),
        '"' => return self.consume_text('"'),

        ';' => Token::Semicolon,
        ',' => Token::Comma,

        '(' => Token::LeftParen,
        ')' => Token::RightParen,

        '+' => Token::Plus,
        '-' => Token::Minus,
        '*' => Token::Multiply,
        '/' => Token::Divide,

        '=' => Token::Equal,
        '<' | '>' => return self.consume_operator(ch),

        ch if ch.is_digit(10) => return self.consume_integer(ch),
        ch if ch.is_alphabetic() => return self.consume_symbol(ch),
        ch => return self.handle_bad_read(format!("Unknown character: {}", ch)),
    };
    Ok(token)
}
/// Wraps this lexer in a `PeekableLexer`, which starts out with no token buffered.
pub fn peekable(self) -> PeekableLexer<'a> {
    let peeked = None;
    PeekableLexer { lexer: self, peeked }
}
}
/// Wrapper over a `Lexer` that adds one token of lookahead.
pub struct PeekableLexer<'a> {
/// The wrapped lexer that tokens are pulled from.
lexer: Lexer<'a>,
/// Token fetched by `peek` and not yet consumed, if any.
peeked: Option<Token>,
}
impl<'a> PeekableLexer<'a> {
    /// Takes the token buffered by a previous `peek`, leaving the buffer empty.
    ///
    /// Unlike `read`, this cannot fail with an I/O error because no input is touched.
    ///
    /// # Panics
    ///
    /// Panics if no token is buffered, i.e. if `peek` has not succeeded since the last
    /// `read` or `consume_peeked`.
    pub fn consume_peeked(&mut self) -> Token {
        self.peeked
            .take()
            .expect("consume_peeked requires a token buffered by a prior successful peek")
    }

    /// Returns a reference to the next token without consuming it.
    ///
    /// The token is read from the wrapped lexer on the first call and buffered, so repeated
    /// calls return the same token until it is consumed.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error from the wrapped lexer; nothing is buffered in that case.
    pub fn peek(&mut self) -> io::Result<&Token> {
        if self.peeked.is_none() {
            self.peeked = Some(self.lexer.read()?);
        }
        Ok(self.peeked.as_ref().expect("the lookahead buffer was filled above"))
    }

    /// Returns the next token, preferring the buffered one if `peek` fetched it already.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error from the wrapped lexer.
    pub fn read(&mut self) -> io::Result<Token> {
        match self.peeked.take() {
            Some(token) => Ok(token),
            None => self.lexer.read(),
        }
    }
}
// Unit tests for the lexer and its peekable wrapper.
#[cfg(test)]
mod tests {
use super::*;
/// Lexes `input` until the first `Token::Eof` and checks that the tokens read before it match
/// `exp_tokens`.  Panics if the lexer reports an I/O error.
fn do_ok_test(input: &str, exp_tokens: &[Token]) {
let mut cursor = io::Cursor::new(input.as_bytes());
let mut lexer = Lexer::from(&mut cursor);
let mut tokens = vec![];
loop {
let token = lexer.read().expect("Lexing failed");
if token == Token::Eof {
break;
}
tokens.push(token);
}
assert_eq!(exp_tokens, tokens.as_slice());
}
// Reading from empty input keeps returning `Eof` on every call.
// NOTE(review): this body reads past EOF while `test_read_past_eof` below checks empty input;
// the two test names look swapped -- confirm before renaming.
#[test]
fn test_empty() {
let mut cursor = io::Cursor::new(b"");
let mut lexer = Lexer::from(&mut cursor);
assert_eq!(Token::Eof, lexer.read().unwrap());
assert_eq!(Token::Eof, lexer.read().unwrap());
}
// Empty input produces no tokens before `Eof`.
#[test]
fn test_read_past_eof() {
do_ok_test("", &[]);
}
// Blanks and tabs alone produce no tokens.
#[test]
fn test_whitespace_only() {
do_ok_test(" \t ", &[]);
}
// Each newline is reported as its own `Eol`, regardless of surrounding whitespace.
#[test]
fn test_multiple_lines() {
do_ok_test(" \n \t \n ", &[Token::Eol, Token::Eol]);
}
/// Builds the `Token::Symbol` expected for an identifier without a type annotation.
fn new_auto_symbol(name: &str) -> Token {
Token::Symbol(VarRef::new(name, VarType::Auto))
}
// Integers, symbols and an operator, separated by whitespace or by nothing at all.
#[test]
fn test_some_tokens() {
do_ok_test(
"123 45 \n 6 abc a38z\na=3",
&[
Token::Integer(123),
Token::Integer(45),
Token::Eol,
Token::Integer(6),
new_auto_symbol("abc"),
new_auto_symbol("a38z"),
Token::Eol,
new_auto_symbol("a"),
Token::Equal,
Token::Integer(3),
],
);
}
// Non-ASCII letters are accepted in symbols and in string literals.
#[test]
fn test_utf8() {
do_ok_test(
"가 나=7 a다b \"라 마\"",
&[
new_auto_symbol("가"),
new_auto_symbol("나"),
Token::Equal,
Token::Integer(7),
new_auto_symbol("a다b"),
Token::Text("라 마".to_owned()),
],
);
}
// Both remark forms (`REM` and `'`) swallow the rest of the line and yield only `Eol`.
#[test]
fn test_remarks() {
do_ok_test(
"REM This is a comment\nNOT 'This is another comment\n",
&[Token::Eol, Token::Not, Token::Eol],
);
}
// The `?`, `%` and `$` suffixes select the symbol's type annotation.
#[test]
fn test_var_types() {
do_ok_test(
"a b? i% s$",
&[
new_auto_symbol("a"),
Token::Symbol(VarRef::new("b", VarType::Boolean)),
Token::Symbol(VarRef::new("i", VarType::Integer)),
Token::Symbol(VarRef::new("s", VarType::Text)),
],
);
}
// String literals, including backslash escapes: an escaped character is taken literally.
#[test]
fn test_strings() {
do_ok_test(
" \"this is a string\" 3",
&[
Token::Text("this is a string".to_owned()),
Token::Integer(3),
],
);
do_ok_test(
"\"this \\\"is escaped\\\" \\\\ \\a\" 1",
&[
Token::Text("this \"is escaped\" \\ a".to_owned()),
Token::Integer(1),
],
);
}
// The conditional reserved words are recognized in both upper and lower case.
#[test]
fn test_if() {
do_ok_test(
"IF THEN ELSEIF ELSE END IF",
&[
Token::If,
Token::Then,
Token::Elseif,
Token::Else,
Token::End,
Token::If,
],
);
do_ok_test(
"if then elseif else end if",
&[
Token::If,
Token::Then,
Token::Elseif,
Token::Else,
Token::End,
Token::If,
],
);
}
// The loop reserved words are recognized in both upper and lower case.
#[test]
fn test_while() {
do_ok_test("WHILE END WHILE", &[Token::While, Token::End, Token::While]);
do_ok_test("while end while", &[Token::While, Token::End, Token::While]);
}
/// Checks that `a <op> 2` lexes as the symbol `a`, the token `t` and the integer 2.
fn do_operator_test(op: &str, t: Token) {
do_ok_test(
format!("a {} 2", op).as_ref(),
&[new_auto_symbol("a"), t, Token::Integer(2)],
);
}
// Every relational operator, including the two-character ones.
#[test]
fn test_operator_relational_ops() {
do_operator_test("=", Token::Equal);
do_operator_test("<>", Token::NotEqual);
do_operator_test("<", Token::Less);
do_operator_test("<=", Token::LessEqual);
do_operator_test(">", Token::Greater);
do_operator_test(">=", Token::GreaterEqual);
}
// Every arithmetic operator.
#[test]
fn test_operator_arithmetic_ops() {
do_operator_test("+", Token::Plus);
do_operator_test("-", Token::Minus);
do_operator_test("*", Token::Multiply);
do_operator_test("/", Token::Divide);
}
// Operators act as separators, so no whitespace is needed around them.
#[test]
fn test_operator_no_spaces() {
do_ok_test(
"z=2 654<>a32",
&[
new_auto_symbol("z"),
Token::Equal,
Token::Integer(2),
Token::Integer(654),
Token::NotEqual,
new_auto_symbol("a32"),
],
);
}
// Parentheses are separators too and can hug symbols, strings and integers.
#[test]
fn test_parenthesis() {
do_ok_test(
"(a) (\"foo\") (3)",
&[
Token::LeftParen,
new_auto_symbol("a"),
Token::RightParen,
Token::LeftParen,
Token::Text("foo".to_owned()),
Token::RightParen,
Token::LeftParen,
Token::Integer(3),
Token::RightParen,
],
);
}
// `peek` is idempotent and `read` returns the peeked token before pulling new ones.
#[test]
fn test_peekable_lexer() {
let mut cursor = io::Cursor::new(b"a b 123");
let mut lexer = Lexer::from(&mut cursor).peekable();
assert_eq!(&new_auto_symbol("a"), lexer.peek().unwrap());
assert_eq!(&new_auto_symbol("a"), lexer.peek().unwrap());
assert_eq!(new_auto_symbol("a"), lexer.read().unwrap());
assert_eq!(new_auto_symbol("b"), lexer.read().unwrap());
assert_eq!(&Token::Integer(123), lexer.peek().unwrap());
assert_eq!(Token::Integer(123), lexer.read().unwrap());
assert_eq!(&Token::Eof, lexer.peek().unwrap());
assert_eq!(Token::Eof, lexer.read().unwrap());
}
// Malformed input yields `Token::Bad` and lexing resumes at the next separator.
#[test]
fn test_recoverable_errors() {
do_ok_test(
"9999999999+5",
&[
Token::Bad(
"Bad integer 9999999999: number too large to fit in target type".to_owned(),
),
Token::Plus,
Token::Integer(5),
],
);
do_ok_test(
"\n3!2 1",
&[
Token::Eol,
Token::Bad("Unexpected character in integer: !".to_owned()),
Token::Integer(1),
],
);
do_ok_test(
"a b_d 5",
&[
new_auto_symbol("a"),
Token::Bad("Unexpected character in symbol: _".to_owned()),
Token::Integer(5),
],
);
do_ok_test(
"( \"this is incomplete",
&[
Token::LeftParen,
Token::Bad("Incomplete string due to EOF: this is incomplete".to_owned()),
],
);
do_ok_test(
"+ - ! * /",
&[
Token::Plus,
Token::Minus,
Token::Bad("Unknown character: !".to_owned()),
Token::Multiply,
Token::Divide,
],
);
}
/// Reader that hands out its `good` bytes on the first `read` call and fails with
/// `InvalidData` on every call after that.
struct FaultyReader {
// `Some` until the first read takes the bytes; `None` afterwards.
good: Option<Vec<u8>>,
}
impl FaultyReader {
/// Creates a reader that serves `good`, which must end in a newline, before failing.
fn new(good: &str) -> Self {
assert!(good.ends_with('\n'));
Self {
good: Some(good.as_bytes().to_owned()),
}
}
}
impl io::Read for FaultyReader {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
if let Some(good) = self.good.take() {
// The whole payload must fit in a single read for this fake to work.
assert!(buf.len() > good.len());
buf[0..good.len()].clone_from_slice(&good[..]);
Ok(good.len())
} else {
Err(io::Error::from(io::ErrorKind::InvalidData))
}
}
}
// I/O errors surface as `Err` from `read` instead of being turned into `Token::Bad`.
#[test]
fn test_unrecoverable_io_error() {
let mut reader = FaultyReader::new("3 + 5\n");
let mut lexer = Lexer::from(&mut reader);
assert_eq!(Token::Integer(3), lexer.read().unwrap());
assert_eq!(Token::Plus, lexer.read().unwrap());
assert_eq!(Token::Integer(5), lexer.read().unwrap());
assert_eq!(Token::Eol, lexer.read().unwrap());
let e = lexer.read().unwrap_err();
assert_eq!(io::ErrorKind::InvalidData, e.kind());
// NOTE(review): the `Other` kind on the retry presumably comes from `CharReader` refusing
// to continue after a failure -- confirm against the reader module.
let e = lexer.read().unwrap_err();
assert_eq!(io::ErrorKind::Other, e.kind());
}
}