#![allow(dead_code)]
use std::iter::Peekable;
/// A single token produced by the lexer.
#[allow(missing_docs)]
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Token {
/// One of the reserved words `token`, `epsilon` or `end`.
Keyword(Keyword),
/// A bare word that is not a keyword, or a single-quoted string. For quoted
/// strings the surrounding quotes are kept in the stored text and backslash
/// escapes are already resolved.
Ident(String),
/// The text found between a pair of backticks. The backticks themselves are
/// not stored and backslash escapes are already resolved.
Code(String),
/// The character `(`.
LParen,
/// The character `)`.
RParen,
/// The character `{`.
LBrace,
/// The character `}`.
RBrace,
/// The character `.`.
Period,
/// The character `:`.
Colon,
/// The character `,`.
Comma,
/// The character `;`.
Semicolon,
/// The character `|`.
Pipe,
}
impl Token {
    /// Consumes the token and returns the text carried by a `Token::Ident`.
    ///
    /// # Panics
    ///
    /// Panics if the token is any variant other than `Token::Ident`.
    pub fn unwrap_ident(self) -> String {
        match self {
            Token::Ident(name) => name,
            other => panic!("token {:?} is not an identifier", other),
        }
    }

    /// Consumes the token and returns the text carried by a `Token::Code`.
    ///
    /// # Panics
    ///
    /// Panics if the token is any variant other than `Token::Code`.
    pub fn unwrap_code(self) -> String {
        match self {
            Token::Code(code) => code,
            other => panic!("token {:?} is not a code", other),
        }
    }
}
/// The reserved words the lexer recognises among bare words.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum Keyword {
/// The word `token`.
Token,
/// The word `epsilon`.
Epsilon,
/// The word `end`.
End,
}
/// A lexer over a stream of `(position, character)` pairs.
///
/// Token end positions are computed by adding the UTF-8 length of a character
/// to its position, so the positions are expected to be byte offsets such as
/// those produced by `str::char_indices`.
pub struct Lexer<T: Iterator<Item = (usize, char)>> {
    /// The character stream, wrapped to allow one item of lookahead.
    input: Peekable<T>,
}

impl<T: Iterator<Item = (usize, char)>> Lexer<T> {
    /// Creates a lexer that reads its characters from `input`.
    pub fn new(input: T) -> Self {
        let input = input.peekable();
        Self { input }
    }
}
/// Advances `input` past whitespace and comments and returns the first
/// character that belongs to a token, together with its position.
///
/// Two comment forms are skipped: `// ...` up to and including the next
/// newline, and `/* ... */`. Block comments do not nest. Returns `None` when
/// the input ends, including when it ends inside a comment. A `/` that does
/// not start a comment is returned like any other character.
fn next_relevant<I: Iterator<Item = (usize, char)>>(
    input: &mut Peekable<I>,
) -> Option<(usize, char)> {
    loop {
        let (pos, ch) = input.next()?;

        if ch == '/' {
            match input.peek().map(|&(_, follower)| follower) {
                Some('/') => {
                    // Consume the second slash, then everything through the
                    // end of the line.
                    input.next();
                    loop {
                        if input.next()?.1 == '\n' {
                            break;
                        }
                    }
                    continue;
                }
                Some('*') => {
                    // Consume the star, then scan for the closing `*/`.
                    input.next();
                    loop {
                        if input.next()?.1 != '*' {
                            continue;
                        }
                        // Only peek at what follows a star: if it is another
                        // star it must stay available to start the `*/` pair.
                        if let Some(&(_, '/')) = input.peek() {
                            input.next();
                            break;
                        }
                    }
                    continue;
                }
                _ => {}
            }
        }

        if !ch.is_whitespace() {
            return Some((pos, ch));
        }
    }
}
/// Returns `true` if `c` is one of the single-character punctuation tokens.
///
/// Bare words stop in front of these characters, so the set has to contain
/// every character the lexer turns into a punctuation token on its own,
/// including `{` and `}`. Without the braces, input such as `name{` would be
/// read as the single identifier `name{` instead of `name` followed by `{`.
fn is_symbol(c: char) -> bool {
    matches!(c, '(' | ')' | '{' | '}' | '.' | ':' | ',' | ';' | '|')
}
impl<T: Iterator<Item = (usize, char)>> Iterator for Lexer<T> {
    /// `(start, end, token)`: `start` is the position of the token's first
    /// character and `end` is the position just past its last character.
    type Item = (usize, usize, Token);

    /// Produces the next token, or `None` once only whitespace and comments
    /// remain.
    ///
    /// A quoted string or code literal that is still open when the input ends
    /// is returned with whatever text was collected up to that point.
    fn next(&mut self) -> Option<Self::Item> {
        let (start, first) = next_relevant(&mut self.input)?;
        let mut end = start + first.len_utf8();

        let token = match first {
            '(' => Token::LParen,
            ')' => Token::RParen,
            '{' => Token::LBrace,
            '}' => Token::RBrace,
            '.' => Token::Period,
            ':' => Token::Colon,
            ',' => Token::Comma,
            ';' => Token::Semicolon,
            '|' => Token::Pipe,

            // Single-quoted string: both quotes stay part of the text.
            '\'' => {
                let mut text = first.to_string();
                let mut escaped = false;
                for (pos, ch) in self.input.by_ref() {
                    // A trailing backslash escapes `ch` unless that backslash
                    // was itself the result of an escape.
                    escaped = !escaped && text.ends_with('\\');
                    if escaped {
                        text.pop();
                    }
                    text.push(ch);
                    end = pos + ch.len_utf8();
                    if !escaped && ch == '\'' {
                        break;
                    }
                }
                Token::Ident(text)
            }

            // Backtick-delimited code: the delimiters are dropped.
            '`' => {
                let mut text = String::new();
                let mut escaped = false;
                for (pos, ch) in self.input.by_ref() {
                    escaped = !escaped && text.ends_with('\\');
                    if escaped {
                        text.pop();
                    }
                    end = pos + ch.len_utf8();
                    if !escaped && ch == '`' {
                        break;
                    }
                    text.push(ch);
                }
                Token::Code(text)
            }

            // Bare word: runs up to the next whitespace or symbol character,
            // which is left in the stream for the following call.
            _ => {
                let mut word = first.to_string();
                loop {
                    match self.input.peek() {
                        Some(&(pos, ch)) if !(ch.is_whitespace() || is_symbol(ch)) => {
                            word.push(ch);
                            end = pos + ch.len_utf8();
                        }
                        _ => break,
                    }
                    self.input.next();
                }
                match word.as_str() {
                    "token" => Token::Keyword(Keyword::Token),
                    "epsilon" => Token::Keyword(Keyword::Epsilon),
                    "end" => Token::Keyword(Keyword::End),
                    _ => Token::Ident(word),
                }
            }
        };

        Some((start, end, token))
    }
}
#[cfg(test)]
mod tests {
    use super::Keyword as Kw;
    use super::Token::*;
    use super::*;

    /// Lexes `input` and returns only the tokens, dropping the spans.
    fn lex<S: AsRef<str>>(input: S) -> Vec<Token> {
        let lex = Lexer::new(input.as_ref().char_indices());
        lex.map(|(_, _, tkn)| tkn).collect()
    }

    /// Lexes `input` and returns every token together with its span.
    fn lex_spanned(input: &str) -> Vec<(usize, usize, Token)> {
        Lexer::new(input.char_indices()).collect()
    }

    #[test]
    fn tokens1() {
        assert_eq!(
            lex("token IDENT;"),
            vec![Keyword(Kw::Token), Ident("IDENT".into()), Semicolon]
        );
    }

    #[test]
    fn tokens2() {
        assert_eq!(
            lex("ident_list : ident_list ',' IDENT | IDENT ;"),
            vec![
                Ident("ident_list".into()),
                Colon,
                Ident("ident_list".into()),
                Ident("','".into()),
                Ident("IDENT".into()),
                Pipe,
                Ident("IDENT".into()),
                Semicolon,
            ]
        );
    }

    #[test]
    fn comment_single_line() {
        assert_eq!(lex("| // comment\n ; // comment"), vec![Pipe, Semicolon]);
    }

    #[test]
    fn comment_inline() {
        assert_eq!(lex("| /* comment */ ;"), vec![Pipe, Semicolon]);
    }

    #[test]
    fn comment_multiple_lines() {
        assert_eq!(lex("| /* comment \n comment */ ;"), vec![Pipe, Semicolon]);
    }

    #[test]
    fn quoted_escapes() {
        assert_eq!(
            lex("token 'some \\'stuff\\''"),
            vec![Keyword(Kw::Token), Ident("'some 'stuff''".into())]
        );
    }

    #[test]
    fn quoted_escapes_backslash() {
        assert_eq!(
            lex("token 'abc \\\\ def'"),
            vec![Keyword(Kw::Token), Ident("'abc \\ def'".into())]
        );
    }

    // Every punctuation character maps to its own token, with or without
    // whitespace between them.
    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) { } . : , ; |"),
            vec![LParen, RParen, LBrace, RBrace, Period, Colon, Comma, Semicolon, Pipe]
        );
        assert_eq!(lex("(){}"), vec![LParen, RParen, LBrace, RBrace]);
    }

    #[test]
    fn keywords() {
        assert_eq!(
            lex("token epsilon end"),
            vec![Keyword(Kw::Token), Keyword(Kw::Epsilon), Keyword(Kw::End)]
        );
    }

    // Backtick literals drop their delimiters.
    #[test]
    fn code_block() {
        assert_eq!(lex("`a + b`"), vec![Code("a + b".into())]);
    }

    // An escaped backtick does not terminate the literal.
    #[test]
    fn code_escapes() {
        assert_eq!(lex("`a \\` b`"), vec![Code("a ` b".into())]);
    }

    // Spans run from the first character to just past the last one.
    #[test]
    fn spans() {
        assert_eq!(
            lex_spanned("ab cd"),
            vec![(0, 2, Ident("ab".into())), (3, 5, Ident("cd".into()))]
        );
        assert_eq!(lex_spanned("'a'"), vec![(0, 3, Ident("'a'".into()))]);
        assert_eq!(lex_spanned(" `x` "), vec![(1, 4, Code("x".into()))]);
    }
}