use std::fmt;
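
// Every kind of token the lexer can emit: raw template text, block
// delimiters, literals, operators, and the tag keywords.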
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum TokenType {
    Text,
    Space,
    VariableStart, VariableEnd, Identifier, TagStart, TagEnd, String,
    Int,
    Float,
    Bool,
    Add, Substract, Multiply, Divide,
    Greater, GreaterOrEqual, Lower, LowerOrEqual, Equal, NotEqual,
    And, Or, Pipe,
    Error, Eof,
    If,
    Else,
    Elif,
    Endif,
    For,
    In,
    Endfor,
    Block,
    Endblock,
    Extends,
}
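
// Only operator tokens are ever rendered with Display, presumably so the
// parser can echo them in messages; any other variant reaching this impl is
// a programmer error, hence the unreachable!() arm.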
impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            TokenType::Add => write!(f, "+"),
            TokenType::Substract => write!(f, "-"),
            TokenType::Multiply => write!(f, "*"),
            TokenType::Divide => write!(f, "/"),
            TokenType::Greater => write!(f, ">"),
            TokenType::GreaterOrEqual => write!(f, ">="),
            TokenType::Equal => write!(f, "=="),
            TokenType::NotEqual => write!(f, "!="),
            TokenType::Lower => write!(f, "<"),
            TokenType::LowerOrEqual => write!(f, "<="),
            TokenType::And => write!(f, "&&"),
            TokenType::Or => write!(f, "||"),
            _ => unreachable!(),
        }
    }
}
#[derive(Clone, Debug)]
pub struct Token {
    pub kind: TokenType,
    pub value: String,
    pub line: usize,
    pub position: usize,
}
impl Token {
    pub fn new(kind: TokenType, input: &str, line: usize, position: usize) -> Token {
        Token {
            kind,
            value: input.to_owned(),
            line,
            position,
        }
    }
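
    // Binding power for the expression parser: higher binds tighter, so
    // `1 + 2 * 3` groups as `1 + (2 * 3)`; 0 marks non-operator tokens.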
    pub fn precedence(&self) -> usize {
        match self.kind {
            TokenType::Multiply | TokenType::Divide => 5,
            TokenType::Add | TokenType::Substract => 4,
            TokenType::Equal | TokenType::GreaterOrEqual | TokenType::Greater
                | TokenType::NotEqual | TokenType::LowerOrEqual | TokenType::Lower => 3,
            TokenType::And => 2,
            TokenType::Or => 1,
            _ => 0,
        }
    }
}
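
// A state function lexes until it knows what comes next and returns the next
// state to run; StateFn(None) stops the machine. This mirrors the
// state-function design from Rob Pike's "Lexical Scanning in Go" talk.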
// `fn` pointers are `Copy`, so the wrapper can derive it; `run` copies the
// current state out of `self` on every iteration.
#[derive(Clone, Copy)]
struct StateFn(Option<fn(&mut Lexer) -> StateFn>);
impl fmt::Debug for StateFn {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "")
    }
}
#[derive(Debug)]
enum BlockType {
    Variable,
    Block,
}

#[derive(Debug)]
enum DelimiterSide {
    Left,
    Right,
}
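
// Sentinel char returned by next_char once the input is exhausted; an
// arbitrary codepoint chosen as unlikely to appear in real templates.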
const EOF: char = '֍';
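
// A minimal usage sketch: feed the lexer a template, run it to completion,
// then read the accumulated tokens (the stream ends with Eof, or with an
// Error token on bad input):
//
//     let mut lexer = Lexer::new("template", "Hello {{ name }}");
//     lexer.run();
//     assert_eq!(lexer.tokens.last().unwrap().kind, TokenType::Eof);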
#[derive(Debug)]
pub struct Lexer {
    name: String,
    input: String,
    // Byte offset and value of every char in `input`.
    chars: Vec<(usize, char)>,
    // Byte offset where the current token starts.
    start: usize,
    // Current byte offset in `input`.
    position: usize,
    last_position: usize,
    // Index into `chars`.
    current_char: usize,
    state: StateFn,
    current_block_type: BlockType,
    pub tokens: Vec<Token>,
}
impl Lexer {
    pub fn new(name: &str, input: &str) -> Lexer {
        Lexer {
            name: name.to_owned(),
            input: input.to_owned(),
            chars: input.char_indices().collect(),
            start: 0,
            position: 0,
            last_position: 0,
            current_char: 0,
            tokens: vec![],
            current_block_type: BlockType::Variable,
            state: StateFn(Some(lex_text)),
        }
    }
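
    // Drive the state machine: repeatedly call the current state function
    // until one of them returns StateFn(None).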
    pub fn run(&mut self) {
        while let StateFn(Some(state_fn)) = self.state {
            self.state = state_fn(self);
        }
    }
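
    // Advance by one char, tracking *byte* offsets so that multi-byte UTF-8
    // characters (like the 世界 used in the tests) keep `position` usable for
    // slicing `input`.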
    fn next_char(&mut self) -> char {
        if self.is_over() {
            return EOF;
        }
        let current_char = self.chars[self.current_char];
        // Width in bytes of the current char: the distance to the next
        // char's byte offset, or to the end of the input for the last char.
        let width = if self.current_char < self.chars.len() - 1 {
            self.chars[self.current_char + 1].0 - current_char.0
        } else {
            self.input.len() - current_char.0
        };
        self.last_position = self.position;
        self.position += width;
        self.current_char += 1;
        current_char.1
    }
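
    // Step back one char. Only a single previous position is remembered, so
    // this is valid at most once per call to `next_char`.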
    fn backup(&mut self) {
        self.position = self.last_position;
        self.current_char -= 1;
    }

    fn peek(&mut self) -> char {
        let next_char = self.next_char();
        self.backup();
        next_char
    }
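
    // Recompute the 1-based line number on demand by counting the newlines
    // before the current position.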
    fn get_line_number(&self) -> usize {
        1 + self.get_substring(0, self.last_position)
            .chars()
            .filter(|&c| c == '\n')
            .count()
    }
    fn add_text_token(&mut self) {
        // Only emit a Text token if some text was actually consumed.
        if self.position > self.start {
            self.add_token(TokenType::Text);
        }
    }

    // `start` and `end` are byte offsets that always fall on char
    // boundaries, since they come from `char_indices`.
    fn get_substring(&self, start: usize, end: usize) -> String {
        self.input[start..end].to_string()
    }

    fn add_token(&mut self, kind: TokenType) {
        let line = self.get_line_number();
        let mut substring = self.get_substring(self.start, self.position);
        if kind == TokenType::String {
            // Store string literals without their surrounding quotes.
            substring = substring.replace("\"", "");
        }
        self.tokens.push(Token::new(kind, &substring, line, self.start));
        self.start = self.position;
    }

    // Consume the next char only if it equals `valid`.
    fn accept(&mut self, valid: char) -> bool {
        if self.next_char() == valid {
            return true;
        }
        self.backup();
        false
    }

    fn starts_with(&self, pattern: &str) -> bool {
        self.get_substring(self.position, self.input.len()).starts_with(pattern)
    }
    fn error(&mut self, message: &str) -> StateFn {
        let line = self.get_line_number();
        self.tokens.push(
            Token::new(TokenType::Error, message, line, self.position)
        );
        StateFn(None)
    }

    fn is_over(&self) -> bool {
        self.position >= self.input.len()
    }
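
    // All four delimiters (`{{`, `}}`, `{%`, `%}`) are two ASCII characters,
    // so both the byte position and the char index advance by 2.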
    fn add_delimiter(&mut self, side: DelimiterSide) -> StateFn {
        self.position += 2;
        match self.current_block_type {
            BlockType::Block => match side {
                DelimiterSide::Left => self.add_token(TokenType::TagStart),
                DelimiterSide::Right => self.add_token(TokenType::TagEnd),
            },
            BlockType::Variable => match side {
                DelimiterSide::Left => self.add_token(TokenType::VariableStart),
                DelimiterSide::Right => self.add_token(TokenType::VariableEnd),
            }
        }
        self.start = self.position;
        self.current_char += 2;
        match side {
            DelimiterSide::Left => StateFn(Some(lex_inside_block)),
            DelimiterSide::Right => StateFn(Some(lex_text)),
        }
    }
}
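
// Initial state: accumulate plain text until a `{{` or `{%` delimiter opens
// a block, or the input ends.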
fn lex_text(lexer: &mut Lexer) -> StateFn {
    while !lexer.is_over() {
        match lexer.chars[lexer.current_char].1 {
            '{' => {
                if lexer.starts_with("{{") {
                    lexer.add_text_token();
                    lexer.current_block_type = BlockType::Variable;
                    return lexer.add_delimiter(DelimiterSide::Left);
                } else if lexer.starts_with("{%") {
                    lexer.add_text_token();
                    lexer.current_block_type = BlockType::Block;
                    return lexer.add_delimiter(DelimiterSide::Left);
                }
                // A lone `{` is plain text: consume it, or the loop would
                // spin on the same character forever.
                lexer.next_char();
            },
            _ => {
                if lexer.next_char() == EOF {
                    break;
                }
            }
        }
    }
    lexer.add_text_token();
    lexer.add_token(TokenType::Eof);
    StateFn(None)
}
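
// Collapse a run of whitespace inside a block into a single Space token.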
fn lex_space(lexer: &mut Lexer) -> StateFn {
    if lexer.peek() == EOF {
        return lexer.error("Found EOF while lexing spaces");
    }
    while lexer.peek().is_whitespace() {
        lexer.next_char();
    }
    lexer.add_token(TokenType::Space);
    StateFn(Some(lex_inside_block))
}
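
// Lex an integer literal, upgrading it to a float on the first `.`; a second
// dot is reported as an error.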
fn lex_number(lexer: &mut Lexer) -> StateFn {
    let mut token_type = TokenType::Int;
    loop {
        match lexer.next_char() {
            x if x.is_numeric() => continue,
            '.' => {
                if token_type == TokenType::Int {
                    token_type = TokenType::Float;
                } else {
                    return lexer.error("Two dots in a number");
                }
            },
            _ => {
                lexer.backup();
                lexer.add_token(token_type);
                return StateFn(Some(lex_inside_block));
            }
        }
    }
}
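
// Lex an identifier (dotted access like `user.name` is allowed) and promote
// keywords and boolean literals to their dedicated token types.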
fn lex_identifier(lexer: &mut Lexer) -> StateFn {
    loop {
        match lexer.next_char() {
            x if x.is_alphanumeric() || x == '_' || x == '.' => continue,
            EOF => return lexer.error("EOF while reading identifier"),
            _ => {
                lexer.backup();
                match lexer.get_substring(lexer.start, lexer.position).as_ref() {
                    "if" => lexer.add_token(TokenType::If),
                    "else" => lexer.add_token(TokenType::Else),
                    "elif" => lexer.add_token(TokenType::Elif),
                    "endif" => lexer.add_token(TokenType::Endif),
                    "for" => lexer.add_token(TokenType::For),
                    "in" => lexer.add_token(TokenType::In),
                    "endfor" => lexer.add_token(TokenType::Endfor),
                    "true" | "false" => lexer.add_token(TokenType::Bool),
                    "block" => lexer.add_token(TokenType::Block),
                    "endblock" => lexer.add_token(TokenType::Endblock),
                    "extends" => lexer.add_token(TokenType::Extends),
                    _ => lexer.add_token(TokenType::Identifier),
                }
                return StateFn(Some(lex_inside_block));
            }
        }
    }
}
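
// Lex a double-quoted string literal; the surrounding quotes are stripped
// later in `add_token`.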
fn lex_string(lexer: &mut Lexer) -> StateFn {
    loop {
        match lexer.next_char() {
            // Without this arm an unterminated string would loop forever,
            // since `next_char` keeps returning the EOF sentinel.
            EOF => return lexer.error("EOF while reading string"),
            '"' => {
                lexer.add_token(TokenType::String);
                return StateFn(Some(lex_inside_block));
            },
            _ => {}
        }
    }
}
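
// Main state inside `{{ }}` and `{% %}` blocks: dispatch to the specialised
// states above, or lex one- and two-character operators in place.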
fn lex_inside_block(lexer: &mut Lexer) -> StateFn {
    while !lexer.is_over() {
        if lexer.starts_with("}}") || lexer.starts_with("%}") {
            return lexer.add_delimiter(DelimiterSide::Right);
        }
        match lexer.next_char() {
            EOF => return lexer.error("EOF while parsing a tag"),
            x if x.is_whitespace() => return StateFn(Some(lex_space)),
            x if x.is_numeric() => return StateFn(Some(lex_number)),
            x if x.is_alphabetic() || x == '_' || x == '.' => return StateFn(Some(lex_identifier)),
            '"' => return StateFn(Some(lex_string)),
            '-' => lexer.add_token(TokenType::Substract),
            '+' => lexer.add_token(TokenType::Add),
            '*' => lexer.add_token(TokenType::Multiply),
            '/' => lexer.add_token(TokenType::Divide),
            '=' => {
                if lexer.accept('=') {
                    lexer.add_token(TokenType::Equal);
                } else {
                    let got = lexer.peek();
                    // Stop on a malformed operator rather than lexing past it.
                    return lexer.error(&format!("Expected `=` after =, got {}", got));
                }
            },
            '&' => {
                if lexer.accept('&') {
                    lexer.add_token(TokenType::And);
                } else {
                    let got = lexer.peek();
                    return lexer.error(&format!("Expected `&` after &, got {}", got));
                }
            },
            '|' => {
                if lexer.accept('|') {
                    lexer.add_token(TokenType::Or);
                } else {
                    lexer.add_token(TokenType::Pipe);
                }
            },
            '!' => {
                if lexer.accept('=') {
                    lexer.add_token(TokenType::NotEqual);
                } else {
                    let got = lexer.peek();
                    return lexer.error(&format!("Expected `=` after !, got {}", got));
                }
            },
            '<' => {
                if lexer.accept('=') {
                    lexer.add_token(TokenType::LowerOrEqual);
                } else {
                    lexer.add_token(TokenType::Lower);
                }
            },
            '>' => {
                if lexer.accept('=') {
                    lexer.add_token(TokenType::GreaterOrEqual);
                } else {
                    lexer.add_token(TokenType::Greater);
                }
            },
            c => return lexer.error(&format!("Unexpected character: {}", c)),
        }
    }
    lexer.error("Unclosed Delimiter")
}
#[cfg(test)]
mod tests {
    use super::{TokenType, Lexer};
    use super::TokenType::*;

    // Expected token in a test case: only `kind` and `value` are compared
    // against the lexer output (plus `position` for the first token).
    #[derive(Debug)]
    struct TokenTest<'a> {
        kind: TokenType,
        value: &'a str,
    }

    impl<'a> TokenTest<'a> {
        fn new(kind: TokenType, value: &'a str) -> TokenTest<'a> {
            TokenTest { kind, value }
        }
    }
    const T_TAG_START: TokenTest<'static> = TokenTest { kind: TagStart, value: "{%" };
    const T_TAG_END: TokenTest<'static> = TokenTest { kind: TagEnd, value: "%}" };
    const T_VARIABLE_START: TokenTest<'static> = TokenTest { kind: VariableStart, value: "{{" };
    const T_VARIABLE_END: TokenTest<'static> = TokenTest { kind: VariableEnd, value: "}}" };
    const T_EOF: TokenTest<'static> = TokenTest { kind: Eof, value: "" };
    const T_ADD: TokenTest<'static> = TokenTest { kind: Add, value: "+" };
    const T_SUBSTRACT: TokenTest<'static> = TokenTest { kind: Substract, value: "-" };
    const T_MULTIPLY: TokenTest<'static> = TokenTest { kind: Multiply, value: "*" };
    const T_DIVIDE: TokenTest<'static> = TokenTest { kind: Divide, value: "/" };
    const T_SPACE: TokenTest<'static> = TokenTest { kind: Space, value: " " };
    const T_IF: TokenTest<'static> = TokenTest { kind: If, value: "if" };
    const T_ELSE: TokenTest<'static> = TokenTest { kind: Else, value: "else" };
    const T_ELIF: TokenTest<'static> = TokenTest { kind: Elif, value: "elif" };
    const T_ENDIF: TokenTest<'static> = TokenTest { kind: Endif, value: "endif" };
    const T_FOR: TokenTest<'static> = TokenTest { kind: For, value: "for" };
    const T_IN: TokenTest<'static> = TokenTest { kind: In, value: "in" };
    const T_ENDFOR: TokenTest<'static> = TokenTest { kind: Endfor, value: "endfor" };
    const T_BLOCK: TokenTest<'static> = TokenTest { kind: Block, value: "block" };
    const T_ENDBLOCK: TokenTest<'static> = TokenTest { kind: Endblock, value: "endblock" };
    const T_EXTENDS: TokenTest<'static> = TokenTest { kind: Extends, value: "extends" };
    const T_GREATER: TokenTest<'static> = TokenTest { kind: Greater, value: ">" };
    const T_GREATER_OR_EQUAL: TokenTest<'static> = TokenTest { kind: GreaterOrEqual, value: ">=" };
    const T_LOWER: TokenTest<'static> = TokenTest { kind: Lower, value: "<" };
    const T_LOWER_OR_EQUAL: TokenTest<'static> = TokenTest { kind: LowerOrEqual, value: "<=" };
    const T_EQUAL: TokenTest<'static> = TokenTest { kind: Equal, value: "==" };
    const T_NOTEQUAL: TokenTest<'static> = TokenTest { kind: NotEqual, value: "!=" };
    const T_AND: TokenTest<'static> = TokenTest { kind: And, value: "&&" };
    const T_OR: TokenTest<'static> = TokenTest { kind: Or, value: "||" };
    fn identifier_token(ident: &str) -> TokenTest {
        TokenTest::new(Identifier, ident)
    }

    fn string_token(string: &str) -> TokenTest {
        TokenTest::new(String, string)
    }

    fn text_token(text: &str) -> TokenTest {
        TokenTest::new(Text, text)
    }

    fn int_token(value: &str) -> TokenTest {
        TokenTest::new(Int, value)
    }

    fn float_token(value: &str) -> TokenTest {
        TokenTest::new(Float, value)
    }

    fn error_token(msg: &str) -> TokenTest {
        TokenTest::new(Error, msg)
    }
    fn test_tokens(input: &str, expected: Vec<TokenTest>) {
        let mut lexer = Lexer::new("test", input);
        lexer.run();
        if expected.len() != lexer.tokens.len() {
            println!("{:#?}", lexer.tokens);
            panic!("Number of tokens not matching: expected {}, got {}",
                   expected.len(), lexer.tokens.len());
        }
        for (i, t) in expected.iter().enumerate() {
            let lexer_token = &lexer.tokens[i];
            if i == 0 {
                assert_eq!(lexer_token.position, 0);
            }
            assert_eq!(t.kind, lexer_token.kind,
                       "Wrong kind. Expected: {:?}, got: {:?}", t, lexer_token);
            assert_eq!(t.value, lexer_token.value,
                       "Wrong value. Expected: {:?}, got: {:?}", t, lexer_token);
        }
    }
    #[test]
    fn test_empty() {
        let expected = vec![T_EOF];
        test_tokens("", expected);
    }

    #[test]
    fn test_only_text() {
        let expected = vec![text_token("Hello\n 世界"), T_EOF];
        test_tokens("Hello\n 世界", expected);
    }

    #[test]
    fn test_variable_block_and_text() {
        let expected = vec![
            T_VARIABLE_START,
            T_SPACE,
            identifier_token("greeting"),
            T_SPACE,
            T_VARIABLE_END,
            text_token(" 世界"),
            T_EOF
        ];
        test_tokens("{{ greeting }} 世界", expected);
    }

    #[test]
    fn test_valid_numbers() {
        let expected = vec![
            T_VARIABLE_START,
            T_SPACE,
            int_token("1"),
            T_SPACE,
            float_token("3.14"),
            T_SPACE,
            T_VARIABLE_END,
            T_EOF
        ];
        test_tokens("{{ 1 3.14 }}", expected);
    }

    #[test]
    fn test_numbers_and_variable() {
        let expected = vec![
            T_VARIABLE_START,
            T_SPACE,
            int_token("1"),
            T_SPACE,
            T_MULTIPLY,
            T_SPACE,
            identifier_token("vat_rate"),
            T_SPACE,
            T_VARIABLE_END,
            T_EOF
        ];
        test_tokens("{{ 1 * vat_rate }}", expected);
    }

    #[test]
    fn test_operators() {
        let expected = vec![
            T_VARIABLE_START,
            T_SUBSTRACT,
            T_ADD,
            T_MULTIPLY,
            T_DIVIDE,
            T_EQUAL,
            T_AND,
            T_LOWER_OR_EQUAL,
            T_GREATER_OR_EQUAL,
            T_NOTEQUAL,
            T_OR,
            T_VARIABLE_END,
            T_EOF
        ];
        test_tokens("{{-+*/==&&<=>=!=||}}", expected);
    }

    #[test]
    fn test_tag() {
        let expected = vec![
            text_token("Hello "),
            T_TAG_START,
            T_SPACE,
            T_IF,
            T_SPACE,
            identifier_token("japanese"),
            T_SPACE,
            T_TAG_END,
            text_token("世界"),
            T_TAG_START,
            T_SPACE,
            T_ELSE,
            T_SPACE,
            T_TAG_END,
            text_token("world"),
            T_TAG_START,
            T_SPACE,
            T_ENDIF,
            T_SPACE,
            T_TAG_END,
            T_EOF
        ];
        test_tokens("Hello {% if japanese %}世界{% else %}world{% endif %}", expected);
    }

    #[test]
    fn test_unclosed_block() {
        let expected = vec![T_VARIABLE_START, error_token("Unclosed Delimiter")];
        test_tokens("{{", expected);
    }

    #[test]
    fn test_invalid_number() {
        let expected = vec![T_VARIABLE_START, error_token("Two dots in a number")];
        test_tokens("{{1.2.2", expected);
    }

    #[test]
    fn test_block() {
        let expected = vec![
            T_TAG_START,
            T_SPACE,
            T_BLOCK,
            T_SPACE,
            identifier_token("hello"),
            T_SPACE,
            T_TAG_END,
            T_TAG_START,
            T_SPACE,
            T_ENDBLOCK,
            T_SPACE,
            identifier_token("hello"),
            T_SPACE,
            T_TAG_END,
            T_EOF
        ];
        test_tokens("{% block hello %}{% endblock hello %}", expected);
    }

    #[test]
    fn test_extends() {
        let expected = vec![
            T_TAG_START,
            T_SPACE,
            T_EXTENDS,
            T_SPACE,
            string_token("main.html"),
            T_SPACE,
            T_TAG_END,
            T_EOF
        ];
        test_tokens("{% extends \"main.html\" %}", expected);
    }

    #[test]
    fn test_unterminated_with_newline() {
        let expected = vec![
            T_VARIABLE_START,
            T_SPACE,
            identifier_token("hello"),
            T_SPACE,
            error_token("Found EOF while lexing spaces")
        ];
        test_tokens("{{ hello \n", expected);
    }
}