use std::str::FromStr;
use simple_lexer_bootstrap::Lexer as LexerBootstrap;
use crate::{
grammar::{
LEXER_PRODUCTIONS,
PARSER_PRODUCTIONS,
Nonterminal,
as_productions,
},
Token,
};
use simple_parser_bootstrap::Parser;
// Module-local shorthand: fallible operations here report failure as a static string message.
type Result<T> = std::result::Result<T, &'static str>;
/// A lexer producing tokens whose kind is of type `T`.
///
/// This is a thin wrapper: the only state it holds is the underlying
/// `simple_lexer_bootstrap` lexer it hands work to.
pub struct Lexer<T> {
    lexer: LexerBootstrap<T>,
}
impl<T: Clone + FromStr + Ord> Lexer<T> {
    /// Builds a lexer from a textual list of productions.
    ///
    /// `productions` is tokenized with a bootstrap lexer built from
    /// `LEXER_PRODUCTIONS`, parsed with a bootstrap parser built from
    /// `PARSER_PRODUCTIONS` (rooted at `Nonterminal::Root`), and the resulting
    /// parse tree is converted by `as_productions` into the productions that
    /// drive the returned lexer.
    ///
    /// # Errors
    ///
    /// Returns `Err` when `productions` cannot be tokenized, cannot be parsed,
    /// or cannot be converted by `as_productions`.
    pub fn new(productions: &str) -> Result<Lexer<T>> {
        let lexer = LexerBootstrap::new(LEXER_PRODUCTIONS.clone());
        let parser = Parser::new(PARSER_PRODUCTIONS.clone(), Nonterminal::Root);
        let tokens = lexer.lex(productions)?;
        // The text comes from the caller, so a grammar that fails to parse is a
        // recoverable error rather than a broken invariant: report it through the
        // `Result` instead of panicking.
        // NOTE(review): the parser's own error value is dropped here because its
        // type is not visible from this file — if it is `&'static str`, a plain `?`
        // would preserve the original message.
        let parse_tree = parser
            .parse(&tokens)
            .map_err(|_| "failed to parse lexer productions")?;
        let productions = as_productions(&parse_tree)?;
        Ok(Lexer { lexer: LexerBootstrap::new(productions) })
    }

    /// Tokenizes `text` by delegating to the wrapped bootstrap lexer.
    ///
    /// # Errors
    ///
    /// Propagates whatever error the wrapped lexer reports for `text`.
    pub fn lex(&self, text: &str) -> Result<Vec<Token<T>>> {
        self.lexer.lex(text)
    }
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use crate::{
Lexer,
Token,
};
use super::Result;
// Single-character productions. The `/ /` production names no token kind, and
// the test expects the spaces in the input to yield no tokens.
#[test]
fn test_1() -> Result<()> {
    #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
    enum TokenKind {
        A,
        B,
    }
    impl FromStr for TokenKind {
        type Err = &'static str;
        fn from_str(text: &str) -> Result<Self> {
            use TokenKind::*;
            match text {
                "A" => Ok(A),
                "B" => Ok(B),
                _ => Err("not token kind")
            }
        }
    }
    use TokenKind::*;
    let lexer = Lexer::new(r#"
/A/ => A;
/B/ => B;
/ / => ;
"#)?;
    let expected = vec![
        Token::new(A, "A"),
        Token::new(B, "B"),
        Token::new(A, "A"),
    ];
    let actual = lexer.lex("A B A ")?;
    assert_eq!(expected, actual);
    Ok(())
}
// Repetition productions. The test expects each run of a repeated letter to
// come back as one token, with the space between runs yielding no token.
#[test]
fn test_2() -> Result<()> {
    #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
    // Variant names must match the kind names written in the productions below.
    #[allow(non_camel_case_types)]
    enum TokenKind {
        A_REP,
        B_REP
    }
    impl FromStr for TokenKind {
        type Err = &'static str;
        fn from_str(text: &str) -> Result<Self> {
            use TokenKind::*;
            match text {
                "A_REP" => Ok(A_REP),
                "B_REP" => Ok(B_REP),
                _ => Err("not token kind")
            }
        }
    }
    use TokenKind::*;
    let lexer = Lexer::new(r#"
/A*/ => A_REP;
/B*/ => B_REP;
/ / => ;
"#)?;
    let expected = vec![
        Token::new(A_REP, "AAAAAAA"),
        Token::new(B_REP, "BBBB"),
        Token::new(B_REP, "BBBB"),
    ];
    let actual = lexer.lex("AAAAAAABBBB BBBB")?;
    assert_eq!(expected, actual);
    Ok(())
}
// Overlapping productions. For the input "ABB" the test expects the longer
// `AB` match to be taken at the start, followed by a single `B`.
#[test]
fn test_3() -> Result<()> {
    #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
    enum TokenKind {
        A,
        AB,
        BB,
        B,
    }
    impl FromStr for TokenKind {
        type Err = &'static str;
        fn from_str(text: &str) -> Result<Self> {
            use TokenKind::*;
            match text {
                "A" => Ok(A),
                "AB" => Ok(AB),
                "BB" => Ok(BB),
                "B" => Ok(B),
                _ => Err("not token kind")
            }
        }
    }
    use TokenKind::*;
    let lexer = Lexer::new(r#"
/A/ => A;
/AB/ => AB;
/BB/ => BB;
/B/ => B;
"#)?;
    let expected = vec![
        Token::new(AB, "AB"),
        Token::new(B, "B"),
    ];
    let actual = lexer.lex("ABB")?;
    assert_eq!(expected, actual);
    Ok(())
}
// Exercises a larger production list describing the pieces of a regular
// expression (operators, brackets, digits and several escape forms) against
// one input that mixes them, including a non-ASCII character.
#[test]
fn test_4() -> Result<()> {
    // Variant names must match the kind names written in the productions below.
    #[allow(non_camel_case_types)]
    #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
    enum TokenKind {
        VERTICAL_BAR,
        ASTERISK,
        PLUS_SIGN,
        QUESTION_MARK,
        LEFT_PARENTHESIS,
        RIGHT_PARENTHESIS,
        LEFT_SQUARE_BRACKET,
        RIGHT_SQUARE_BRACKET,
        LEFT_CURLY_BRACKET,
        RIGHT_CURLY_BRACKET,
        CARET,
        HYPHEN,
        COMMA,
        DIGIT,
        CONTROL,
        UNESCAPED,
        ESCAPED,
        OCTAL,
        HEXADECIMAL,
        UNICODE,
    }

    impl FromStr for TokenKind {
        type Err = &'static str;

        // Maps a kind name to its variant; any other name is rejected.
        fn from_str(name: &str) -> Result<Self> {
            let kind = match name {
                "VERTICAL_BAR" => Self::VERTICAL_BAR,
                "ASTERISK" => Self::ASTERISK,
                "PLUS_SIGN" => Self::PLUS_SIGN,
                "QUESTION_MARK" => Self::QUESTION_MARK,
                "LEFT_PARENTHESIS" => Self::LEFT_PARENTHESIS,
                "RIGHT_PARENTHESIS" => Self::RIGHT_PARENTHESIS,
                "LEFT_SQUARE_BRACKET" => Self::LEFT_SQUARE_BRACKET,
                "RIGHT_SQUARE_BRACKET" => Self::RIGHT_SQUARE_BRACKET,
                "LEFT_CURLY_BRACKET" => Self::LEFT_CURLY_BRACKET,
                "RIGHT_CURLY_BRACKET" => Self::RIGHT_CURLY_BRACKET,
                "CARET" => Self::CARET,
                "HYPHEN" => Self::HYPHEN,
                "COMMA" => Self::COMMA,
                "DIGIT" => Self::DIGIT,
                "CONTROL" => Self::CONTROL,
                "UNESCAPED" => Self::UNESCAPED,
                "ESCAPED" => Self::ESCAPED,
                "OCTAL" => Self::OCTAL,
                "HEXADECIMAL" => Self::HEXADECIMAL,
                "UNICODE" => Self::UNICODE,
                _ => return Err("not token kind"),
            };
            Ok(kind)
        }
    }

    use TokenKind::*;

    let regex_lexer = Lexer::new(r#"
/\|/ => VERTICAL_BAR;
/\*/ => ASTERISK;
/\+/ => PLUS_SIGN;
/\?/ => QUESTION_MARK;
/\(/ => LEFT_PARENTHESIS;
/\)/ => RIGHT_PARENTHESIS;
/\[/ => LEFT_SQUARE_BRACKET;
/\]/ => RIGHT_SQUARE_BRACKET;
/\{/ => LEFT_CURLY_BRACKET;
/\}/ => RIGHT_CURLY_BRACKET;
/\^/ => CARET;
/\-/ => HYPHEN;
/,/ => COMMA;
/[0-9]/ => DIGIT;
/\\[nrt]/ => CONTROL;
/[^\/\|\*\+\?\(\)\[\]\{\}\^\-,0-9\n\r\t\\]/ => UNESCAPED;
/\\[\/\|\*\+\?\(\)\[\]\{\}\^\-\\]/ => ESCAPED;
/\\[0-7]{1,3}/ => OCTAL;
/\\x[0-9a-fA-F]{1,2}/ => HEXADECIMAL;
/\\(u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})/ => UNICODE;
"#)?;

    // Expected (kind, text) pairs, in the order they appear in the input.
    let expected: Vec<_> = [
        (LEFT_SQUARE_BRACKET, "["),
        (UNESCAPED, "A"),
        (UNESCAPED, "🦄"),
        (ESCAPED, "\\^"),
        (RIGHT_SQUARE_BRACKET, "]"),
        (LEFT_CURLY_BRACKET, "{"),
        (DIGIT, "1"),
        (COMMA, ","),
        (DIGIT, "2"),
        (RIGHT_CURLY_BRACKET, "}"),
        (UNICODE, "\\UDEADBEEF"),
        (OCTAL, "\\777"),
        (HEXADECIMAL, "\\x45"),
    ]
    .iter()
    .map(|&(kind, text)| Token::new(kind, text))
    .collect();

    let input = "[A🦄\\^]{1,2}\\UDEADBEEF\\777\\x45";
    let actual = regex_lexer.lex(input)?;
    assert_eq!(expected, actual);
    Ok(())
}
}