1use std::str::FromStr;
2use simple_lexer_bootstrap::Lexer as LexerBootstrap;
3use crate::{
4 grammar::{
5 LEXER_PRODUCTIONS,
6 PARSER_PRODUCTIONS,
7 Nonterminal,
8 as_productions,
9 },
10 Token,
11};
12use simple_parser_bootstrap::Parser;
13
/// Crate-local result alias: all fallible operations report a static
/// description string as the error.
type Result<T> = std::result::Result<T, &'static str>;
15
/// A lexer constructed at runtime from a textual production specification.
///
/// `T` is the token-kind type attached to each produced lexeme; it must be
/// parseable from the names used on the right-hand side of the productions.
pub struct Lexer<T> {
    // Table-driven bootstrap lexer built from the parsed productions.
    lexer: LexerBootstrap<T>
}
19
20impl<T: Clone + FromStr + Ord> Lexer<T> {
21 pub fn new(productions: &str) -> Result<Lexer<T>> {
22 let lexer = LexerBootstrap::new(LEXER_PRODUCTIONS.clone());
23 let parser = Parser::new(PARSER_PRODUCTIONS.clone(), Nonterminal::Root);
24 let tokens = lexer.lex(productions)?;
25 let parse_tree = parser.parse(&tokens).unwrap();
26 let productions = as_productions(&parse_tree)?;
27 Ok(Lexer { lexer: LexerBootstrap::new(productions) })
28 }
29
30 pub fn lex(&self, text: &str) -> Result<Vec<Token<T>>> {
31 self.lexer.lex(text)
32 }
33}
34
35#[cfg(test)]
36mod tests {
37 use std::str::FromStr;
38 use crate::{
39 Lexer,
40 Token,
41 };
42 use super::Result;
43
44 #[test]
45 fn test_1() -> Result<()> {
46 #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
47 enum TokenKind {
48 A,
49 B,
50 };
51 impl FromStr for TokenKind {
52 type Err = &'static str;
53 fn from_str(text: &str) -> Result<Self> {
54 use TokenKind::*;
55 match text {
56 "A" => Ok(A),
57 "B" => Ok(B),
58 _ => Err("not token kind")
59 }
60 }
61 }
62 use TokenKind::*;
63 let lexer = Lexer::new(r#"
64 /A/ => A;
65 /B/ => B;
66 / / => ;
67 "#)?;
68 let expected = vec![
69 Token::new(A, "A"),
70 Token::new(B, "B"),
71 Token::new(A, "A"),
72 ];
73 let actual = lexer.lex("A B A ")?;
74 assert_eq!(expected, actual);
75 Ok(())
76 }
77
78 #[test]
79 fn test_2() -> Result<()> {
80 #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
81 #[allow(non_camel_case_types)]
82 enum TokenKind {
83 A_REP,
84 B_REP
85 };
86 impl FromStr for TokenKind {
87 type Err = &'static str;
88 fn from_str(text: &str) -> Result<Self> {
89 use TokenKind::*;
90 match text {
91 "A_REP" => Ok(A_REP),
92 "B_REP" => Ok(B_REP),
93 _ => Err("not token kind")
94 }
95 }
96 }
97 use TokenKind::*;
98 let lexer = Lexer::new(r#"
99 /A*/ => A_REP;
100 /B*/ => B_REP;
101 / / => ;
102 "#)?;
103 let expected = vec![
104 Token::new(A_REP, "AAAAAAA"),
105 Token::new(B_REP, "BBBB"),
106 Token::new(B_REP, "BBBB"),
107 ];
108 let actual = lexer.lex("AAAAAAABBBB BBBB")?;
109 assert_eq!(expected, actual);
110 Ok(())
111 }
112
113 #[test]
114 fn test_3() -> Result<()> {
115 #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
116 #[allow(non_camel_case_types)]
117 enum TokenKind {
118 A,
119 AB,
120 BB,
121 B,
122 };
123 impl FromStr for TokenKind {
124 type Err = &'static str;
125 fn from_str(text: &str) -> Result<Self> {
126 use TokenKind::*;
127 match text {
128 "A" => Ok(A),
129 "AB" => Ok(AB),
130 "BB" => Ok(BB),
131 "B" => Ok(B),
132 _ => Err("not token kind")
133 }
134 }
135 }
136 use TokenKind::*;
137 let lexer = Lexer::new(r#"
138 /A/ => A;
139 /AB/ => AB;
140 /BB/ => BB;
141 /B/ => B;
142 "#)?;
143 let expected = vec![
144 Token::new(AB, "AB"),
145 Token::new(B, "B"),
146 ];
147 let actual = lexer.lex("ABB")?;
148 assert_eq!(expected, actual);
149 Ok(())
150 }
151
    // Exercises a full regex-metacharacter token vocabulary: every
    // metacharacter, escapes, octal/hex/unicode escape sequences, and a
    // catch-all UNESCAPED class (which must admit multi-byte characters
    // such as the emoji below).
    #[test]
    fn test_4() -> Result<()> {
        #[allow(non_camel_case_types)]
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            VERTICAL_BAR,
            ASTERISK,
            PLUS_SIGN,
            QUESTION_MARK,
            LEFT_PARENTHESIS,
            RIGHT_PARENTHESIS,
            LEFT_SQUARE_BRACKET,
            RIGHT_SQUARE_BRACKET,
            LEFT_CURLY_BRACKET,
            RIGHT_CURLY_BRACKET,
            CARET,
            HYPHEN,
            COMMA,
            DIGIT,
            CONTROL,
            UNESCAPED,
            ESCAPED,
            OCTAL,
            HEXADECIMAL,
            UNICODE,
        }
        impl FromStr for TokenKind {
            type Err = &'static str;
            // Maps each production's right-hand-side name to its kind.
            fn from_str(text: &str) -> Result<Self> {
                use TokenKind::*;
                match text {
                    "VERTICAL_BAR" => Ok(VERTICAL_BAR),
                    "ASTERISK" => Ok(ASTERISK),
                    "PLUS_SIGN" => Ok(PLUS_SIGN),
                    "QUESTION_MARK" => Ok(QUESTION_MARK),
                    "LEFT_PARENTHESIS" => Ok(LEFT_PARENTHESIS),
                    "RIGHT_PARENTHESIS" => Ok(RIGHT_PARENTHESIS),
                    "LEFT_SQUARE_BRACKET" => Ok(LEFT_SQUARE_BRACKET),
                    "RIGHT_SQUARE_BRACKET" => Ok(RIGHT_SQUARE_BRACKET),
                    "LEFT_CURLY_BRACKET" => Ok(LEFT_CURLY_BRACKET),
                    "RIGHT_CURLY_BRACKET" => Ok(RIGHT_CURLY_BRACKET),
                    "CARET" => Ok(CARET),
                    "HYPHEN" => Ok(HYPHEN),
                    "COMMA" => Ok(COMMA),
                    "DIGIT" => Ok(DIGIT),
                    "CONTROL" => Ok(CONTROL),
                    "UNESCAPED" => Ok(UNESCAPED),
                    "ESCAPED" => Ok(ESCAPED),
                    "OCTAL" => Ok(OCTAL),
                    "HEXADECIMAL" => Ok(HEXADECIMAL),
                    "UNICODE" => Ok(UNICODE),
                    _ => Err("not token kind")
                }
            }
        }
        use TokenKind::*;
        // NOTE(review): the UNESCAPED class excludes every metacharacter
        // handled by the other productions, so the alternatives are
        // mutually exclusive — presumably making ordering irrelevant here;
        // confirm against the lexer's tie-breaking rules.
        let lexer = Lexer::new(r#"
            /\|/ => VERTICAL_BAR;
            /\*/ => ASTERISK;
            /\+/ => PLUS_SIGN;
            /\?/ => QUESTION_MARK;
            /\(/ => LEFT_PARENTHESIS;
            /\)/ => RIGHT_PARENTHESIS;
            /\[/ => LEFT_SQUARE_BRACKET;
            /\]/ => RIGHT_SQUARE_BRACKET;
            /\{/ => LEFT_CURLY_BRACKET;
            /\}/ => RIGHT_CURLY_BRACKET;
            /\^/ => CARET;
            /\-/ => HYPHEN;
            /,/ => COMMA;
            /[0-9]/ => DIGIT;
            /\\[nrt]/ => CONTROL;
            /[^\/\|\*\+\?\(\)\[\]\{\}\^\-,0-9\n\r\t\\]/ => UNESCAPED;
            /\\[\/\|\*\+\?\(\)\[\]\{\}\^\-\\]/ => ESCAPED;
            /\\[0-7]{1,3}/ => OCTAL;
            /\\x[0-9a-fA-F]{1,2}/ => HEXADECIMAL;
            /\\(u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})/ => UNICODE;
        "#)?;
        // Input mixes literal metacharacters, a multi-byte UNESCAPED char,
        // a backslash escape, a bounded repetition, and the three numeric
        // escape forms (unicode, octal, hex).
        let expected = vec![
            Token::new(LEFT_SQUARE_BRACKET, "["),
            Token::new(UNESCAPED, "A"),
            Token::new(UNESCAPED, "🦄"),
            Token::new(ESCAPED, "\\^"),
            Token::new(RIGHT_SQUARE_BRACKET, "]"),
            Token::new(LEFT_CURLY_BRACKET, "{"),
            Token::new(DIGIT, "1"),
            Token::new(COMMA, ","),
            Token::new(DIGIT, "2"),
            Token::new(RIGHT_CURLY_BRACKET, "}"),
            Token::new(UNICODE, "\\UDEADBEEF"),
            Token::new(OCTAL, "\\777"),
            Token::new(HEXADECIMAL, "\\x45"),
        ];
        let actual = lexer.lex("[A🦄\\^]{1,2}\\UDEADBEEF\\777\\x45")?;
        assert_eq!(expected, actual);
        Ok(())
    }
249}