use std::collections::{
    BTreeSet as Set,
    BTreeMap as Map,
    VecDeque,
};
use segment_map::Segment;
use finite_automata::{
    Enfa,
    Dfa,
    Subsume,
    states_contains_from,
};
use regular_expression_bootstrap::Expression;
use crate::{
    TokenState,
    TokenStateGenerator,
};

type Result<T> = std::result::Result<T, &'static str>;

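/// A lexed token: the kind it was matched as, paired with the matched text.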
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Token<T> {
    kind: T,
    text: String,
}

impl<T> Token<T> {
    pub fn new(kind: T, text: &str) -> Token<T> {
        Token { kind, text: String::from(text) }
    }

    pub fn kind(&self) -> &T {
        &self.kind
    }

    pub fn text(&self) -> &str {
        &self.text
    }
}

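/// A lexer compiled from a map of regular-expression productions to optional
/// token kinds; productions mapped to `None` (such as whitespace) are matched
/// but emit no token.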
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Lexer<T> {
    productions: Map<Expression, Option<T>>,
    dfa: Dfa<Set<TokenState<T>>, u32>,
}

impl<T: Clone + Ord> Lexer<T> {
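    /// Compiles the productions into a single DFA: each expression becomes
    /// an ε-NFA whose states carry that production's token kind, the ε-NFAs
    /// are joined under a fresh initial state with ε-transitions, and the
    /// union is determinized.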
    pub fn new(productions: Map<Expression, Option<T>>) -> Lexer<T> {
        // Compile each production to an ε-NFA tagged with its token kind.
        let mut fas = Vec::new();
        for (expression, token) in &productions {
            fas.push(expression.as_enfa(&mut TokenStateGenerator::new(token.clone())));
        }
        // Union the ε-NFAs under a fresh initial state, preserving each
        // automaton's final states.
        let mut alt = Enfa::new(TokenState::new(None));
        for fa in fas {
            alt.subsume(&fa);
            let fa_initial_index = states_contains_from(&alt, &fa, fa.initial_index()).expect("state does not exist");
            alt.transitions_insert((alt.initial_index(), Segment::empty(), fa_initial_index));
            for fa_final_index in fa.final_indices() {
                let fa_final_index = states_contains_from(&alt, &fa, fa_final_index).expect("state does not exist");
                alt.set_final(fa_final_index);
            }
        }
        // Determinize the union into the lexing DFA.
        Lexer { productions, dfa: Dfa::from(&alt) }
    }

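    /// Scans `text` with maximal munch: characters are consumed while a
    /// transition exists; when the DFA stalls in a final state, the pending
    /// lexeme is emitted (unless its kind is `None`) and the scan restarts
    /// from the initial state; stalling in a non-final state is an error.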
    pub fn lex(&self, text: &str) -> Result<Vec<Token<T>>> {
        let mut tokens = Vec::new();
        let mut token_text = String::new();
        let mut characters: VecDeque<char> = text.chars().collect();
        let mut source_index = self.dfa.initial_index();
        while let Some(character) = characters.pop_front() {
            if let Some(transition_index) = self.dfa.transitions_contains_outgoing((source_index, &character.into())) {
                // The DFA can consume this character: extend the current lexeme.
                let (_, _, target_index) = self.dfa.transitions_index(transition_index);
                token_text.push(character);
                source_index = target_index;
            } else if self.dfa.is_final(source_index) {
                // No transition, but the lexeme so far is a complete token:
                // resolve its kind from the token states merged into this
                // DFA state, which must all agree.
                let mut token_kind = &None;
                for token_state in self.dfa.states_index(source_index) {
                    if token_kind.is_none() {
                        token_kind = token_state.token_kind();
                    } else if token_state.token_kind().is_some() && token_state.token_kind() != token_kind {
                        return Err("inconsistent tokens in final state");
                    }
                }
                // Productions mapped to `None` are matched but not emitted.
                if let Some(token_kind) = token_kind {
                    tokens.push(Token::new(token_kind.clone(), token_text.as_str()));
                }
                // Re-scan the unmatched character from the initial state.
                token_text.clear();
                characters.push_front(character);
                source_index = self.dfa.initial_index();
            } else {
                return Err("partial match");
            }
        }
        // End of input: the final pending lexeme must itself be a complete
        // token; this mirrors the in-loop case above.
        if self.dfa.is_final(source_index) {
            let mut token_kind = &None;
            for token_state in self.dfa.states_index(source_index) {
                if token_kind.is_none() {
                    token_kind = token_state.token_kind();
                } else if token_state.token_kind().is_some() && token_state.token_kind() != token_kind {
                    return Err("inconsistent tokens in final state");
                }
            }
            if let Some(token_kind) = token_kind {
                tokens.push(Token::new(token_kind.clone(), token_text.as_str()));
            }
        } else {
            return Err("partial match");
        }
        Ok(tokens)
    }
}

#[cfg(test)]
mod tests {
    use regular_expression_bootstrap::{
        sym,
        rep,
        con,
        neg,
        alt,
        sgl,
        rng,
        ast,
    };
    use crate::{
        Lexer,
        Token,
    };
    use super::Result;

    #[test]
    fn test_1() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            A,
            B,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('A')] => Some(A),
            sym![sgl!('B')] => Some(B),
            ast!(sym![sgl!(' ')]) => None
        ]);
        let expected = vec![
            Token::new(A, "A"),
            Token::new(B, "B"),
            Token::new(A, "A"),
        ];
        let actual = lexer.lex("A B A ")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    #[test]
    fn test_2() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        #[allow(non_camel_case_types)]
        enum TokenKind {
            A_REP,
            B_REP,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            ast!(sym![sgl!('A')]) => Some(A_REP),
            ast!(sym![sgl!('B')]) => Some(B_REP),
            ast!(sym![sgl!(' ')]) => None
        ]);
        let expected = vec![
            Token::new(A_REP, "AAAAAAA"),
            Token::new(B_REP, "BBBB"),
            Token::new(B_REP, "BBBB"),
        ];
        let actual = lexer.lex("AAAAAAABBBB BBBB")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    #[test]
    fn test_3() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            A,
            AB,
            BB,
            B,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('A')] => Some(A),
            con![sym![sgl!('A')], sym![sgl!('B')]] => Some(AB),
            con![sym![sgl!('B')], sym![sgl!('B')]] => Some(BB),
            sym![sgl!('B')] => Some(B)
        ]);
        let expected = vec![
            Token::new(AB, "AB"),
            Token::new(B, "B"),
        ];
        let actual = lexer.lex("ABB")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    #[test]
    fn test_4() -> Result<()> {
        #[allow(non_camel_case_types)]
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            FULL_STOP,
            CARET,
            DOLLAR_SIGN,
            ASTERISK,
            PLUS_SIGN,
            HYPHEN,
            QUESTION_MARK,
            LEFT_PARENTHESIS,
            RIGHT_PARENTHESIS,
            LEFT_SQUARE_BRACKET,
            RIGHT_SQUARE_BRACKET,
            LEFT_CURLY_BRACKET,
            RIGHT_CURLY_BRACKET,
            VERTICAL_BAR,
            COMMA,
            DIGIT_LITERAL,
            UNESCAPED_LITERAL,
            ESCAPED_LITERAL,
            CONTROL_LITERAL,
            OCTAL_LITERAL,
            HEXADECIMAL_LITERAL,
            UNICODE_LITERAL,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('.')] => Some(FULL_STOP),
            sym![sgl!('^')] => Some(CARET),
            sym![sgl!('$')] => Some(DOLLAR_SIGN),
            sym![sgl!('*')] => Some(ASTERISK),
            sym![sgl!('+')] => Some(PLUS_SIGN),
            sym![sgl!('-')] => Some(HYPHEN),
            sym![sgl!('?')] => Some(QUESTION_MARK),
            sym![sgl!('(')] => Some(LEFT_PARENTHESIS),
            sym![sgl!(')')] => Some(RIGHT_PARENTHESIS),
            sym![sgl!('[')] => Some(LEFT_SQUARE_BRACKET),
            sym![sgl!(']')] => Some(RIGHT_SQUARE_BRACKET),
            sym![sgl!('{')] => Some(LEFT_CURLY_BRACKET),
            sym![sgl!('}')] => Some(RIGHT_CURLY_BRACKET),
            sym![sgl!('|')] => Some(VERTICAL_BAR),
            sym![sgl!(',')] => Some(COMMA),
            sym![rng!('0', '9')] => Some(DIGIT_LITERAL),
            neg![sgl!('.'), sgl!('^'), sgl!('$'), sgl!('*'), sgl!('+'), sgl!('-'), sgl!('?'), sgl!('('), sgl!(')'), sgl!('['), sgl!(']'), sgl!('{'), sgl!('}'), sgl!('|'), sgl!(','), rng!('0', '9')] => Some(UNESCAPED_LITERAL),
            con![sym![sgl!('\\')], sym![sgl!('.'), sgl!('^'), sgl!('$'), sgl!('*'), sgl!('+'), sgl!('-'), sgl!('?'), sgl!('('), sgl!(')'), sgl!('['), sgl!(']'), sgl!('{'), sgl!('}'), sgl!('|')]] => Some(ESCAPED_LITERAL),
            con![sym![sgl!('\\')], sym![sgl!('n'), sgl!('r'), sgl!('t')]] => Some(CONTROL_LITERAL),
            con![sym![sgl!('\\')], rep!(sym![rng!('0', '7')], Some(1), Some(3))] => Some(OCTAL_LITERAL),
            con![sym![sgl!('\\')], sym![sgl!('x')], rep!(sym![rng!('0', '9'), rng!('a', 'f'), rng!('A', 'F')], Some(1), Some(2))] => Some(HEXADECIMAL_LITERAL),
            con![sym![sgl!('\\')], alt![con![sym![sgl!('u')], rep!(sym![rng!('0', '9'), rng!('a', 'f'), rng!('A', 'F')], Some(4), Some(4))], con![sym![sgl!('U')], rep!(sym![rng!('0', '9'), rng!('a', 'f'), rng!('A', 'F')], Some(8), Some(8))]]] => Some(UNICODE_LITERAL)
        ]);
        let expected = vec![
            Token::new(LEFT_SQUARE_BRACKET, "["),
            Token::new(UNESCAPED_LITERAL, "A"),
            Token::new(UNESCAPED_LITERAL, "🦄"),
            Token::new(ESCAPED_LITERAL, "\\."),
            Token::new(RIGHT_SQUARE_BRACKET, "]"),
            Token::new(LEFT_CURLY_BRACKET, "{"),
            Token::new(DIGIT_LITERAL, "1"),
            Token::new(COMMA, ","),
            Token::new(DIGIT_LITERAL, "2"),
            Token::new(RIGHT_CURLY_BRACKET, "}"),
            Token::new(UNICODE_LITERAL, "\\UDEADBEEF"),
            Token::new(OCTAL_LITERAL, "\\777"),
            Token::new(HEXADECIMAL_LITERAL, "\\x45"),
        ];
        let actual = lexer.lex("[A🦄\\.]{1,2}\\UDEADBEEF\\777\\x45")?;
        assert_eq!(expected, actual);
        Ok(())
    }
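
    // An added sketch of the failure path (not exercised by the tests above):
    // with a single non-nullable production the DFA's initial state is not
    // final, so an unmatched first character should surface `lex`'s
    // "partial match" error rather than a token stream.
    #[test]
    fn test_5() {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            A,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('A')] => Some(A)
        ]);
        assert_eq!(Err("partial match"), lexer.lex("B"));
    }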
}