1use std::collections::{
2    BTreeSet as Set,
3    BTreeMap as Map,
4    VecDeque,
5};
6use segment_map::Segment;
7use finite_automata::{
8    Enfa,
9    Dfa,
10    Subsume,
11    states_contains_from,
12};
13use regular_expression_bootstrap::Expression;
14use crate::{
15    TokenState,
16    TokenStateGenerator,
17};
18
/// Crate-local result alias; lexing failures are reported as static
/// description strings (see `Lexer::lex`).
type Result<T> = std::result::Result<T, &'static str>;
20
/// A lexed token: a caller-defined kind tag paired with the exact source
/// text it matched.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Token<T> {
    kind: T, // token category, chosen by the production that matched
    text: String, // the matched slice of the input, owned
}
26
27impl<T> Token<T> {
28    pub fn new(kind: T, text: &str) -> Token<T> {
29        Token { kind, text: String::from(text) }
30    }
31
32    pub fn kind(&self) -> &T {
33        &self.kind
34    }
35    
36    pub fn text(&self) -> &str {
37        &self.text
38    }
39}
40
/// A lexer compiled from a set of regular-expression productions.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Lexer<T> {
    // Source productions; a `None` kind means the text is matched but no
    // token is emitted for it (see `lex`, which only pushes `Some` kinds).
    productions: Map<Expression, Option<T>>,
    // DFA built in `new` from the alternation of all productions; each DFA
    // state is the set of token-tagged ENFA states it subsumes.
    dfa: Dfa<Set<TokenState<T>>, u32>
}
46
47impl<T: Clone + Ord> Lexer<T> {
48    pub fn new(productions: Map<Expression, Option<T>>) -> Lexer<T> {
49        let mut fas = Vec::new();
50        for (expression, token) in &productions {
51            fas.push(expression.as_enfa(&mut TokenStateGenerator::new(token.clone())));
52        }
53        let mut alt = Enfa::new(TokenState::new(None));
54        for fa in fas {
55            alt.subsume(&fa);
56            let fa_initial_index = states_contains_from(&alt, &fa, fa.initial_index()).expect("state does not exist");
57            alt.transitions_insert((alt.initial_index(), Segment::empty(), fa_initial_index));
58            for fa_final_index in fa.final_indices() {
59                let fa_final_index = states_contains_from(&alt, &fa, fa_final_index).expect("state does not exist");
60                alt.set_final(fa_final_index);
61            }
62        }
63        Lexer { productions, dfa: Dfa::from(&alt) }
64    }
65
66    pub fn lex(&self, text: &str) -> Result<Vec<Token<T>>> {
67        let mut tokens = Vec::new();
68        let mut token_text = String::from("");
69        let mut characters: VecDeque<char> = text.chars().collect();
70        let mut source_index = self.dfa.initial_index();
71        while let Some(character) = characters.pop_front() {
72            if let Some(transition_index) = self.dfa.transitions_contains_outgoing((source_index, &character.into())) {
73                let (_, _, target_index) = self.dfa.transitions_index(transition_index);
74                token_text.push(character);
75                source_index = target_index;
76            } else {
77                if self.dfa.is_final(source_index) {
78                    let mut token_kind = &None;
79                    for token_state in self.dfa.states_index(source_index) {
80                        if token_kind.is_none() {
81                            token_kind = token_state.token_kind();
82                        } else {
83                            if token_state.token_kind().is_some() && token_state.token_kind() != token_kind {
84                                return Err("inconsistent tokens in final state");
85                            }
86                        }
87                    }
88                    if let Some(token_kind) = token_kind {
89                        tokens.push(Token::new(token_kind.clone(), token_text.as_str()));
90                    }
91                    token_text.clear();
92                    characters.push_front(character);
93                    source_index = self.dfa.initial_index();
94                } else { return Err("partial match"); }
95            }
96        }
97        if self.dfa.is_final(source_index) {
98            let mut token_kind = &None;
99            for token_state in self.dfa.states_index(source_index) {
100                if token_kind.is_none() {
101                    token_kind = token_state.token_kind();
102                } else {
103                    if token_state.token_kind().is_some() && token_state.token_kind() != token_kind {
104                        return Err("inconsistent tokens in final state");
105                    }
106                }
107            }
108            if let Some(token_kind) = token_kind {
109                tokens.push(Token::new(token_kind.clone(), token_text.as_str()));
110            }
111        } else { return Err("partial match"); }
112        Ok(tokens)
113    }
114}
115
#[cfg(test)]
mod tests {
    use regular_expression_bootstrap::{
        sym,
        rep,
        con,
        neg,
        alt,
        sgl,
        rng,
        ast,
    };
    use crate::{
        Lexer,
        Token,
    };
    use super::Result;

    /// Single-character tokens separated by whitespace productions whose
    /// kind is `None`, so the spaces are matched but emit nothing.
    #[test]
    fn test_1() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            A,
            B,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('A')] => Some(A),
            sym![sgl!('B')] => Some(B),
            ast!(sym![sgl!(' ')]) => None
        ]);
        let expected = vec![
            Token::new(A, "A"),
            Token::new(B, "B"),
            Token::new(A, "A"),
        ];
        let actual = lexer.lex("A B  A   ")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    /// Maximal munch over repetitions: runs of the same character collapse
    /// into one token each.
    #[test]
    fn test_2() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        #[allow(non_camel_case_types)]
        enum TokenKind {
            A_REP,
            B_REP
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            ast!(sym![sgl!('A')]) => Some(A_REP),
            ast!(sym![sgl!('B')]) => Some(B_REP),
            ast!(sym![sgl!(' ')]) => None
        ]);
        let expected = vec![
            Token::new(A_REP, "AAAAAAA"),
            Token::new(B_REP, "BBBB"),
            Token::new(B_REP, "BBBB"),
        ];
        let actual = lexer.lex("AAAAAAABBBB   BBBB")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    /// Maximal munch prefers the longer production: "ABB" lexes as AB + B,
    /// not A + BB.
    #[test]
    fn test_3() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        #[allow(non_camel_case_types)]
        enum TokenKind {
            A,
            AB,
            BB,
            B,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('A')] => Some(A),
            con![sym![sgl!('A')], sym![sgl!('B')]] => Some(AB),
            con![sym![sgl!('B')], sym![sgl!('B')]] => Some(BB),
            sym![sgl!('B')] => Some(B)
        ]);
        let expected = vec![
            Token::new(AB, "AB"),
            Token::new(B, "B"),
        ];
        let actual = lexer.lex("ABB")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    /// A realistic production set (a regex token language) including escape
    /// sequences, negated classes, and a non-ASCII literal.
    #[test]
    fn test_4() -> Result<()> {
        #[allow(non_camel_case_types)]
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            FULL_STOP,
            CARET,
            DOLLAR_SIGN,
            ASTERISK,
            PLUS_SIGN,
            HYPHEN,
            QUESTION_MARK,
            LEFT_PARENTHESIS,
            RIGHT_PARENTHESIS,
            LEFT_SQUARE_BRACKET,
            RIGHT_SQUARE_BRACKET,
            LEFT_CURLY_BRACKET,
            RIGHT_CURLY_BRACKET,
            VERTICAL_BAR,
            COMMA,
            DIGIT_LITERAL,
            UNESCAPED_LITERAL,
            ESCAPED_LITERAL,
            CONTROL_LITERAL,
            OCTAL_LITERAL,
            HEXADECIMAL_LITERAL,
            UNICODE_LITERAL,
        }
        use TokenKind::*;
        let lexer = Lexer::new(map![
            sym![sgl!('.')] => Some(FULL_STOP),
            sym![sgl!('^')] => Some(CARET),
            sym![sgl!('$')] => Some(DOLLAR_SIGN),
            sym![sgl!('*')] => Some(ASTERISK),
            sym![sgl!('+')] => Some(PLUS_SIGN),
            sym![sgl!('-')] => Some(HYPHEN),
            sym![sgl!('?')] => Some(QUESTION_MARK),
            sym![sgl!('(')] => Some(LEFT_PARENTHESIS),
            sym![sgl!(')')] => Some(RIGHT_PARENTHESIS),
            sym![sgl!('[')] => Some(LEFT_SQUARE_BRACKET),
            sym![sgl!(']')] => Some(RIGHT_SQUARE_BRACKET),
            sym![sgl!('{')] => Some(LEFT_CURLY_BRACKET),
            sym![sgl!('}')] => Some(RIGHT_CURLY_BRACKET),
            sym![sgl!('|')] => Some(VERTICAL_BAR),
            sym![sgl!(',')] => Some(COMMA),
            sym![rng!('0', '9')] => Some(DIGIT_LITERAL),
            neg![sgl!('.'), sgl!('^'), sgl!('$'), sgl!('*'), sgl!('+'), sgl!('-'), sgl!('?'), sgl!('('), sgl!(')'), sgl!('['), sgl!(']'), sgl!('{'), sgl!('}'), sgl!('|'), sgl!(','), rng!('0', '9')] => Some(UNESCAPED_LITERAL),
            con![sym![sgl!('\\')], sym![sgl!('.'), sgl!('^'), sgl!('$'), sgl!('*'), sgl!('+'), sgl!('-'), sgl!('?'), sgl!('('), sgl!(')'), sgl!('['), sgl!(']'), sgl!('{'), sgl!('}'), sgl!('|')]] => Some(ESCAPED_LITERAL),
            con![sym![sgl!('\\')], sym![sgl!('n'), sgl!('r'), sgl!('t')]] => Some(CONTROL_LITERAL),
            con![sym![sgl!('\\')], rep!(sym![rng!('0', '7')], Some(1), Some(3))] => Some(OCTAL_LITERAL),
            con![sym![sgl!('\\')], sym![sgl!('x')], rep!(sym![rng!('0', '9'), rng!('a', 'f'), rng!('A', 'F')], Some(1), Some(2))] => Some(HEXADECIMAL_LITERAL),
            con![sym![sgl!('\\')], alt![con![sym![sgl!('u')], rep!(sym![rng!('0', '9'), rng!('a', 'f'), rng!('A', 'F')], Some(4), Some(4))], con![sym![sgl!('U')], rep!(sym![rng!('0', '9'), rng!('a', 'f'), rng!('A', 'F')], Some(8), Some(8))]]] => Some(UNICODE_LITERAL)
        ]);
        let expected = vec![
            Token::new(LEFT_SQUARE_BRACKET, "["),
            Token::new(UNESCAPED_LITERAL, "A"),
            Token::new(UNESCAPED_LITERAL, "🦄"),
            Token::new(ESCAPED_LITERAL, "\\."),
            Token::new(RIGHT_SQUARE_BRACKET, "]"),
            Token::new(LEFT_CURLY_BRACKET, "{"),
            Token::new(DIGIT_LITERAL, "1"),
            Token::new(COMMA, ","),
            Token::new(DIGIT_LITERAL, "2"),
            Token::new(RIGHT_CURLY_BRACKET, "}"),
            Token::new(UNICODE_LITERAL, "\\UDEADBEEF"),
            Token::new(OCTAL_LITERAL, "\\777"),
            Token::new(HEXADECIMAL_LITERAL, "\\x45"),
        ];
        let actual = lexer.lex("[A🦄\\.]{1,2}\\UDEADBEEF\\777\\x45")?;
        assert_eq!(expected, actual);
        Ok(())
    }
}