// simple_lexer/lexer.rs

1use std::str::FromStr;
2use simple_lexer_bootstrap::Lexer as LexerBootstrap;
3use crate::{
4    grammar::{
5        LEXER_PRODUCTIONS,
6        PARSER_PRODUCTIONS,
7        Nonterminal,
8        as_productions,
9    },
10    Token,
11};
12use simple_parser_bootstrap::Parser;
13
/// Module-local result type: every fallible operation here reports errors
/// as a static string message.
type Result<T> = std::result::Result<T, &'static str>;
15
/// A lexer whose token rules are built at runtime from a textual
/// productions specification (see `Lexer::new`).
///
/// `T` is the token-kind type attached to each produced token.
pub struct Lexer<T> {
    // Bootstrap lexer configured with the productions parsed from the spec.
    lexer: LexerBootstrap<T>
}
19
20impl<T: Clone + FromStr + Ord> Lexer<T> {
21    pub fn new(productions: &str) -> Result<Lexer<T>> {
22        let lexer = LexerBootstrap::new(LEXER_PRODUCTIONS.clone());
23        let parser = Parser::new(PARSER_PRODUCTIONS.clone(), Nonterminal::Root);
24        let tokens = lexer.lex(productions)?;
25        let parse_tree = parser.parse(&tokens).unwrap();
26        let productions = as_productions(&parse_tree)?;
27        Ok(Lexer { lexer: LexerBootstrap::new(productions) })
28    }
29
30    pub fn lex(&self, text: &str) -> Result<Vec<Token<T>>> {
31        self.lexer.lex(text)
32    }
33}
34
#[cfg(test)]
mod tests {
    use std::str::FromStr;
    use crate::{
        Lexer,
        Token,
    };
    use super::Result;

    /// Single-character tokens; whitespace maps to no token and is dropped.
    #[test]
    fn test_1() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            A,
            B,
        }
        impl FromStr for TokenKind {
            type Err = &'static str;
            fn from_str(text: &str) -> Result<Self> {
                use TokenKind::*;
                match text {
                    "A" => Ok(A),
                    "B" => Ok(B),
                    _ => Err("not token kind")
                }
            }
        }
        use TokenKind::*;
        let lexer = Lexer::new(r#"
            /A/ => A;
            /B/ => B;
            / / => ;
        "#)?;
        let expected = vec![
            Token::new(A, "A"),
            Token::new(B, "B"),
            Token::new(A, "A"),
        ];
        let actual = lexer.lex("A B  A   ")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    /// Repetition (`*`): maximal-munch runs of a single character;
    /// a discarded separator splits two B_REP tokens.
    #[test]
    fn test_2() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        #[allow(non_camel_case_types)]
        enum TokenKind {
            A_REP,
            B_REP,
        }
        impl FromStr for TokenKind {
            type Err = &'static str;
            fn from_str(text: &str) -> Result<Self> {
                use TokenKind::*;
                match text {
                    "A_REP" => Ok(A_REP),
                    "B_REP" => Ok(B_REP),
                    _ => Err("not token kind")
                }
            }
        }
        use TokenKind::*;
        let lexer = Lexer::new(r#"
            /A*/ => A_REP;
            /B*/ => B_REP;
            / / => ;
        "#)?;
        let expected = vec![
            Token::new(A_REP, "AAAAAAA"),
            Token::new(B_REP, "BBBB"),
            Token::new(B_REP, "BBBB"),
        ];
        let actual = lexer.lex("AAAAAAABBBB   BBBB")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    /// Maximal munch with overlapping alternatives: "ABB" lexes as
    /// AB + B rather than A + BB.
    #[test]
    fn test_3() -> Result<()> {
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        #[allow(non_camel_case_types)]
        enum TokenKind {
            A,
            AB,
            BB,
            B,
        }
        impl FromStr for TokenKind {
            type Err = &'static str;
            fn from_str(text: &str) -> Result<Self> {
                use TokenKind::*;
                match text {
                    "A" => Ok(A),
                    "AB" => Ok(AB),
                    "BB" => Ok(BB),
                    "B" => Ok(B),
                    _ => Err("not token kind")
                }
            }
        }
        use TokenKind::*;
        let lexer = Lexer::new(r#"
            /A/ => A;
            /AB/ => AB;
            /BB/ => BB;
            /B/ => B;
        "#)?;
        let expected = vec![
            Token::new(AB, "AB"),
            Token::new(B, "B"),
        ];
        let actual = lexer.lex("ABB")?;
        assert_eq!(expected, actual);
        Ok(())
    }

    /// A regex-like token set: escaped metacharacters, character classes,
    /// octal/hex/unicode escapes, and a multi-byte (non-ASCII) UNESCAPED char.
    #[test]
    fn test_4() -> Result<()> {
        #[allow(non_camel_case_types)]
        #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
        enum TokenKind {
            VERTICAL_BAR,
            ASTERISK,
            PLUS_SIGN,
            QUESTION_MARK,
            LEFT_PARENTHESIS,
            RIGHT_PARENTHESIS,
            LEFT_SQUARE_BRACKET,
            RIGHT_SQUARE_BRACKET,
            LEFT_CURLY_BRACKET,
            RIGHT_CURLY_BRACKET,
            CARET,
            HYPHEN,
            COMMA,
            DIGIT,
            CONTROL,
            UNESCAPED,
            ESCAPED,
            OCTAL,
            HEXADECIMAL,
            UNICODE,
        }
        impl FromStr for TokenKind {
            type Err = &'static str;
            fn from_str(text: &str) -> Result<Self> {
                use TokenKind::*;
                match text {
                    "VERTICAL_BAR" => Ok(VERTICAL_BAR),
                    "ASTERISK" => Ok(ASTERISK),
                    "PLUS_SIGN" => Ok(PLUS_SIGN),
                    "QUESTION_MARK" => Ok(QUESTION_MARK),
                    "LEFT_PARENTHESIS" => Ok(LEFT_PARENTHESIS),
                    "RIGHT_PARENTHESIS" => Ok(RIGHT_PARENTHESIS),
                    "LEFT_SQUARE_BRACKET" => Ok(LEFT_SQUARE_BRACKET),
                    "RIGHT_SQUARE_BRACKET" => Ok(RIGHT_SQUARE_BRACKET),
                    "LEFT_CURLY_BRACKET" => Ok(LEFT_CURLY_BRACKET),
                    "RIGHT_CURLY_BRACKET" => Ok(RIGHT_CURLY_BRACKET),
                    "CARET" => Ok(CARET),
                    "HYPHEN" => Ok(HYPHEN),
                    "COMMA" => Ok(COMMA),
                    "DIGIT" => Ok(DIGIT),
                    "CONTROL" => Ok(CONTROL),
                    "UNESCAPED" => Ok(UNESCAPED),
                    "ESCAPED" => Ok(ESCAPED),
                    "OCTAL" => Ok(OCTAL),
                    "HEXADECIMAL" => Ok(HEXADECIMAL),
                    "UNICODE" => Ok(UNICODE),
                    _ => Err("not token kind")
                }
            }
        }
        use TokenKind::*;
        let lexer = Lexer::new(r#"
            /\|/ => VERTICAL_BAR;
            /\*/ => ASTERISK;
            /\+/ => PLUS_SIGN;
            /\?/ => QUESTION_MARK;
            /\(/ => LEFT_PARENTHESIS;
            /\)/ => RIGHT_PARENTHESIS;
            /\[/ => LEFT_SQUARE_BRACKET;
            /\]/ => RIGHT_SQUARE_BRACKET;
            /\{/ => LEFT_CURLY_BRACKET;
            /\}/ => RIGHT_CURLY_BRACKET;
            /\^/ => CARET;
            /\-/ => HYPHEN;
            /,/ => COMMA;
            /[0-9]/ => DIGIT;
            /\\[nrt]/ => CONTROL;
            /[^\/\|\*\+\?\(\)\[\]\{\}\^\-,0-9\n\r\t\\]/ => UNESCAPED;
            /\\[\/\|\*\+\?\(\)\[\]\{\}\^\-\\]/ => ESCAPED;
            /\\[0-7]{1,3}/ => OCTAL;
            /\\x[0-9a-fA-F]{1,2}/ => HEXADECIMAL;
            /\\(u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})/ => UNICODE;
        "#)?;
        let expected = vec![
            Token::new(LEFT_SQUARE_BRACKET, "["),
            Token::new(UNESCAPED, "A"),
            Token::new(UNESCAPED, "🦄"),
            Token::new(ESCAPED, "\\^"),
            Token::new(RIGHT_SQUARE_BRACKET, "]"),
            Token::new(LEFT_CURLY_BRACKET, "{"),
            Token::new(DIGIT, "1"),
            Token::new(COMMA, ","),
            Token::new(DIGIT, "2"),
            Token::new(RIGHT_CURLY_BRACKET, "}"),
            Token::new(UNICODE, "\\UDEADBEEF"),
            Token::new(OCTAL, "\\777"),
            Token::new(HEXADECIMAL, "\\x45"),
        ];
        let actual = lexer.lex("[A🦄\\^]{1,2}\\UDEADBEEF\\777\\x45")?;
        assert_eq!(expected, actual);
        Ok(())
    }
}