plexer/
lib.rs

/*!
**P**attern matching **LEXER**[^note] implementation.

[^note]: More details on [Lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis).

# Principle
This lexer makes use of the [`Pattern`](pattern::Pattern) trait to find tokens. \
The idea is to declare `Token` variants, describe how to match each one with a `Pattern`, and build it from the matched `String` value.

```ignore
lexer!(
    // Ordered by priority
    NAME(optional types, ...) {
        impl Pattern => |value: String| -> Token,
        ...,
    },
    ...,
);
```

The [`lexer!`] macro generates a module `lexer`, which contains `Token`, `LexerError`, `LexerResult` and `Lexer`.

You can then call `Token::tokenize` on a `&str`;
it returns a `Lexer` instance that implements `Iterator`. \
On each iteration, the `Lexer` tries every given `Pattern` against the remaining input and returns a `LexerResult<Token>` built from the longest match.
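
Since errors are yielded as `Err` items instead of ending the iteration, a consumer typically drives the lexer in a loop. A minimal sketch, assuming the generated `lexer` module is in scope and `input` is any `&str`:
```ignore
for result in lexer::Token::tokenize(input) {
    match result {
        // `Token` derives `Debug`; `LexerError` implements `Display`.
        Ok(token) => println!("{:?}", token),
        Err(err) => eprintln!("{}", err),
    }
}
```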

# Example
Here is an example of a simple math lexer.
```
# use regex::Regex;
# use plexer::lexer;
#
lexer!(
    // Different operators
    OPERATOR(char) {
        '+' => |_| Token::OPERATOR('+'),
        '-' => |_| Token::OPERATOR('-'),
        '*' => |_| Token::OPERATOR('*'),
        '/' => |_| Token::OPERATOR('/'),
        '=' => |_| Token::OPERATOR('='),
    },
    // Integer numbers
    NUMBER(usize) {
        |s: &str| s.chars().all(|c| c.is_digit(10))
            => |v: String| Token::NUMBER(v.parse().unwrap()),
    },
    // Variable names
    IDENTIFIER(String) {
        regex!(r"[a-zA-Z_$][a-zA-Z_$0-9]*")
            => |v: String| Token::IDENTIFIER(v),
    },
    WHITESPACE {
        [' ', '\n'] => |_| Token::WHITESPACE,
    },
);
```
That expands, slightly simplified, to the following enum and structs (the generated items also carry a lifetime parameter tied to the input `&str`).
```ignore
mod lexer {
    pub enum Token {
        OPERATOR(char),
        NUMBER(usize),
        IDENTIFIER(String),
        WHITESPACE,
    }

    pub struct Lexer {...}
    pub struct LexerError {...}
    pub type LexerResult<T> = Result<T, LexerError>;
}
```
You can then use them afterwards.
```
# use plexer::lexer;
#
# lexer!(
#     OPERATOR(char) {
#         '+' => |_| Token::OPERATOR('+'),
#         '-' => |_| Token::OPERATOR('-'),
#         '*' => |_| Token::OPERATOR('*'),
#         '/' => |_| Token::OPERATOR('/'),
#         '=' => |_| Token::OPERATOR('='),
#     },
#     NUMBER(usize) {
#         |s: &str| s.chars().all(|c| c.is_digit(10))
#             => |v: String| Token::NUMBER(v.parse().unwrap()),
#     },
#     IDENTIFIER(String) {
#         regex!(r"[a-zA-Z_$][a-zA-Z_$0-9]*")
#             => |v: String| Token::IDENTIFIER(v),
#     },
#     WHITESPACE {
#         [' ', '\n'] => |_| Token::WHITESPACE,
#     },
# );
use lexer::*;

let mut lex = Token::tokenize("x_4 = 1 + 3 = 2 * 2");
assert_eq!(lex.nth(2), Some(Ok(Token::OPERATOR('='))));
assert_eq!(lex.nth(5), Some(Ok(Token::NUMBER(3))));

// Our lexer doesn't handle parentheses...
let mut err = Token::tokenize("x_4 = (1 + 3)");
assert!(err.nth(4).is_some_and(|res| res.is_err()));
```
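
Since the `Lexer` is an ordinary `Iterator`, the usual adapters apply. For instance, a sketch that reuses the math lexer above to skip whitespace and collect everything into a single `Result`:
```ignore
let tokens: Result<Vec<_>, _> = Token::tokenize("1 + 3")
    .filter(|res| !matches!(res, Ok(Token::WHITESPACE)))
    .collect();
```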
*/

pub mod pattern;

/**
Macro to build a [Regex](https://docs.rs/regex/latest/regex/struct.Regex.html).

# Panics
If the given pattern is not marked `@safe` and is not a valid regex.
```should_panic
# use plexer::regex;
#
let err = regex!("(");
```

# Example
```
# use plexer::regex;
#
// Unwraps inside the macro
let re = regex!("t|e|s|t");

// Returns a `Result` instead of unwrapping
let gex = regex!(@safe "t|e|s|t").unwrap();
```
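
The `@safe` form simply returns the `Result<Regex, regex::Error>` from `Regex::new`, so it can also be propagated with `?`. A minimal sketch, using a hypothetical `compile` helper:
```ignore
fn compile() -> Result<regex::Regex, regex::Error> {
    // Hypothetical helper: hand the build error to the caller.
    Ok(regex!(@safe r"[0-9]+")?)
}
```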
**/
#[macro_export]
macro_rules! regex {
    ($pattern:literal) => {
        // Absolute path, so callers don't need `regex` imported in scope.
        ::regex::Regex::new($pattern).unwrap()
    };
    (@safe $pattern:literal) => {
        ::regex::Regex::new($pattern)
    };
}

/**
Macro to build your own pattern-based lexer.

# Usage
```ignore
lexer!(
    // Ordered by priority
    NAME(optional types, ...) {
        impl Pattern => |value: String| -> Token,
        ...,
    },
    ...,
);
```

# Example
Here is an example of a simple condition-statement lexer.
```
# use plexer::lexer;
#
lexer!(
    DELIMITER(char) {
        '{' => |_| Token::DELIMITER('{'),
        '}' => |_| Token::DELIMITER('}'),
    },
    KEYWORD(String) {
        "if" => |v: String| Token::KEYWORD(v),
        "else" => |v: String| Token::KEYWORD(v),
    },
    IDENTIFIER(String) {
        regex!(r"[a-zA-Z_$][a-zA-Z_$0-9]*")
            => |v: String| Token::IDENTIFIER(v),
    },
    WHITESPACE {
        [' ', '\n', '\t'] => |_| Token::WHITESPACE,
    },
);

let mut lex = lexer::Token::tokenize("if test { one } else { two }");
assert_eq!(lex.next(), Some(Ok(lexer::Token::KEYWORD(String::from("if")))));
```
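
The longest match wins; on a tie, the token declared first takes priority (see the matching loop below), which is why `KEYWORD` is listed before `IDENTIFIER` here. A sketch of the resulting behaviour:
```ignore
let mut lex = lexer::Token::tokenize("ifx if");
// "ifx" is a longer match than the keyword "if", so IDENTIFIER wins.
assert_eq!(lex.next(), Some(Ok(lexer::Token::IDENTIFIER(String::from("ifx")))));
// On an exact tie, the earlier-declared KEYWORD takes priority.
assert_eq!(lex.nth(1), Some(Ok(lexer::Token::KEYWORD(String::from("if")))));
```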
**/
#[macro_export]
macro_rules! lexer {
    ($($token:ident $(($($field: ty),+))? {$( $pattern:expr => $build:expr,)+}),* $(,)*) => {
        mod lexer {
            use $crate::regex;
            use $crate::pattern::Pattern;

            /// Upper bound on how many bytes a single token may span.
            const MAX_LENGTH: usize = 1024;

            #[derive(Debug, Clone, PartialEq)]
            #[allow(non_camel_case_types)]
            pub enum Token<'a> {
                $($token$(($($field),+))?),*,
                // Keeps the lifetime parameter used even when no
                // user-declared variant borrows from the input.
                _phantom(std::marker::PhantomData<&'a ()>),
            }

            #[allow(dead_code)]
            impl<'a> Token<'a> {
                /// Creates a [`Lexer`] over `haystack`, starting at its first byte.
                pub fn tokenize(haystack: &'a str) -> Lexer<'a> {
                    Lexer { haystack, cursor: 0 }
                }
            }

            #[derive(Debug, Clone, PartialEq)]
            pub struct LexerError<'a> {
                haystack: &'a str,
                cursor: usize,
            }

            impl<'a> LexerError<'a> {
                fn new(haystack: &'a str, cursor: usize) -> Self {
                    Self { haystack, cursor }
                }
            }

            impl<'a> std::fmt::Display for LexerError<'a> {
                fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
                    // Read one full character so that multi-byte UTF-8 input
                    // cannot panic from slicing inside a character.
                    let c = self.haystack[self.cursor..].chars().next().unwrap_or('?');
                    write!(f, "unexpected character '{}' at index {}", c, self.cursor)
                }
            }

            pub type LexerResult<'a, T> = Result<T, LexerError<'a>>;

            #[derive(Debug)]
            pub struct Lexer<'a> {
                haystack: &'a str,
                cursor: usize,
            }

            impl<'a> Iterator for Lexer<'a> {
                type Item = LexerResult<'a, Token<'a>>;

                fn next(&mut self) -> Option<Self::Item> {
                    if self.cursor < self.haystack.len() {
                        let start = self.cursor;
                        let end = std::cmp::min(self.haystack.len(), self.cursor + MAX_LENGTH);

                        let mut token = None;
                        let mut len = 0;

                        // Try every pattern and keep the longest match;
                        // on equal lengths, the token declared first wins.
                        $($({
                            if let Some(mat) = $pattern.find_prefix_in(&self.haystack[start..end]) {
                                if mat.len() > len {
                                    token = Some($build(mat.to_string()));
                                    len = mat.len();
                                }
                            }
                        })+)*

                        // Advance past the match; on failure, skip one whole
                        // character so the cursor never lands inside a
                        // multi-byte UTF-8 sequence.
                        let step = self.haystack[start..]
                            .chars()
                            .next()
                            .map_or(1, char::len_utf8);
                        self.cursor += std::cmp::max(len, step);
                        Some(token.ok_or(LexerError::new(self.haystack, start)))
                    } else {
                        None
                    }
                }
            }
        }
    };
}