regex_lexer/lib.rs
#![doc(html_root_url = "https://docs.rs/regex-lexer/0.2.0/regex-lexer")]
//! A regex-based lexer (tokenizer).
//!
//! ```
//! use regex_lexer::{LexerBuilder, Token};
//!
//! #[derive(Debug, PartialEq, Eq, Clone, Copy)]
//! enum Tok {
//!     Num,
//!     Add,
//!     Sub,
//!     Mul,
//!     Div,
//!     Open,
//!     Close,
//! }
//!
//! let lexer = LexerBuilder::new()
//!     .token(r"[0-9]+", Tok::Num)
//!     .token(r"\+", Tok::Add)
//!     .token(r"-", Tok::Sub)
//!     .token(r"\*", Tok::Mul)
//!     .token(r"/", Tok::Div)
//!     .token(r"\(", Tok::Open)
//!     .token(r"\)", Tok::Close)
//!     .ignore(r"\s+")
//!     .build()?;
//!
//! let source = "(1 + 2) * 3";
//! assert_eq!(
//!     lexer.tokens(source).collect::<Vec<_>>(),
//!     vec![
//!         Token { kind: Tok::Open, span: 0..1, text: "(" },
//!         Token { kind: Tok::Num, span: 1..2, text: "1" },
//!         Token { kind: Tok::Add, span: 3..4, text: "+" },
//!         Token { kind: Tok::Num, span: 5..6, text: "2" },
//!         Token { kind: Tok::Close, span: 6..7, text: ")" },
//!         Token { kind: Tok::Mul, span: 8..9, text: "*" },
//!         Token { kind: Tok::Num, span: 10..11, text: "3" },
//!     ],
//! );
//! # Ok::<(), regex_lexer::Error>(())
//! ```

use std::ops::Range;

use regex::{Regex, RegexSet};
pub use regex::Error;

/// A token returned by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'t, K> {
    /// The kind of token, as given to [LexerBuilder::token](struct.LexerBuilder.html#method.token).
    pub kind: K,
    /// The byte range of the token in the source string.
    pub span: Range<usize>,
    /// The matched text.
    pub text: &'t str,
}

/// Builder struct for [Lexer](struct.Lexer.html).
pub struct LexerBuilder<'r, K> {
    regexes: Vec<&'r str>,
    kinds: Vec<Option<K>>,
}

impl<'r, K> Default for LexerBuilder<'r, K> {
    fn default() -> Self {
        Self::new()
    }
}

impl<'r, K> LexerBuilder<'r, K> {
    /// Create a new [LexerBuilder](struct.LexerBuilder.html).
    pub fn new() -> Self {
        LexerBuilder {
            regexes: Vec::new(),
            kinds: Vec::new(),
        }
    }

    /// Add a new token that matches the regular expression `re`.
    /// This uses the same syntax as the [regex](https://docs.rs/regex/1/regex) crate.
    ///
    /// If the regex matches, the lexer returns a token of kind `kind`.
    /// ```
    /// use regex_lexer::{LexerBuilder, Token};
    ///
    /// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    /// enum Tok {
    ///     Num,
    ///     // ...
    /// }
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[0-9]+", Tok::Num)
    ///     .ignore(r"\s+") // skip whitespace
    ///     // ...
    ///     .build()?;
    ///
    /// assert_eq!(
    ///     lexer.tokens("1 2 3").collect::<Vec<_>>(),
    ///     vec![
    ///         Token { kind: Tok::Num, span: 0..1, text: "1" },
    ///         Token { kind: Tok::Num, span: 2..3, text: "2" },
    ///         Token { kind: Tok::Num, span: 4..5, text: "3" },
    ///     ],
    /// );
    /// # Ok::<(), regex::Error>(())
    /// ```
    ///
    /// If multiple regexes match, the one defined last takes priority.
    /// ```
    /// use regex_lexer::{LexerBuilder, Token};
    ///
    /// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    /// enum Tok {
    ///     Ident,
    ///     Let,
    ///     // ...
    /// }
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[a-zA-Z_][a-zA-Z0-9_]*", Tok::Ident)
    ///     .token(r"let\b", Tok::Let)
    ///     // ...
    ///     .ignore(r"\s+")
    ///     .build()?;
    ///
    /// assert_eq!(
    ///     lexer.tokens("let lettuce").collect::<Vec<_>>(),
    ///     vec![
    ///         Token { kind: Tok::Let, span: 0..3, text: "let" },
    ///         Token { kind: Tok::Ident, span: 4..11, text: "lettuce" },
    ///     ],
    /// );
    /// # Ok::<(), regex::Error>(())
    /// ```
    pub fn token(mut self, re: &'r str, kind: K) -> Self {
        self.regexes.push(re);
        self.kinds.push(Some(kind));
        self
    }

    /// Add a new regex whose matches are skipped: the matched text is
    /// consumed, but no token is returned for it.
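    ///
    /// For example, ignored patterns can skip comments as well as whitespace.
    /// A minimal sketch (the line-comment pattern is illustrative):
    /// ```
    /// use regex_lexer::{LexerBuilder, Token};
    ///
    /// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    /// enum Tok {
    ///     Num,
    /// }
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[0-9]+", Tok::Num)
    ///     .ignore(r"//[^\n]*") // skip line comments
    ///     .ignore(r"\s+")      // skip whitespace
    ///     .build()?;
    ///
    /// assert_eq!(
    ///     lexer.tokens("1 // one\n2").collect::<Vec<_>>(),
    ///     vec![
    ///         Token { kind: Tok::Num, span: 0..1, text: "1" },
    ///         Token { kind: Tok::Num, span: 9..10, text: "2" },
    ///     ],
    /// );
    /// # Ok::<(), regex::Error>(())
    /// ```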
    pub fn ignore(mut self, re: &'r str) -> Self {
        self.regexes.push(re);
        self.kinds.push(None);
        self
    }

    /// Construct a [Lexer](struct.Lexer.html) which matches these tokens.
    ///
    /// ## Errors
    ///
    /// If a regex cannot be compiled, an [Error](https://docs.rs/regex/1/regex/enum.Error.html) is returned.
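    ///
    /// A minimal sketch (the unclosed `(` is deliberately an invalid pattern):
    /// ```
    /// use regex_lexer::LexerBuilder;
    ///
    /// let result = LexerBuilder::new()
    ///     .token(r"(", "open") // invalid regex: unclosed group
    ///     .build();
    /// assert!(result.is_err());
    /// ```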
    pub fn build(self) -> Result<Lexer<K>, Error> {
        // Anchor every pattern so it can only match at the start of the
        // remaining input.
        let regexes = self.regexes.into_iter().map(|r| format!("^{}", r));
        let regex_set = RegexSet::new(regexes)?;
        // The RegexSet reports *which* patterns match but not *how much* they
        // match, so each pattern is also compiled individually to find lengths.
        let mut regexes = Vec::new();
        for pattern in regex_set.patterns() {
            regexes.push(Regex::new(pattern)?);
        }

        Ok(Lexer {
            kinds: self.kinds,
            regexes,
            regex_set,
        })
    }
}

/// A regex-based lexer.
///
/// ```
/// use regex_lexer::{LexerBuilder, Token};
///
/// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
/// enum Tok {
///     Ident,
///     // ...
/// }
///
/// let lexer = LexerBuilder::new()
///     .token(r"\p{XID_Start}\p{XID_Continue}*", Tok::Ident)
///     .ignore(r"\s+") // skip whitespace
///     // ...
///     .build()?;
///
/// let tokens = lexer.tokens("these are some identifiers");
///
/// # assert_eq!(
/// #     tokens.collect::<Vec<_>>(),
/// #     vec![
/// #         Token { kind: Tok::Ident, span: 0..5, text: "these" },
/// #         Token { kind: Tok::Ident, span: 6..9, text: "are" },
/// #         Token { kind: Tok::Ident, span: 10..14, text: "some" },
/// #         Token { kind: Tok::Ident, span: 15..26, text: "identifiers" },
/// #     ],
/// # );
/// # Ok::<(), regex::Error>(())
/// ```
#[derive(Debug)]
pub struct Lexer<K> {
    kinds: Vec<Option<K>>,
    regexes: Vec<Regex>,
    regex_set: RegexSet,
}

impl<K> Lexer<K> {
    /// Create a [LexerBuilder](struct.LexerBuilder.html).
    /// This is the same as [LexerBuilder::new](struct.LexerBuilder.html#method.new).
    pub fn builder<'r>() -> LexerBuilder<'r, K> {
        LexerBuilder::new()
    }

    /// Return an iterator over all matched tokens in `source`.
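    ///
    /// The iterator is lazy: each token is matched on demand, so lexing can
    /// stop early. A small sketch (the `"num"` kind is illustrative):
    /// ```
    /// use regex_lexer::LexerBuilder;
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[0-9]+", "num")
    ///     .ignore(r"\s+")
    ///     .build()?;
    ///
    /// let mut tokens = lexer.tokens("10 20 30");
    /// assert_eq!(tokens.next().map(|t| t.text), Some("10"));
    /// assert_eq!(tokens.next().map(|t| t.text), Some("20"));
    /// # Ok::<(), regex::Error>(())
    /// ```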
    pub fn tokens<'l, 't>(&'l self, source: &'t str) -> Tokens<'l, 't, K> {
        Tokens {
            lexer: self,
            source,
            position: 0,
        }
    }
}

/// The iterator returned by [Lexer::tokens](struct.Lexer.html#method.tokens).
#[derive(Debug)]
pub struct Tokens<'l, 't, K> {
    lexer: &'l Lexer<K>,
    source: &'t str,
    position: usize,
}

impl<'l, 't, K: Copy> Iterator for Tokens<'l, 't, K> {
    type Item = Token<'t, K>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // The end of the source has been reached; the iterator is done.
            if self.position == self.source.len() {
                return None;
            }

            let string = &self.source[self.position..];
            let match_set = self.lexer.regex_set.matches(string);
            // Of all patterns matching at the current position, take the one
            // defined last (declaration-order priority, not longest match).
            // Panics if no pattern matches the remaining input.
            let (len, i) = match_set
                .into_iter()
                .map(|i: usize| {
                    // Every pattern is anchored with `^`, so a match reported
                    // by the set must start at the beginning of `string`.
                    let m = self.lexer.regexes[i].find(string).unwrap();
                    assert!(m.start() == 0);
                    (m.end(), i)
                })
                .next_back()
                .unwrap();

            let span = self.position..self.position + len;
            let text = &self.source[span.clone()];
            self.position += len;
            // Return a token if the pattern has a kind; an ignored match
            // (kind `None`) is skipped and the loop continues.
            if let Some(kind) = self.lexer.kinds[i] {
                return Some(Token { kind, span, text });
            }
        }
    }
}
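
#[cfg(test)]
mod tests {
    use super::*;

    // A sketch of the current behavior on input that no pattern matches:
    // the token iterator panics rather than returning an error value.
    // (This test is illustrative and assumes that behavior is intended.)
    #[test]
    #[should_panic]
    fn unmatched_input_panics() {
        let lexer = LexerBuilder::new()
            .token(r"[0-9]+", "num")
            .build()
            .unwrap();
        let _ = lexer.tokens("abc").collect::<Vec<_>>();
    }
}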