regex_lexer/
lib.rs

#![doc(html_root_url = "https://docs.rs/regex-lexer/0.2.0/regex-lexer")]
//! A regex-based lexer (tokenizer).
//!
//! ```
//! use regex_lexer::{LexerBuilder, Token};
//!
//! #[derive(Debug, PartialEq, Eq, Clone, Copy)]
//! enum Tok {
//!     Num,
//!     Add,
//!     Sub,
//!     Mul,
//!     Div,
//!     Open,
//!     Close,
//! }
//!
//! let lexer = LexerBuilder::new()
//!     .token(r"[0-9]+", Tok::Num)
//!     .token(r"\+", Tok::Add)
//!     .token(r"-", Tok::Sub)
//!     .token(r"\*", Tok::Mul)
//!     .token(r"/", Tok::Div)
//!     .token(r"\(", Tok::Open)
//!     .token(r"\)", Tok::Close)
//!     .ignore(r"\s+")
//!     .build()?;
//!
//! let source = "(1 + 2) * 3";
//! assert_eq!(
//!     lexer.tokens(source).collect::<Vec<_>>(),
//!     vec![
//!         Token { kind: Tok::Open, span: 0..1, text: "(" },
//!         Token { kind: Tok::Num, span: 1..2, text: "1" },
//!         Token { kind: Tok::Add, span: 3..4, text: "+" },
//!         Token { kind: Tok::Num, span: 5..6, text: "2" },
//!         Token { kind: Tok::Close, span: 6..7, text: ")" },
//!         Token { kind: Tok::Mul, span: 8..9, text: "*" },
//!         Token { kind: Tok::Num, span: 10..11, text: "3" },
//!     ],
//! );
//! # Ok::<(), regex_lexer::Error>(())
//! ```

use std::ops::Range;

use regex::{Regex, RegexSet};
pub use regex::Error;

/// A token returned by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'t, K> {
    /// The kind of the token, as registered with [LexerBuilder::token](struct.LexerBuilder.html#method.token).
    pub kind: K,
    /// The byte range of the token within the source string.
    pub span: Range<usize>,
    /// The text matched by the token's regex.
    pub text: &'t str,
}

/// Builder struct for [Lexer](struct.Lexer.html).
pub struct LexerBuilder<'r, K> {
    regexes: Vec<&'r str>,
    kinds: Vec<Option<K>>,
}

impl<'r, K> Default for LexerBuilder<'r, K> {
    fn default() -> Self {
        Self::new()
    }
}

impl<'r, K> LexerBuilder<'r, K> {
    /// Create a new [LexerBuilder](struct.LexerBuilder.html).
    pub fn new() -> Self {
        LexerBuilder {
            regexes: Vec::new(),
            kinds: Vec::new(),
        }
    }

    /// Add a new token that matches the regular expression `re`.
    /// This uses the same syntax as the [regex](https://docs.rs/regex/1/regex) crate.
    ///
    /// When the regex matches, the lexer produces a token of kind `kind`.
    /// ```
    /// use regex_lexer::{LexerBuilder, Token};
    ///
    /// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    /// enum Tok {
    ///     Num,
    ///     // ...
    /// }
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[0-9]+", Tok::Num)
    ///     .ignore(r"\s+") // skip whitespace
    ///     // ...
    ///     .build()?;
    ///
    /// assert_eq!(
    ///     lexer.tokens("1 2 3").collect::<Vec<_>>(),
    ///     vec![
    ///         Token { kind: Tok::Num, span: 0..1, text: "1" },
    ///         Token { kind: Tok::Num, span: 2..3, text: "2" },
    ///         Token { kind: Tok::Num, span: 4..5, text: "3" },
    ///     ],
    /// );
    /// # Ok::<(), regex_lexer::Error>(())
    /// ```
    ///
    /// If multiple regexes match, the one defined last takes priority.
    /// This makes it easy to give keywords precedence over a more general
    /// identifier pattern:
    /// ```
    /// use regex_lexer::{LexerBuilder, Token};
    ///
    /// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    /// enum Tok {
    ///     Ident,
    ///     Let,
    ///     // ...
    /// }
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[a-zA-Z_][a-zA-Z0-9_]*", Tok::Ident)
    ///     .token(r"let\b", Tok::Let)
    ///     // ...
    ///     .ignore(r"\s+")
    ///     .build()?;
    ///
    /// assert_eq!(
    ///     lexer.tokens("let lettuce").collect::<Vec<_>>(),
    ///     vec![
    ///         Token { kind: Tok::Let, span: 0..3, text: "let" },
    ///         Token { kind: Tok::Ident, span: 4..11, text: "lettuce" },
    ///     ],
    /// );
    /// # Ok::<(), regex_lexer::Error>(())
    /// ```
    pub fn token(mut self, re: &'r str, kind: K) -> Self {
        self.regexes.push(re);
        self.kinds.push(Some(kind));
        self
    }

    /// Add a regex whose matches are skipped entirely; the matched text
    /// produces no token.
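    ///
    /// For instance, a lexer can skip whitespace and `#`-style line comments
    /// (the comment syntax here is just an illustration):
    /// ```
    /// use regex_lexer::{LexerBuilder, Token};
    ///
    /// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    /// enum Tok {
    ///     Num,
    /// }
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[0-9]+", Tok::Num)
    ///     .ignore(r"\s+")
    ///     .ignore(r"#[^\n]*")
    ///     .build()?;
    ///
    /// assert_eq!(
    ///     lexer.tokens("1 # one\n2").collect::<Vec<_>>(),
    ///     vec![
    ///         Token { kind: Tok::Num, span: 0..1, text: "1" },
    ///         Token { kind: Tok::Num, span: 8..9, text: "2" },
    ///     ],
    /// );
    /// # Ok::<(), regex_lexer::Error>(())
    /// ```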
    pub fn ignore(mut self, re: &'r str) -> Self {
        self.regexes.push(re);
        self.kinds.push(None);
        self
    }

    /// Construct a [Lexer](struct.Lexer.html) which matches these tokens.
    ///
    /// ## Errors
    ///
    /// If any of the regexes cannot be compiled, an [Error](https://docs.rs/regex/1/regex/enum.Error.html) is returned.
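    ///
    /// For example, a pattern with an unbalanced parenthesis fails to compile:
    /// ```
    /// use regex_lexer::LexerBuilder;
    ///
    /// let result = LexerBuilder::new()
    ///     .token(r"(unclosed", ())
    ///     .build();
    /// assert!(result.is_err());
    /// ```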
    pub fn build(self) -> Result<Lexer<K>, Error> {
        // Anchor each pattern so it can only match at the current position.
        let regexes = self.regexes.into_iter().map(|r| format!("^{}", r));
        let regex_set = RegexSet::new(regexes)?;
        // Also compile each pattern individually: a `RegexSet` only reports
        // *which* patterns matched, not *how much* text they matched.
        let mut regexes = Vec::new();
        for pattern in regex_set.patterns() {
            regexes.push(Regex::new(pattern)?);
        }

        Ok(Lexer {
            kinds: self.kinds,
            regexes,
            regex_set,
        })
    }
}

/// A regex-based lexer.
///
/// ```
/// use regex_lexer::{LexerBuilder, Token};
///
/// #[derive(Debug, PartialEq, Eq, Clone, Copy)]
/// enum Tok {
///     Ident,
///     // ...
/// }
///
/// let lexer = LexerBuilder::new()
///     .token(r"\p{XID_Start}\p{XID_Continue}*", Tok::Ident)
///     .ignore(r"\s+") // skip whitespace
///     // ...
///     .build()?;
///
/// let tokens = lexer.tokens("these are some identifiers");
///
/// # assert_eq!(
/// #     tokens.collect::<Vec<_>>(),
/// #     vec![
/// #         Token { kind: Tok::Ident, span: 0..5, text: "these" },
/// #         Token { kind: Tok::Ident, span: 6..9, text: "are" },
/// #         Token { kind: Tok::Ident, span: 10..14, text: "some" },
/// #         Token { kind: Tok::Ident, span: 15..26, text: "identifiers" },
/// #     ],
/// # );
/// # Ok::<(), regex_lexer::Error>(())
/// ```
#[derive(Debug)]
pub struct Lexer<K> {
    kinds: Vec<Option<K>>,
    regexes: Vec<Regex>,
    regex_set: RegexSet,
}

impl<K> Lexer<K> {
    /// Create a [LexerBuilder](struct.LexerBuilder.html). This is the same as [LexerBuilder::new](struct.LexerBuilder.html#method.new).
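    ///
    /// A short example (the `u8` token kind here is arbitrary):
    /// ```
    /// use regex_lexer::Lexer;
    ///
    /// let lexer = Lexer::builder()
    ///     .token(r"[0-9]+", 0u8)
    ///     .build()?;
    /// assert_eq!(lexer.tokens("42").next().map(|tok| tok.text), Some("42"));
    /// # Ok::<(), regex_lexer::Error>(())
    /// ```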
    pub fn builder<'r>() -> LexerBuilder<'r, K> {
        LexerBuilder::new()
    }

    /// Return an iterator over all tokens matched in `source`.
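    ///
    /// Token kinds may be any `Copy` type; for instance, plain string labels:
    /// ```
    /// use regex_lexer::LexerBuilder;
    ///
    /// let lexer = LexerBuilder::new()
    ///     .token(r"[0-9]+", "num")
    ///     .token(r"[a-z]+", "word")
    ///     .ignore(r"\s+")
    ///     .build()?;
    ///
    /// let kinds: Vec<_> = lexer.tokens("12 apples").map(|tok| tok.kind).collect();
    /// assert_eq!(kinds, vec!["num", "word"]);
    /// # Ok::<(), regex_lexer::Error>(())
    /// ```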
    pub fn tokens<'l, 't>(&'l self, source: &'t str) -> Tokens<'l, 't, K> {
        Tokens {
            lexer: self,
            source,
            position: 0,
        }
    }
}

/// The type returned by [Lexer::tokens](struct.Lexer.html#method.tokens).
#[derive(Debug)]
pub struct Tokens<'l, 't, K> {
    lexer: &'l Lexer<K>,
    source: &'t str,
    position: usize,
}

impl<'l, 't, K: Copy> Iterator for Tokens<'l, 't, K> {
    type Item = Token<'t, K>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.position == self.source.len() {
                return None;
            }

            let string = &self.source[self.position..];
            let match_set = self.lexer.regex_set.matches(string);
            // Of all the patterns matching at this position, take the one
            // defined last; its match length tells us how far to advance.
            let (len, i) = match_set
                .into_iter()
                .map(|i: usize| {
                    let m = self.lexer.regexes[i].find(string).unwrap();
                    // Every pattern is anchored with `^`, so it can only
                    // match at the start of the remaining input.
                    assert_eq!(m.start(), 0);
                    (m.end(), i)
                })
                .next_back()
                .expect("no pattern matched; the lexer cannot make progress");

            let span = self.position..self.position + len;
            let text = &self.source[span.clone()];
            self.position += len;
            // `None` marks an ignored pattern: skip it and keep scanning.
            if let Some(kind) = self.lexer.kinds[i] {
                return Some(Token { kind, span, text });
            }
        }
    }
}