// panfix/lexer.rs

1//! A lexer (a.k.a. tokenizer) that produces an iterator of (token, lexeme) pairs.
2//!
3//! Usage:
4//!
5//! ```
6//! use panfix::TOKEN_ERROR;
7//! use panfix::implementation::lexer::{LexerBuilder, Lexer};
8//!
9//! let whitespace_regex = r#"[ \t\r\n]+"#;
10//! let mut builder = LexerBuilder::new(whitespace_regex).unwrap();
11//! let tok_plus = builder.string("+").unwrap();
12//! let tok_var = builder.regex("[a-zA-Z_]+").unwrap();
13//! let lexer = builder.finish().unwrap();
14//!
15//! let mut lexemes = lexer.lex("x + y");
16//! assert_eq!(lexemes.next().unwrap().token, tok_var);
17//! assert_eq!(lexemes.next().unwrap().token, tok_plus);
18//! assert_eq!(lexemes.next().unwrap().token, tok_var);
19//! assert_eq!(lexemes.next(), None);
20//!
21//! let mut lexemes = lexer.lex("x @$!");
22//! assert_eq!(lexemes.next().unwrap().token, tok_var);
23//! assert_eq!(lexemes.next().unwrap().token, TOKEN_ERROR);
24//! ```
25//!
26//! Whitespace is skipped. If there is a lexing error, it is represented as an item in the iterator
27//! whose `token` is `TOKEN_ERROR`.
28//!
29//! If there are multiple possible matches:
30//!
31//! - The longest match is used.
32//! - If there is a tie, whichever token is a 'string' pattern instead of a 'regex' pattern will be
33//!   used.
//! - If there is _still_ a tie, whichever pattern was added to the `LexerBuilder` first will
//!   be used.
36
37use crate::{Lexeme, Offset, Position, Span, TokenId, TOKEN_ERROR};
38use regex::{escape, Regex, RegexSet};
39
40pub use regex::Error as RegexError;
41
/// White space as defined by the Pattern_White_Space Unicode property.
///
/// Note the trailing `*`: this matches zero or more such characters, so when used
/// as a whitespace regex it always matches (possibly the empty string).
pub const UNICODE_WHITESPACE_REGEX: &str =
    "[\\u0009\\u000A\\u000B\\u000C\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029]*";
45
/// One token pattern known to the lexer.
#[derive(Debug, Clone)]
struct Pattern {
    // The compiled regex for this pattern. Patterns added via `string`/`regex`
    // are anchored through `new_regex` (wrapped in `^(...)`); reserved tokens
    // use a regex that can never match.
    regex: Regex,
    // `Some(byte_length)` for fixed-string patterns — used both as the match
    // length and as the tie-breaker favoring strings over regexes.
    // `None` for regex patterns and reserved tokens.
    length: Option<usize>,
}
51
52impl PartialEq for Pattern {
53    fn eq(&self, other: &Pattern) -> bool {
54        self.regex.as_str() == other.regex.as_str() && self.length == other.length
55    }
56}
57
58impl Eq for Pattern {}
59
/// A builder for `Lexer`. Specify the patterns to match.
#[derive(Debug, Clone)]
pub struct LexerBuilder {
    // Regex used to skip whitespace between lexemes (anchored via `new_regex`).
    whitespace: Regex,
    // All patterns added so far, indexed by their `TokenId`. The first three
    // slots are reserved in `new` (TOKEN_ERROR, TOKEN_BLANK, TOKEN_JUXTAPOSE).
    patterns: Vec<Pattern>,
}
66
67impl LexerBuilder {
68    pub fn new(whitespace_regex: &str) -> Result<LexerBuilder, RegexError> {
69        let mut builder = LexerBuilder {
70            whitespace: new_regex(whitespace_regex)?,
71            patterns: vec![],
72        };
73        builder.reserve_token()?; // Reserved for TOKEN_ERROR
74        builder.reserve_token()?; // Reserved for TOKEN_BLANK
75        builder.reserve_token()?; // Reserved for TOKEN_JUXTAPOSE
76        Ok(builder)
77    }
78
79    /// Add a pattern that matches exactly the string provided. Returns the token that will be
80    /// produced whenever this pattern matches. If the provided pattern was already added,
81    /// returns the pre-existing token for it.
82    pub fn string(&mut self, constant: &str) -> Result<TokenId, RegexError> {
83        let pattern = Pattern {
84            regex: new_regex(&escape(constant))?,
85            length: Some(constant.len()),
86        };
87
88        for (existing_token, existing_pattern) in self.patterns.iter().enumerate() {
89            if &pattern == existing_pattern {
90                return Ok(existing_token);
91            }
92        }
93
94        let token = self.patterns.len();
95        self.patterns.push(pattern);
96        Ok(token)
97    }
98
99    /// Add a pattern that matches the given regex. Returns the token that will be produced whenever
100    /// this pattern matches.
101    ///
102    /// The syntax is that of the `regex` crate. You do not need to begin the pattern with a
103    /// start-of-string character `^`.
104    pub fn regex(&mut self, regex: &str) -> Result<TokenId, RegexError> {
105        let pattern = Pattern {
106            regex: new_regex(regex)?,
107            length: None,
108        };
109
110        for (existing_token, existing_pattern) in self.patterns.iter().enumerate() {
111            if &pattern == existing_pattern {
112                return Ok(existing_token);
113            }
114        }
115
116        let token = self.patterns.len();
117        self.patterns.push(pattern);
118        Ok(token)
119    }
120
121    /// Reserve a token for personal use.
122    pub fn reserve_token(&mut self) -> Result<TokenId, RegexError> {
123        let pattern = Pattern {
124            regex: Regex::new("$.")?,
125            length: None,
126        };
127
128        let token = self.patterns.len();
129        self.patterns.push(pattern);
130        Ok(token)
131    }
132
133    /// Call this when you're done adding token patterns, to construct the lexer.
134    pub fn finish(self) -> Result<Lexer, RegexError> {
135        Ok(Lexer {
136            whitespace: self.whitespace,
137            regex_set: RegexSet::new(self.patterns.iter().map(|p| p.regex.as_str()))?,
138            patterns: self.patterns,
139        })
140    }
141}
142
143fn new_regex(regex: &str) -> Result<Regex, RegexError> {
144    Regex::new(&format!("^({})", regex))
145}
146
/// A set of patterns to use to lex.
#[derive(Debug, Clone)]
pub struct Lexer {
    // Anchored regex for skipping whitespace before each lexeme.
    whitespace: Regex,
    // One pattern per token id, in the same order they were added to the builder.
    patterns: Vec<Pattern>,
    // All patterns compiled together, used to find which tokens match at a
    // position in a single pass.
    regex_set: RegexSet,
}
154
impl Lexer {
    /// Split `source` into a stream of lexemes. It is frequently useful to wrap this in
    /// [`iter::Peekable`](https://doc.rust-lang.org/stable/std/iter/struct.Peekable.html).
    ///
    /// The iterator borrows both the lexer and `source`; `'s: 'l` says the source
    /// must outlive the lexer borrow carried by the returned iterator.
    pub fn lex<'l, 's: 'l>(&'l self, source: &'s str) -> impl Iterator<Item = Lexeme> + 'l {
        LexemeIter::new(self, source)
    }

    /// The number of tokens. Each `TokenId` returned by the builder is guaranteed to be smaller than
    /// this number.
    pub fn num_tokens(&self) -> usize {
        self.patterns.len()
    }
}
168
/// Iterator state for lexing: the unread tail of the source plus the current
/// line/column position and byte offset.
#[derive(Debug, Clone)]
struct LexemeIter<'l, 's> {
    // The lexer whose patterns drive the tokenization.
    lexer: &'l Lexer,
    // The not-yet-consumed suffix of the original source.
    source: &'s str,
    // Line/column position at the start of `source`.
    position: Position,
    // Byte offset from the start of the original source.
    offset: Offset,
}
176
177impl<'l, 's> LexemeIter<'l, 's> {
178    fn new(lexer: &'l Lexer, source: &'s str) -> LexemeIter<'l, 's> {
179        LexemeIter {
180            lexer,
181            source,
182            position: Position {
183                line: 0,
184                col: 0,
185                utf8_col: 0,
186            },
187            offset: 0,
188        }
189    }
190
191    fn consume(&mut self, len: usize) -> Span {
192        let start = self.position;
193        for ch in self.source[..len].chars() {
194            self.offset += ch.len_utf8();
195            self.position = self.position.advance_by_char(ch);
196        }
197        let end = self.position;
198
199        self.source = &self.source[len..];
200        Span { start, end }
201    }
202}
203
impl Iterator for LexemeIter<'_, '_> {
    type Item = Lexeme;

    /// Produce the next lexeme, or `None` at end of input. A lexing error is
    /// reported in-band as a lexeme whose `token` is `TOKEN_ERROR`.
    fn next(&mut self) -> Option<Lexeme> {
        // Consume whitespace. The whitespace regex is anchored (`^(...)` via
        // `new_regex`), so any match — possibly empty — starts at position 0.
        if let Some(span) = self.lexer.whitespace.find(self.source) {
            self.consume(span.end());
        }

        // If we're at the end of the file, we're done.
        if self.source.is_empty() {
            return None;
        }

        // Find the best match (longest, with a tie-breaker of is_str).
        // The RegexSet reports *which* patterns match here, in pattern order.
        let mut best_match: Option<(TokenId, usize, bool)> = None;
        for token in &self.lexer.regex_set.matches(self.source) {
            let pattern = &self.lexer.patterns[token];

            // Find the length (and tie-breaker is_str) of this match.
            // String patterns know their byte length up front; regex patterns
            // must be re-run individually to learn how far they matched.
            let (len, is_str) = if let Some(len) = pattern.length {
                (len, true)
            } else {
                // unwrap is safe: the RegexSet just reported this pattern matches.
                (pattern.regex.find(self.source).unwrap().end(), false)
            };

            // If this is longer (or tie breaks) the best match so far, replace it.
            // Strict `>` on the (len, is_str) pair means: longest match wins;
            // on equal length a string pattern beats a regex pattern; and on a
            // full tie the earlier-registered token (seen first) is kept.
            let is_best_match = if let Some((_, best_len, best_is_str)) = best_match {
                (len, is_str) > (best_len, best_is_str)
            } else {
                true
            };
            if is_best_match {
                best_match = Some((token, len, is_str));
            }
        }

        // If there was a best match, consume and return it.
        if let Some((token, len, _)) = best_match {
            let span = self.consume(len);
            return Some(Lexeme { token, span });
        }

        // Otherwise, nothing matched. Lex error! By definition we can't lex, but let's say the
        // problem lies in the current chunk of non-basic-whitespace characters.
        let basic_whitespace = &[' ', '\t', '\r', '\n'];
        let len = self
            .source
            .find(basic_whitespace)
            .unwrap_or(self.source.len());
        let span = self.consume(len);
        Some(Lexeme {
            token: TOKEN_ERROR,
            span,
        })
    }
}
260}