1use crate::{Lexeme, Offset, Position, Span, TokenId, TOKEN_ERROR};
38use regex::{escape, Regex, RegexSet};
39
40pub use regex::Error as RegexError;
41
/// Matches a (possibly empty) run of whitespace-like characters; suitable as
/// the `whitespace_regex` argument to [`LexerBuilder::new`].
///
/// NOTE(review): U+200E/U+200F are direction *marks*, not whitespace, and
/// several Unicode whitespace characters (e.g. U+00A0, U+2000..U+200A) are
/// absent from this class — confirm the set is intentional.
pub const UNICODE_WHITESPACE_REGEX: &str =
    "[\\u0009\\u000A\\u000B\\u000C\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029]*";
45
/// A single token pattern registered with the lexer.
#[derive(Debug, Clone)]
struct Pattern {
    // The compiled pattern. For tokens added via `string()`/`regex()` this is
    // anchored as `^(...)` by `new_regex`; reserved tokens use the
    // never-matching `"$."` instead.
    regex: Regex,
    // `Some(byte_len)` for patterns built from a literal string constant
    // (every match has exactly the constant's length); `None` for general
    // regex patterns, whose match length must be measured per input.
    length: Option<usize>,
}
51
52impl PartialEq for Pattern {
53 fn eq(&self, other: &Pattern) -> bool {
54 self.regex.as_str() == other.regex.as_str() && self.length == other.length
55 }
56}
57
58impl Eq for Pattern {}
59
/// Incrementally builds a [`Lexer`]: register token patterns, then call
/// [`LexerBuilder::finish`].
#[derive(Debug, Clone)]
pub struct LexerBuilder {
    // Matches text to skip between tokens.
    whitespace: Regex,
    // One pattern per token; a token's id is its index in this vector.
    patterns: Vec<Pattern>,
}
66
67impl LexerBuilder {
68 pub fn new(whitespace_regex: &str) -> Result<LexerBuilder, RegexError> {
69 let mut builder = LexerBuilder {
70 whitespace: new_regex(whitespace_regex)?,
71 patterns: vec![],
72 };
73 builder.reserve_token()?; builder.reserve_token()?; builder.reserve_token()?; Ok(builder)
77 }
78
79 pub fn string(&mut self, constant: &str) -> Result<TokenId, RegexError> {
83 let pattern = Pattern {
84 regex: new_regex(&escape(constant))?,
85 length: Some(constant.len()),
86 };
87
88 for (existing_token, existing_pattern) in self.patterns.iter().enumerate() {
89 if &pattern == existing_pattern {
90 return Ok(existing_token);
91 }
92 }
93
94 let token = self.patterns.len();
95 self.patterns.push(pattern);
96 Ok(token)
97 }
98
99 pub fn regex(&mut self, regex: &str) -> Result<TokenId, RegexError> {
105 let pattern = Pattern {
106 regex: new_regex(regex)?,
107 length: None,
108 };
109
110 for (existing_token, existing_pattern) in self.patterns.iter().enumerate() {
111 if &pattern == existing_pattern {
112 return Ok(existing_token);
113 }
114 }
115
116 let token = self.patterns.len();
117 self.patterns.push(pattern);
118 Ok(token)
119 }
120
121 pub fn reserve_token(&mut self) -> Result<TokenId, RegexError> {
123 let pattern = Pattern {
124 regex: Regex::new("$.")?,
125 length: None,
126 };
127
128 let token = self.patterns.len();
129 self.patterns.push(pattern);
130 Ok(token)
131 }
132
133 pub fn finish(self) -> Result<Lexer, RegexError> {
135 Ok(Lexer {
136 whitespace: self.whitespace,
137 regex_set: RegexSet::new(self.patterns.iter().map(|p| p.regex.as_str()))?,
138 patterns: self.patterns,
139 })
140 }
141}
142
143fn new_regex(regex: &str) -> Result<Regex, RegexError> {
144 Regex::new(&format!("^({})", regex))
145}
146
/// A compiled lexer. Construct one with [`LexerBuilder`].
#[derive(Debug, Clone)]
pub struct Lexer {
    // Skipped before each token.
    whitespace: Regex,
    // Indexed by token id; used to measure each individual match's length.
    patterns: Vec<Pattern>,
    // All patterns compiled together, for a single "which tokens match
    // here?" query per lexeme.
    regex_set: RegexSet,
}
154
impl Lexer {
    /// Lazily lex `source`, yielding one [`Lexeme`] per token.
    ///
    /// Unrecognized input does not stop iteration; it is yielded as
    /// `TOKEN_ERROR` lexemes.
    pub fn lex<'l, 's: 'l>(&'l self, source: &'s str) -> impl Iterator<Item = Lexeme> + 'l {
        LexemeIter::new(self, source)
    }

    /// Total number of token ids this lexer knows about, including reserved
    /// tokens whose patterns never match.
    pub fn num_tokens(&self) -> usize {
        self.patterns.len()
    }
}
168
/// Iterator over the lexemes of a source string; created by [`Lexer::lex`].
#[derive(Debug, Clone)]
struct LexemeIter<'l, 's> {
    lexer: &'l Lexer,
    // The not-yet-lexed tail of the input.
    source: &'s str,
    // Line/column position at the start of `source`.
    position: Position,
    // Byte offset from the start of the original input (advanced by
    // `len_utf8` per consumed char).
    offset: Offset,
}
176
177impl<'l, 's> LexemeIter<'l, 's> {
178 fn new(lexer: &'l Lexer, source: &'s str) -> LexemeIter<'l, 's> {
179 LexemeIter {
180 lexer,
181 source,
182 position: Position {
183 line: 0,
184 col: 0,
185 utf8_col: 0,
186 },
187 offset: 0,
188 }
189 }
190
191 fn consume(&mut self, len: usize) -> Span {
192 let start = self.position;
193 for ch in self.source[..len].chars() {
194 self.offset += ch.len_utf8();
195 self.position = self.position.advance_by_char(ch);
196 }
197 let end = self.position;
198
199 self.source = &self.source[len..];
200 Span { start, end }
201 }
202}
203
impl Iterator for LexemeIter<'_, '_> {
    type Item = Lexeme;

    /// Produce the next lexeme, or `None` at end of input.
    ///
    /// Selection rule: among all patterns matching at the current position,
    /// the longest match wins ("maximal munch"); on a length tie, string
    /// patterns beat regex patterns; on a full tie, the earliest-registered
    /// token wins (the strict `>` below never replaces an equal best).
    fn next(&mut self) -> Option<Lexeme> {
        // Skip leading whitespace. The whitespace regex is anchored
        // (`^(...)` via `new_regex`), so `find` only matches at the start.
        if let Some(span) = self.lexer.whitespace.find(self.source) {
            self.consume(span.end());
        }

        if self.source.is_empty() {
            return None;
        }

        // best_match = (token id, match length in bytes, is-string-pattern).
        let mut best_match: Option<(TokenId, usize, bool)> = None;
        // One RegexSet query tells us *which* patterns match; each matching
        // pattern is then measured individually below.
        for token in &self.lexer.regex_set.matches(self.source) {
            let pattern = &self.lexer.patterns[token];

            // String patterns have a fixed, known length; regex patterns
            // must be re-run to find how far they matched.
            let (len, is_str) = if let Some(len) = pattern.length {
                (len, true)
            } else {
                // `unwrap` is safe: the RegexSet just reported this pattern
                // as matching.
                (pattern.regex.find(self.source).unwrap().end(), false)
            };

            // Lexicographic tuple comparison: longer first, then
            // string-over-regex on ties.
            let is_best_match = if let Some((_, best_len, best_is_str)) = best_match {
                (len, is_str) > (best_len, best_is_str)
            } else {
                true
            };
            if is_best_match {
                best_match = Some((token, len, is_str));
            }
        }

        if let Some((token, len, _)) = best_match {
            let span = self.consume(len);
            return Some(Lexeme { token, span });
        }

        // No pattern matched: emit an error lexeme covering everything up to
        // the next ASCII whitespace character (or end of input), so lexing
        // can resume afterwards instead of aborting.
        let basic_whitespace = &[' ', '\t', '\r', '\n'];
        let len = self
            .source
            .find(basic_whitespace)
            .unwrap_or(self.source.len());
        let span = self.consume(len);
        Some(Lexeme {
            token: TOKEN_ERROR,
            span,
        })
    }
}