/// The kinds of token the scanner can produce. `Eof` and `Unknown` are
/// synthesized by the scanner itself and have no regex pattern.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    Id,
    Int,
    Float,
    String,
    Dot,
    Hash,
    Star,
    Plus,
    Greater,
    Tilde,
    Whitespace,
    Question,
    BraceOpen,
    BraceClose,
    Dollar,
    Pipe,
    ParenOpen,
    ParenClose,
    Comma,
    Colon,
    Semi,
    Less,
    BracketOpen,
    BracketClose,
    Comment,
    Eof,
    Unknown,
}

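// Regex tables for the scanner: `TOKENS`, `REGEX_SET`, and `REGEX_LIST` are
// generated together by `make_regex_set!` so the three stay index-aligned.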
mod statics {
    use super::Token;
    use regex::{Regex, RegexSet};
    use std::sync::LazyLock;

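    // For each `Token <- pattern` pair this macro emits three parallel
    // statics: the token list, a `RegexSet` for matching all patterns at
    // once, and the individually compiled regexes (needed to recover match
    // lengths, which `RegexSet` alone does not report). Every pattern is
    // anchored with `^` so matches always start at the current position.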
    macro_rules! make_regex_set {
        {$vis: vis ($tokens: ident, $re_set: ident, $re_compiled: ident) = {$($tk: ident <- $pat: literal)*};} => {
            $vis static $tokens: &[Token] = &[
                $(Token::$tk, )*
            ];

            $vis static $re_set: LazyLock<RegexSet> = LazyLock::new(|| RegexSet::new(&[
                $(
                    concat!("^", $pat),
                )*
            ]).expect("error building RegexSet"));

            $vis static $re_compiled: LazyLock<Vec<Regex>> = LazyLock::new(|| vec![
                $(
                    Regex::new(concat!("^", $pat)).expect(concat!("Error building Regex `", $pat, "`")),
                )*
            ]);
        };
    }

    make_regex_set! {
        pub(super) (TOKENS, REGEX_SET, REGEX_LIST) = {
            Id <- "[a-zA-Z][a-zA-Z0-9_-]*"
            Int <- "[+-]?[0-9]+"
            Float <- r"[+-]?[0-9]+\.[0-9]*"
            String <- r#""(\\.|[^\\"])*""#
            Dot <- r"\."
            Hash <- "#"
            Star <- r"\*"
            Plus <- r"\+"
            Greater <- ">"
            Tilde <- "~"
            Whitespace <- r"\p{White_Space}+"
            Question <- r"\?"
            BraceOpen <- r"\{"
            BraceClose <- r"\}"
            Dollar <- r"\$"
            Pipe <- r"\|"
            ParenOpen <- r"\("
            ParenClose <- r"\)"
            Comma <- ","
            Colon <- ":"
            Semi <- ";"
            Less <- "<"
            BracketOpen <- r"\["
            BracketClose <- r"\]"
            Comment <- r"//[^\n]*"
        };
    }
}

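/// A lazy lexer over an input slice: `idx` is the current byte offset and
/// `line` the 1-based line number, maintained as tokens are eaten.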
#[derive(Debug, Clone)]
pub struct Scanner<'a> {
    slice: &'a str,
    idx: usize,
    line: usize,
}

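/// Byte range of a lexeme within the input, plus the line it starts on.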
#[derive(Debug, Clone, Copy, Default)]
#[non_exhaustive]
pub struct Span {
    pub line: usize,
    pub start: usize,
    pub end: usize,
}

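/// A token paired with the slice of input it matched.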
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Lexeme<'a> {
    pub token: Token,
    pub value: &'a str,
}

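// Sentinel lexeme returned once the scanner has consumed all input.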
const EOF: Lexeme = Lexeme {
    token: Token::Eof,
    value: "",
};

impl<'a> Scanner<'a> {
    #[must_use]
    pub const fn new(slice: &'a str) -> Self {
        Self {
            slice,
            idx: 0,
            line: 1,
        }
    }

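    /// Returns the next token without consuming it. Every pattern is tried
    /// against the remaining input and the longest match wins (maximal
    /// munch); if nothing matches, a one-character `Unknown` lexeme is
    /// produced so the caller can report an error and keep scanning.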
    #[must_use]
    pub fn peek_token(&self) -> (Span, Lexeme<'a>) {
        if self.idx >= self.slice.len() {
            return (Span::default(), EOF);
        }

        statics::REGEX_SET
            .matches(&self.slice[self.idx..])
            .into_iter()
            .map(|x| Lexeme {
                token: statics::TOKENS[x],
                value: statics::REGEX_LIST[x]
                    .find(&self.slice[self.idx..])
                    .expect("matched in set should match in list")
                    .as_str(),
            })
            .max_by_key(|x| x.value.len())
            .map(|lx| {
                (
                    Span {
                        line: self.line,
                        start: self.idx,
                        end: self.idx + lx.value.len(),
                    },
                    lx,
                )
            })
            .unwrap_or_else(|| {
                // Take a whole character, not a single byte, so a stray
                // multi-byte character can't split a UTF-8 boundary.
                let len = self.slice[self.idx..]
                    .chars()
                    .next()
                    .map_or(1, char::len_utf8);
                (
                    Span {
                        line: self.line,
                        start: self.idx,
                        end: self.idx + len,
                    },
                    Lexeme {
                        token: Token::Unknown,
                        value: &self.slice[self.idx..self.idx + len],
                    },
                )
            })
    }

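    /// Consumes and returns the next token, advancing the offset and
    /// bumping the line count by the number of newlines in the lexeme.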
    pub fn eat_token(&mut self) -> (Span, Lexeme<'a>) {
        let (span, lexeme) = self.peek_token();
        self.idx += lexeme.value.len();
        self.line += lexeme.value.chars().filter(|&x| x == '\n').count();
        (span, lexeme)
    }

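    /// Peeks at the next non-comment token, consuming any comments along
    /// the way (hence `&mut self`).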
    pub fn peek_non_comment(&mut self) -> (Span, Lexeme<'a>) {
        while let (
            _,
            Lexeme {
                token: Token::Comment,
                ..
            },
        ) = self.peek_token()
        {
            self.eat_token();
        }
        self.peek_token()
    }

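    /// Peeks at the next token that is neither whitespace nor a comment,
    /// consuming everything skipped along the way.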
    pub fn peek_non_whitespace(&mut self) -> (Span, Lexeme<'a>) {
        while let (
            _,
            Lexeme {
                token: Token::Whitespace,
                ..
            },
        ) = self.peek_non_comment()
        {
            self.eat_token();
        }
        self.peek_token()
    }
}

#[cfg(test)]
mod tests {
    use super::{
        statics::{REGEX_LIST, REGEX_SET},
        Lexeme, Scanner, Token, EOF,
    };

    #[test]
    fn test_tokens() {
        let scanner = Scanner::new("");
        assert_eq!(scanner.peek_token().1, EOF);

        macro_rules! test_matches {
            {$($tk: ident => $($pat: literal)+ $(!($($npat: literal)+))?)* } => {
                $(
                    $(
                        assert_eq!(
                            Scanner::new($pat).peek_token().1,
                            Lexeme { token: Token::$tk, value: $pat }
                        );
                    )+

                    $(
                        $(
                            assert_ne!(
                                Scanner::new($npat).peek_token().1,
                                Lexeme { token: Token::$tk, value: $npat }
                            );
                        )*
                    )?
                )*
            };
        }

        test_matches! {
            Id => "a" "a-" "A9-9-9-9" "a____a" !("9" "-" "_")
            Int => "+1" "1" "1234" "-1" !("+" "-")
            Float => "0." "-0.1234" "+0.12345" !("1" ".5" "-.5" ".")
            String => r#""hello!""# r#""""# r#""\"""# !(r#"""""# r#""\""#)
            Dot => "." !("a")
            Star => "*"
            Plus => "+"
            Question => "?"
            Pipe => "|"
            BracketOpen => "["
            BracketClose => "]"
        }
    }

    macro_rules! lx {
        ($tk: ident, $lit: literal) => {
            Lexeme {
                token: Token::$tk,
                value: $lit,
            }
        };
    }

    #[test]
    fn test_eat() {
        let mut sc = Scanner::new("h3 h4#h5.h6 {}");
        assert_eq!(sc.eat_token().1, lx!(Id, "h3"));
        assert_eq!(sc.eat_token().1, lx!(Whitespace, " "));
        assert_eq!(sc.eat_token().1, lx!(Id, "h4"));
        assert_eq!(sc.eat_token().1, lx!(Hash, "#"));
        assert_eq!(sc.eat_token().1, lx!(Id, "h5"));
        assert_eq!(sc.eat_token().1, lx!(Dot, "."));
        assert_eq!(sc.eat_token().1, lx!(Id, "h6"));
        assert_eq!(sc.eat_token().1, lx!(Whitespace, " "));
        assert_eq!(sc.eat_token().1, lx!(BraceOpen, "{"));
        assert_eq!(sc.eat_token().1, lx!(BraceClose, "}"));
    }

    #[test]
    fn test_peek_whitespace() {
        let mut sc = Scanner::new("h3 h4#h5.h6 {}");
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h3"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h4"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Hash, "#"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h5"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Dot, "."));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(Id, "h6"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(BraceOpen, "{"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(BraceClose, "}"));
    }

    #[test]
    fn test_whitespace_mix() {
        let mut sc = Scanner::new("h3 h4#h5.h6 {}");
        assert_eq!(sc.eat_token().1, lx!(Id, "h3"));
        assert_eq!(sc.eat_token().1, lx!(Whitespace, " "));
        assert_eq!(sc.eat_token().1, lx!(Id, "h4"));
        assert_eq!(sc.eat_token().1, lx!(Hash, "#"));
        assert_eq!(sc.eat_token().1, lx!(Id, "h5"));
        assert_eq!(sc.eat_token().1, lx!(Dot, "."));
        assert_eq!(sc.eat_token().1, lx!(Id, "h6"));
        sc.peek_non_whitespace();
        assert_eq!(sc.eat_token().1, lx!(BraceOpen, "{"));
        assert_eq!(sc.eat_token().1, lx!(BraceClose, "}"));
    }

    #[test]
    fn test_comments() {
        let mut sc = Scanner::new(
            r"// Hello! This is a comment!
            b: a // and another! {
            {
                // } don't be fooled!
            }",
        );

        assert_eq!(sc.peek_non_whitespace().1, lx!(Id, "b"));
        sc.eat_token();
        assert_eq!(sc.peek_non_whitespace().1, lx!(Colon, ":"));
        sc.eat_token();
        assert_eq!(sc.peek_non_whitespace().1, lx!(Id, "a"));
        sc.eat_token();
        assert_eq!(sc.peek_non_whitespace().1, lx!(BraceOpen, "{"));
        sc.eat_token();
        assert_eq!(sc.eat_token().1.token, Token::Whitespace);
        assert_eq!(sc.eat_token().1, lx!(Comment, "// } don't be fooled!"));
        assert_eq!(sc.peek_non_whitespace().1, lx!(BraceClose, "}"));
        sc.eat_token();
        assert_eq!(sc.eat_token().1.token, Token::Eof);
    }

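    // Forcing both LazyLocks compiles every pattern eagerly, so an invalid
    // regex fails this test instead of panicking at first use.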
    #[test]
    fn all_regex_is_valid() {
        let _ = &*REGEX_SET;
        let _ = &*REGEX_LIST;
    }
}