1use lexgen::lexer;
2use thiserror::Error;
3
/// Source location type, re-exported from `lexgen_util`.
pub type Loc = lexgen_util::Loc;
/// Error type yielded by the lexer iterator, carrying [`LexicalError`]
/// as the custom error kind.
pub type LexerError = lexgen_util::LexerError<LexicalError>;
6
/// Language keywords recognized by the lexer.
///
/// `TryNoCatch` never comes directly from source text: the token
/// post-processing pass rewrites a `Try` keyword into `TryNoCatch` when no
/// matching `catch` follows it.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum Keyword {
    Or,
    And,
    Module,
    Import,
    Include,
    Def,
    As,
    Label,
    Break,
    Null,
    False,
    True,
    If,
    Then,
    Elif,
    Else,
    End,
    Try,
    // Synthesized variant of `Try` for a `try` with no `catch` clause.
    TryNoCatch,
    Catch,
    Reduce,
    Foreach,
}
impl Keyword {
    /// Returns the keyword as it is spelled in source text.
    ///
    /// Both `Try` and `TryNoCatch` intentionally render as `"try"`, since
    /// `TryNoCatch` is a post-processing rewrite of the same source keyword.
    #[must_use]
    pub const fn to_str(&self) -> &'static str {
        match self {
            Self::Or => "or",
            Self::And => "and",
            Self::Module => "module",
            Self::Import => "import",
            Self::Include => "include",
            Self::Def => "def",
            Self::As => "as",
            Self::Label => "label",
            Self::Break => "break",
            Self::Null => "null",
            Self::False => "false",
            Self::True => "true",
            Self::If => "if",
            Self::Then => "then",
            Self::Elif => "elif",
            Self::Else => "else",
            Self::End => "end",
            Self::Try => "try",
            Self::TryNoCatch => "try",
            Self::Catch => "catch",
            Self::Reduce => "reduce",
            Self::Foreach => "foreach",
        }
    }
}
61
/// One piece of a string literal: either a raw slice of the input or a
/// single character decoded from an escape sequence.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum StringFragment<'input> {
    /// A run of plain (unescaped) characters, borrowed from the input.
    String(&'input str),
    /// A single character produced by decoding an escape such as `\n` or `\uXXXX`.
    Char(char),
}
/// Tokens produced by the lexer. Borrowing variants reference slices of the
/// original input string.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum Token<'input> {
    // Arithmetic operators.
    Plus,
    Minus,
    Star,
    Slash,
    Percent,

    // Assignment / update operators.
    Eq,
    PlusEq,
    MinusEq,
    StarEq,
    SlashEq,
    PercentEq,
    SlashSlashEq,
    PipeEq,

    // Comparison operators.
    EqEq,
    NotEq,
    LtEq,
    GtEq,
    Lt,
    Gt,

    // Punctuation.
    Comma,
    Dot,
    Semicolon,
    Colon,
    DotDot,

    // Pipe / alternative operators.
    Pipe,
    Question,
    SlashSlash,
    QuestionSlashSlash,

    // Delimiters.
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,

    // String literal structure: a string is `StringStart`, then a mix of
    // fragments and `InterpolationStart .. InterpolationEnd` spans, then
    // `StringEnd`.
    StringStart,
    StringFragment(StringFragment<'input>),
    InterpolationStart,
    InterpolationEnd,
    StringEnd,

    // Words and literals.
    Keyword(Keyword),
    /// `.name` field access; the slice excludes the leading dot.
    Field(&'input str),
    Identifier(&'input str),
    /// `mod::name`-style identifier, slice includes the `::` separators.
    ModuleIdentifier(&'input str),
    /// `$name` variable; the slice excludes the leading `$`.
    Variable(&'input str),
    /// `$mod::name` variable; the slice excludes the leading `$`.
    ModuleVariable(&'input str),
    /// `@name` format; the slice excludes the leading `@`.
    Format(&'input str),
    Number(crate::Number),

    // Synthetic tokens injected by the `IntoIterator` post-processing pass
    // to mark the end of `def` / `label` / `as` binding scopes. They never
    // come directly from source text.
    DefScopeEnd,
    LabelScopeEnd,
    BindScopeEnd,
}
129
/// Errors produced while lexing, used as the custom error kind inside
/// [`LexerError`]. Display messages come from the `thiserror` attributes.
#[derive(Debug, Clone, Eq, PartialEq, Error)]
pub enum LexicalError {
    /// A closing delimiter did not match the innermost open one.
    #[error("Unmatching open {0:?} and close {1:?}")]
    UnmatchingOpenClose(OpenCloseType, OpenCloseType),
    /// A token arrived where a different one was required (expected, got).
    #[error("Expected `{0}` but got `{1}`")]
    UnexpectedToken(String, String),
    /// An unsupported `\x` escape inside a string literal.
    #[error("Encountered an unexpected escaped character `\\{0}`")]
    InvalidEscape(char),
    /// A closing token (e.g. `catch`, `)`) with nothing open to close.
    #[error("Expected token `{0}`")]
    OrphanToken(String),
    /// A closing delimiter with an empty delimiter stack.
    #[error("No matching open for close {0:?}")]
    TooManyClose(OpenCloseType),
    /// A `\uXXXX` (or surrogate pair) that decodes to an invalid scalar value.
    #[error("Invalid unicode scalar value: `{0}`")]
    InvalidUnicodeScalar(u32),
    /// A numeric literal that `crate::Number` failed to parse.
    #[error("Unable to parse number: `{0}`")]
    InvalidNumber(String),
    /// Catch-all internal error.
    // NOTE(review): not constructed anywhere in this file — presumably used
    // elsewhere in the crate; confirm before removing.
    #[error("Something went wrong")]
    InvalidState,
}
149
/// Nesting context tracked while post-processing the raw token stream in
/// `Lexer`'s `IntoIterator` implementation.
enum ContextType<'input> {
    /// Expects exactly this closing token (e.g. `RParen`, `end`, `as`, `;`).
    Balancing(Token<'input>),
    /// A scope that closes implicitly; emits the stored token when flushed.
    AutoCloseAndEmit(Token<'input>),
    /// A `try` keyword; stores its index in the output vector so the token
    /// can be rewritten to `TryNoCatch` if no matching `catch` arrives.
    Try(usize),
}
/// Tokenizer over a borrowed input string. Consume it via [`IntoIterator`]
/// to obtain the post-processed token stream.
pub struct Lexer<'input> {
    input: &'input str,
}
161
162impl<'input> Lexer<'input> {
163 #[must_use]
164 pub const fn new(input: &'input str) -> Self {
165 Self { input }
166 }
167}
168
impl<'input> IntoIterator for Lexer<'input> {
    type Item = Result<(Loc, Token<'input>, Loc), LexerError>;

    type IntoIter = <Vec<Self::Item> as IntoIterator>::IntoIter;

    /// Runs the generated `lexgen` lexer over the whole input, then
    /// post-processes the token stream: balances open/close pairs, injects
    /// the synthetic `DefScopeEnd` / `LabelScopeEnd` / `BindScopeEnd`
    /// tokens, and rewrites a `try` with no matching `catch` into
    /// `TryNoCatch`.
    fn into_iter(self) -> Self::IntoIter {
        // Post-processing state: the output token list plus a stack of the
        // currently open contexts (innermost last).
        #[derive(Default)]
        struct State<'input> {
            // Tokens (or errors) produced so far, in source order.
            ret: Vec<Result<(Loc, Token<'input>, Loc), LexerError>>,
            // Start location of the most recently handled token; used as
            // the location of synthetic tokens and EOF errors.
            pos: Loc,
            // Open contexts; the innermost is at the end.
            stack: Vec<ContextType<'input>>,
        }
        impl<'input> State<'input> {
            // Remembers the start location of the current token.
            fn track_pos(&mut self, pos: Loc) {
                self.pos = pos;
            }
            fn open(&mut self, ty: ContextType<'input>) {
                self.stack.push(ty);
            }
            // Opens a context that must be closed by exactly `token`.
            fn open_balancing(&mut self, token: Token<'input>) {
                self.open(ContextType::Balancing(token));
            }
            // Called for `catch`: pops contexts down to the nearest `Try`,
            // emitting pending auto-close tokens on the way. Crossing a
            // `Balancing` context, or finding no `Try` at all, is an error.
            fn close_to_try(&mut self) -> Result<(), LexicalError> {
                while let Some(item) = self.stack.last() {
                    match item {
                        ContextType::Balancing(token) => {
                            return Err(LexicalError::UnexpectedToken(
                                format!("{token:?}"),
                                "catch".to_string(),
                            ))
                        }
                        ContextType::AutoCloseAndEmit(term) => {
                            // Emit the scope-end token at the current
                            // position (zero-width: same start and end).
                            self.ret.push(Ok((self.pos, term.clone(), self.pos)));
                            self.stack.pop();
                        }
                        ContextType::Try(_) => {
                            self.stack.pop();
                            return Ok(());
                        }
                    }
                }
                Err(LexicalError::OrphanToken("catch".to_string()))
            }
            // Flushes every auto-closable context on top of the stack:
            // emits `AutoCloseAndEmit` tokens, and downgrades unmatched
            // `Try` keywords to `TryNoCatch` by patching the already-pushed
            // keyword token in `ret` in place (via the stored index).
            // Stops at the first `Balancing` context and returns its
            // expected closing token, or `None` if the stack empties.
            fn close_autoclose(&mut self) -> Option<Token<'input>> {
                while let Some(item) = self.stack.last() {
                    match item {
                        ContextType::Balancing(token) => return Some(token.clone()),
                        ContextType::AutoCloseAndEmit(term) => {
                            self.ret.push(Ok((self.pos, term.clone(), self.pos)));
                            self.stack.pop();
                        }
                        ContextType::Try(i) => {
                            if let Some(Ok((_, Token::Keyword(ref mut keyword), _))) =
                                self.ret.get_mut(*i).map(Result::as_mut)
                            {
                                // The token at index `i` must be the `try`
                                // keyword pushed when this context opened.
                                *keyword = Keyword::TryNoCatch;
                                self.stack.pop();
                            } else {
                                panic!("Something went wrong with parsing try catch");
                            }
                        }
                    }
                }
                None
            }
            // Handles an explicit closer (`)`, `}`, `]`, `end`): flushes
            // auto-close scopes, then requires the innermost `Balancing`
            // context to expect exactly `token`.
            fn close_balancing(&mut self, token: &Token<'input>) -> Result<(), LexicalError> {
                self.close_autoclose();
                match self.stack.last() {
                    Some(ContextType::Balancing(expected)) => {
                        if token == expected {
                            self.stack.pop();
                            Ok(())
                        } else {
                            Err(LexicalError::UnexpectedToken(
                                format!("{expected:?}"),
                                format!("{token:?}"),
                            ))
                        }
                    }
                    // close_autoclose() already drained these variants.
                    Some(ContextType::AutoCloseAndEmit(_)) => unreachable!(),
                    Some(ContextType::Try(_)) => unreachable!(),
                    None => Err(LexicalError::OrphanToken(format!("{token:?}"))),
                }
            }
            // Like `close_balancing`, but tolerant: used for separators
            // (`;`, `:`, `then`, ...) that may or may not close a context.
            // Returns whether a matching balancing context was popped.
            fn flush_or_close(&mut self, token: &Token<'input>) -> bool {
                self.close_autoclose();
                match self.stack.last() {
                    Some(ContextType::Balancing(expected)) => {
                        if token == expected {
                            self.stack.pop();
                            return true;
                        }
                    }
                    // close_autoclose() already drained these variants.
                    Some(ContextType::AutoCloseAndEmit(_)) => unreachable!(),
                    Some(ContextType::Try(_)) => unreachable!(),
                    None => {}
                }
                false
            }
            // Pops a matching balancing context WITHOUT first flushing
            // auto-close scopes. Used for `as` to tell the
            // `reduce`/`foreach` binder apart from a standalone binding.
            fn try_close_without_flush(&mut self, token: &Token<'input>) -> bool {
                if let Some(ContextType::Balancing(expected)) = self.stack.last() {
                    if token == expected {
                        self.stack.pop();
                        return true;
                    }
                }
                false
            }

            // Updates the context stack for one token. The token itself is
            // pushed to `ret` by the caller (`handle_item`).
            fn handle_token(&mut self, token: &Token<'input>) -> Result<(), LexicalError> {
                match token {
                    // Openers push the exact closer they expect.
                    Token::LParen => self.open_balancing(Token::RParen),
                    Token::LBrace => self.open_balancing(Token::RBrace),
                    Token::LBracket => self.open_balancing(Token::RBracket),
                    Token::Keyword(Keyword::If) => {
                        self.open_balancing(Token::Keyword(Keyword::End))
                    }
                    // Hard closers must match the innermost balancing context.
                    Token::RParen
                    | Token::RBrace
                    | Token::RBracket
                    | Token::Keyword(Keyword::End) => {
                        self.close_balancing(token)?;
                    }
                    // Soft separators: flush auto-close scopes; closing a
                    // balancing context is optional.
                    Token::Semicolon
                    | Token::Colon
                    | Token::Keyword(Keyword::Then | Keyword::Elif | Keyword::Else) => {
                        self.flush_or_close(token);
                    }
                    // `def name(params): body;` — the defined name stays in
                    // scope until `DefScopeEnd` is auto-emitted; the `:` and
                    // `;` contexts bracket the parameter list and body.
                    Token::Keyword(Keyword::Def) => {
                        self.open(ContextType::AutoCloseAndEmit(Token::DefScopeEnd));
                        self.open_balancing(Token::Semicolon);
                        self.open_balancing(Token::Colon);
                    }
                    // Remember where the `try` token landed so it can be
                    // rewritten to `TryNoCatch` later if needed.
                    Token::Keyword(Keyword::Try) => {
                        self.open(ContextType::Try(self.ret.len()));
                    }
                    Token::Keyword(Keyword::Catch) => {
                        self.close_to_try()?;
                    }
                    // `reduce EXPR as $x ...` / `foreach EXPR as $x ...`:
                    // expect the `as` to close this context.
                    Token::Keyword(Keyword::Reduce | Keyword::Foreach) => {
                        self.open_balancing(Token::Keyword(Keyword::As));
                    }
                    // `as` either closes a pending reduce/foreach binder or
                    // starts a standalone binding scope (`EXPR as $x | ...`).
                    Token::Keyword(Keyword::As) => {
                        if !self.try_close_without_flush(token) {
                            self.open(ContextType::AutoCloseAndEmit(Token::BindScopeEnd));
                        }
                    }
                    Token::Keyword(Keyword::Label) => {
                        self.open(ContextType::AutoCloseAndEmit(Token::LabelScopeEnd));
                    }
                    _ => {}
                }
                Ok(())
            }
            // Pushes one raw lexer item into `ret`, converting any stack
            // error into a `LexerError` located at the token's start.
            fn handle_item(&mut self, item: Result<(Loc, Token<'input>, Loc), LexerError>) {
                match item {
                    Ok((l, token, r)) => {
                        self.track_pos(l);
                        let to_push =
                            self.handle_token(&token)
                                .map(|_| (l, token, r))
                                .map_err(|e| LexerError {
                                    kind: lexgen_util::LexerErrorKind::Custom(e),
                                    location: l,
                                });
                        self.ret.push(to_push);
                    }
                    Err(_) => {
                        // Raw lexer errors pass through unchanged.
                        self.ret.push(item);
                    }
                }
            }
            // End of input: flush remaining auto-close scopes; any leftover
            // balancing context is an unclosed delimiter, reported at EOF.
            fn finish(mut self) -> Vec<Result<(Loc, Token<'input>, Loc), LexerError>> {
                if let Some(token) = self.close_autoclose() {
                    self.ret.push(Err(LexerError {
                        kind: lexgen_util::LexerErrorKind::Custom(LexicalError::UnexpectedToken(
                            format!("{token:?}"),
                            "EOF".to_string(),
                        )),
                        location: self.pos,
                    }));
                }
                self.ret
            }
        }

        let lexer = LexerImpl::new(self.input);
        let mut state = State::default();

        for item in lexer {
            state.handle_item(item);
        }
        state.finish().into_iter()
    }
}
365
/// Kinds of paired delimiters tracked by the raw lexer's [`LexerState`]
/// stack. Needed so that `)` can be disambiguated between closing a
/// parenthesis and ending a string interpolation.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum OpenCloseType {
    /// `( ... )`
    Parenthesis,
    /// `[ ... ]`
    Bracket,
    /// `{ ... }`
    Brace,
    /// `\( ... )` inside a string literal.
    Interpolation,
}
377
/// Per-run state of the generated lexer: a stack of currently open
/// delimiters (innermost last).
#[derive(Debug, Default)]
struct LexerState {
    stack: Vec<OpenCloseType>,
}
382impl LexerState {
383 fn open(&mut self, ty: OpenCloseType) {
384 self.stack.push(ty);
385 }
386 fn current_type(&self) -> Option<OpenCloseType> {
387 self.stack.last().cloned()
388 }
389 fn close(&mut self, ty: OpenCloseType) -> Result<(), LexicalError> {
390 if let Some(open) = self.current_type() {
391 if open == ty {
392 self.stack.pop();
393 Ok(())
394 } else {
395 Err(LexicalError::UnmatchingOpenClose(open, ty))
396 }
397 } else {
398 Err(LexicalError::TooManyClose(ty))
399 }
400 }
401}
402
/// Returns the matched word as a keyword token, except when the innermost
/// open delimiter is a brace, where it is returned as a plain identifier
/// instead.
// NOTE(review): presumably this lets keywords be used as bare object keys
// inside `{ ... }` — confirm against the parser's grammar.
macro_rules! handle_keyword {
    ($lexer: expr, $keyword: expr) => {
        if $lexer.state().current_type() == Some(OpenCloseType::Brace) {
            $lexer.return_(Token::Identifier($lexer.match_()))
        } else {
            $lexer.return_(Token::Keyword($keyword))
        }
    };
}
413
lexer! {
    // Generated lexer `LexerImpl`, threading a `LexerState` (delimiter
    // stack) through the rules and producing `Token`s; custom failures use
    // `LexicalError`.
    LexerImpl(LexerState) -> Token<'input>;
    type Error = LexicalError;

    // Character-class / fragment definitions.
    let ws = [' ' '\t' '\n'] | "\r\n";
    // `#` to end of line (comment text excludes CR/LF).
    let comment = '#' (_ # ['\r' '\n'])*;
    let ident_start = ['a'-'z' 'A'-'Z' '_'];
    let digit = ['0'-'9'];
    let hex_digit = $digit | ['a'-'f' 'A'-'F'];
    let ident_follow = $ident_start | $digit;

    // Main (non-string) lexing mode.
    rule Init {
        // Whitespace and comments are skipped (no token emitted).
        $ws,
        $comment,
        // Arithmetic operators.
        '+' = Token::Plus,
        '-' = Token::Minus,
        '*' = Token::Star,
        '/' = Token::Slash,
        '%' = Token::Percent,

        // Assignment / update operators.
        '=' = Token::Eq,
        "+=" = Token::PlusEq,
        "-=" = Token::MinusEq,
        "*=" = Token::StarEq,
        "/=" = Token::SlashEq,
        "%=" = Token::PercentEq,
        "//=" = Token::SlashSlashEq,
        "|=" = Token::PipeEq,

        // Comparison operators.
        "==" = Token::EqEq,
        "!=" = Token::NotEq,
        "<=" = Token::LtEq,
        ">=" = Token::GtEq,
        '<' = Token::Lt,
        '>' = Token::Gt,

        // Punctuation.
        ',' = Token::Comma,
        '.' = Token::Dot,
        ';' = Token::Semicolon,
        ':' = Token::Colon,
        ".." = Token::DotDot,

        // Pipe / alternative operators.
        '|' = Token::Pipe,
        '?' = Token::Question,
        "//" = Token::SlashSlash,
        "?//" = Token::QuestionSlashSlash,

        // Delimiters update the open/close stack as a side effect.
        '(' => |lexer| {
            lexer.state().open(OpenCloseType::Parenthesis);
            lexer.return_(Token::LParen)
        },
        ')' =? |lexer| {
            // `)` is ambiguous: it may end a string interpolation, in which
            // case lexing switches back to string mode.
            if lexer.state().current_type() == Some(OpenCloseType::Interpolation) {
                let token = lexer.state().close(OpenCloseType::Interpolation).map(|_| Token::InterpolationEnd);
                lexer.switch_and_return(LexerImplRule::InString, token)
            } else {
                let token = lexer.state().close(OpenCloseType::Parenthesis).map(|_| Token::RParen);
                lexer.return_(token)
            }
        },
        '{' => |lexer| {
            lexer.state().open(OpenCloseType::Brace);
            lexer.return_(Token::LBrace)
        },
        '}' =? |lexer| {
            let token = lexer.state().close(OpenCloseType::Brace).map(|_| Token::RBrace);
            lexer.return_(token)
        },
        '[' => |lexer| {
            lexer.state().open(OpenCloseType::Bracket);
            lexer.return_(Token::LBracket)
        },
        ']' =? |lexer| {
            let token = lexer.state().close(OpenCloseType::Bracket).map(|_| Token::RBracket);
            lexer.return_(token)
        },

        // Keywords; `handle_keyword!` demotes them to identifiers when the
        // innermost open delimiter is a brace.
        "or" => |lexer| handle_keyword!(lexer, Keyword::Or),
        "and" => |lexer| handle_keyword!(lexer, Keyword::And),
        "module" => |lexer| handle_keyword!(lexer, Keyword::Module),
        "import" => |lexer| handle_keyword!(lexer, Keyword::Import),
        "include" => |lexer| handle_keyword!(lexer, Keyword::Include),
        "def" => |lexer| handle_keyword!(lexer, Keyword::Def),
        "as" => |lexer| handle_keyword!(lexer, Keyword::As),
        "label" => |lexer| handle_keyword!(lexer, Keyword::Label),
        "break" => |lexer| handle_keyword!(lexer, Keyword::Break),
        "if" => |lexer| handle_keyword!(lexer, Keyword::If),
        "then" => |lexer| handle_keyword!(lexer, Keyword::Then),
        "elif" => |lexer| handle_keyword!(lexer, Keyword::Elif),
        "else" => |lexer| handle_keyword!(lexer, Keyword::Else),
        "end" => |lexer| handle_keyword!(lexer, Keyword::End),
        "try" => |lexer| handle_keyword!(lexer, Keyword::Try),
        "catch" => |lexer| handle_keyword!(lexer, Keyword::Catch),
        "reduce" => |lexer| handle_keyword!(lexer, Keyword::Reduce),
        "foreach" => |lexer| handle_keyword!(lexer, Keyword::Foreach),

        // Literal keywords, emitted unconditionally (no brace demotion).
        "null" = Token::Keyword(Keyword::Null),
        "false" = Token::Keyword(Keyword::False),
        "true" = Token::Keyword(Keyword::True),

        // `.name` field access; strip the leading dot.
        '.' $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Field(&lexer.match_()[1..]))
        },
        $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Identifier(lexer.match_()))
        },
        // `mod::name` (one or more `::` segments).
        $ident_start $ident_follow* ("::" $ident_start $ident_follow*)+ => |lexer| {
            lexer.return_(Token::ModuleIdentifier(lexer.match_()))
        },
        // `$name` variable; strip the leading `$`.
        '$' $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Variable(&lexer.match_()[1..]))
        },
        // `$mod::name`; strip the leading `$`.
        '$' $ident_start $ident_follow* ("::" $ident_start $ident_follow*)+ => |lexer| {
            lexer.return_(Token::ModuleVariable(&lexer.match_()[1..]))
        },
        // `@format`; strip the leading `@`.
        '@' $ident_start $ident_follow* => |lexer| {
            lexer.return_(Token::Format(&lexer.match_()[1..]))
        },
        // Numbers: integer, `1.`, `.5`, or mixed, with optional exponent.
        ($digit+ | $digit+ '.' $digit* | $digit* '.' $digit+) (['e' 'E'] (['+' '-']? $digit+))? =? |lexer| {
            use std::str::FromStr;
            let parsed = crate::Number::from_str(lexer.match_())
                .map_err(|_| LexicalError::InvalidNumber(lexer.match_().to_string()))
                .map(Token::Number);
            lexer.return_(parsed)
        },
        // Opening quote switches to string mode.
        '"' => |lexer| {
            lexer.switch_and_return(LexerImplRule::InString, Token::StringStart)
        },
    }
    // Inside a `"..."` string literal.
    rule InString {
        // Single-character escapes.
        "\\n" = Token::StringFragment(StringFragment::Char('\n')),
        "\\r" = Token::StringFragment(StringFragment::Char('\r')),
        "\\t" = Token::StringFragment(StringFragment::Char('\t')),
        "\\b" = Token::StringFragment(StringFragment::Char('\u{08}')),
        "\\f" = Token::StringFragment(StringFragment::Char('\u{0C}')),
        "\\\\" = Token::StringFragment(StringFragment::Char('\\')),
        "\\/" = Token::StringFragment(StringFragment::Char('/')),
        "\\\"" = Token::StringFragment(StringFragment::Char('"')),
        // UTF-16 surrogate pair: the first pattern only matches high
        // surrogates (0xD800-0xDBFF), the second only low surrogates
        // (0xDC00-0xDFFF), hence the asserts cannot fire.
        "\\u" ['d' 'D'] ['8' '9' 'a' 'b' 'A' 'B'] $hex_digit $hex_digit "\\u" ['d' 'D'] ['c'-'f' 'C'-'F'] $hex_digit $hex_digit =? |lexer| {
            let higher_surrogate = u32::from_str_radix(&lexer.match_()[2..6], 16).unwrap();
            let lower_surrogate = u32::from_str_radix(&lexer.match_()[8..12], 16).unwrap();
            assert!((0xD800..0xDC00).contains(&higher_surrogate));
            assert!((0xDC00..=0xDFFF).contains(&lower_surrogate));
            // Standard surrogate-pair decoding into a supplementary-plane
            // scalar value (U+10000..=U+10FFFF).
            let value = (((higher_surrogate - 0xD800) as u32) << 10 | (lower_surrogate - 0xDC00) as u32) + 0x1_0000;
            match char::from_u32(value) {
                Some(c) => {
                    lexer.return_(Ok(Token::StringFragment(StringFragment::Char(c))))
                }
                None => lexer.return_(Err(LexicalError::InvalidUnicodeScalar(value)))
            }
        },
        // Plain `\uXXXX` escape; rejects surrogate code points via
        // `char::from_u32` returning `None`.
        "\\u" $hex_digit $hex_digit $hex_digit $hex_digit =? |lexer| {
            let value = u32::from_str_radix(&lexer.match_()[2..], 16).unwrap();
            match char::from_u32(value) {
                Some(c) => {
                    lexer.return_(Ok(Token::StringFragment(StringFragment::Char(c))))
                }
                None => lexer.return_(Err(LexicalError::InvalidUnicodeScalar(value)))
            }
        },
        // `\(` starts an interpolation: back to Init mode with a matching
        // Interpolation entry on the delimiter stack.
        "\\(" => |lexer| {
            lexer.state().open(OpenCloseType::Interpolation);
            lexer.switch_and_return(LexerImplRule::Init, Token::InterpolationStart)
        },
        // Closing quote returns to Init mode.
        '"' => |lexer| {
            lexer.switch_and_return(LexerImplRule::Init, Token::StringEnd)
        },
        // Any other escape is rejected.
        '\\' _ =? |lexer| {
            lexer.return_(Err(LexicalError::InvalidEscape(lexer.match_().chars().nth(1).unwrap())))
        },
        // A maximal run of ordinary characters (no backslash or quote).
        (_ # ['\\' '"'])+ => |lexer| {
            lexer.return_(Token::StringFragment(StringFragment::String(lexer.match_())))
        },
    }
}
592
#[cfg(test)]
mod test {
    use super::{Lexer, StringFragment, Token};

    // Shorthand for a plain (unescaped) string fragment token.
    fn string_fragment(s: &str) -> Token {
        Token::StringFragment(StringFragment::String(s))
    }
    // Lexes `q` and compares the token kinds (locations ignored); panics on
    // any lexer error.
    fn assert_lex(q: &str, expected_tokens: &[Token]) {
        let tokens: Vec<_> = Lexer::new(q)
            .into_iter()
            .map(Result::unwrap)
            .map(|(_, token, _)| token)
            .collect();
        assert_eq!(&tokens[..], expected_tokens);
    }

    #[test]
    fn test_ident() {
        assert_lex(r#"abc"#, &[Token::Identifier("abc")]);
        assert_lex(r#"abc12"#, &[Token::Identifier("abc12")]);
        assert_lex(
            r#"ab ab12"#,
            &[Token::Identifier("ab"), Token::Identifier("ab12")],
        );
        assert_lex(
            r#"ab_ ab_12"#,
            &[Token::Identifier("ab_"), Token::Identifier("ab_12")],
        );
    }

    #[test]
    fn test_string() {
        assert_lex(
            r#""abc""#,
            &[Token::StringStart, string_fragment("abc"), Token::StringEnd],
        );
    }

    // Nested interpolations: `)` must close interpolations and parentheses
    // correctly depending on the delimiter stack.
    #[test]
    fn test_string_interpolation() {
        assert_lex(
            r#"(ab"(\(a"\()")))")"#,
            &[
                Token::LParen,
                Token::Identifier("ab"),
                Token::StringStart,
                string_fragment("("),
                Token::InterpolationStart,
                Token::Identifier("a"),
                Token::StringStart,
                Token::InterpolationStart,
                Token::InterpolationEnd,
                Token::StringEnd,
                Token::InterpolationEnd,
                string_fragment("))"),
                Token::StringEnd,
                Token::RParen,
            ],
        );
    }

    // Covers integer, exponent, trailing-dot and leading-dot forms.
    #[test]
    fn test_number() {
        assert_lex(
            r#"2 12 1e3 1.5 .2 .3e-1"#,
            &[
                Token::Number(2.0.into()),
                Token::Number(12.0.into()),
                Token::Number(1000.0.into()),
                Token::Number(1.5.into()),
                Token::Number(0.2.into()),
                Token::Number(0.03.into()),
            ],
        );
    }

    // Comments are skipped entirely, including inside interpolations.
    #[test]
    fn test_comment() {
        assert_lex(
            r#""\(
            1# This
            + # is
            2 #
            )" # comment"#,
            &[
                Token::StringStart,
                Token::InterpolationStart,
                Token::Number(1.0.into()),
                Token::Plus,
                Token::Number(2.0.into()),
                Token::InterpolationEnd,
                Token::StringEnd,
            ],
        );
    }
}