1use crate::token::{Token, TokenKind, lookup_functions, lookup_keyword};
2
/// Category of a lexical error recorded while scanning the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A byte the lexer has no rule for (e.g. `@`).
    UnsupportedCharacter,
    /// A single `&` not followed by a second `&` (only `&&` is valid).
    LoneAmpersand,
    /// A `\` not followed by `\n` or `\r\n` (line continuation).
    InvalidLineContinuation,
    /// A numeric literal that could not be scanned (no digits at all,
    /// or a hex literal whose value does not fit in a `u64`).
    InvalidNumber,
    /// A string literal with no closing `"` before end of input.
    UnterminatedString,
    /// A regex literal with no closing `/` before a newline or end of input.
    UnterminatedRegex,
}
12
/// A diagnostic recorded when the lexer encounters invalid input.
///
/// The lexer does not abort on errors: it emits an `Illegal` token and
/// pushes one of these onto its error list so scanning can continue.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexError<'a> {
    // What went wrong.
    pub kind: LexErrorKind,
    // The offending slice of the source input.
    pub literal: &'a str,
    // Byte offset into the input where the offending text starts.
    pub start: usize,
}
19
/// A byte-oriented lexer over a source string.
///
/// Tokens borrow their literals from `input`; errors are accumulated in
/// `errors` rather than aborting the scan.
#[derive(Debug)]
pub struct Lexer<'a> {
    // The full source text being scanned.
    input: &'a str,
    // Byte index of the byte currently held in `ch`.
    position: usize,
    // Byte index of the next byte to load (normally `position + 1`).
    read_position: usize,
    // Current byte under examination; `None` once past the end of input.
    ch: Option<u8>,
    // Diagnostics recorded for illegal tokens, in discovery order.
    errors: Vec<LexError<'a>>,
}
28
impl<'a> Lexer<'a> {
    /// Creates a lexer over `src`, primed so that `ch` holds the first
    /// byte (or `None` for empty input).
    pub fn new(src: &'a str) -> Self {
        let mut lexer = Lexer {
            input: src,
            position: 0,
            read_position: 0,
            ch: None,
            errors: Vec::new(),
        };

        // Load the first byte so scanning can start immediately.
        lexer.read_char();
        lexer
    }

    /// Returns the next token; a leading `/` is division, never a regex.
    pub fn next_token(&mut self) -> Token<'a> {
        self.next_token_impl(false)
    }

    /// Returns the next token; a leading `/` starts a regex literal.
    /// The caller (parser) decides when a regex is grammatically possible.
    pub fn next_token_regex_aware(&mut self) -> Token<'a> {
        self.next_token_impl(true)
    }

    /// Diagnostics recorded so far, in discovery order.
    pub fn errors(&self) -> &[LexError<'a>] {
        &self.errors
    }

    /// Whether any diagnostic has been recorded.
    pub fn has_errors(&self) -> bool {
        !self.errors.is_empty()
    }

    /// Core tokenizer shared by `next_token` / `next_token_regex_aware`.
    ///
    /// Invariant: every arm leaves `self.ch` on the *last* byte of the
    /// token it produced; the single `read_char()` at the bottom then
    /// steps past it. The multi-byte readers (`read_identifier`,
    /// `read_number`, `read_string`, `read_regex`) uphold this by
    /// rewinding one byte before returning. The `Eof` arm returns early
    /// (skipping that `read_char`), so repeated calls keep yielding `Eof`.
    fn next_token_impl(&mut self, allow_regex: bool) -> Token<'a> {
        self.skip_whitespace();
        self.skip_comment();

        let start = self.position;
        let token = match self.ch {
            Some(b'{') => Token::new(TokenKind::LeftCurlyBrace, "{", start),
            Some(b'}') => Token::new(TokenKind::RightCurlyBrace, "}", start),
            Some(b'(') => Token::new(TokenKind::LeftParen, "(", start),
            Some(b')') => Token::new(TokenKind::RightParen, ")", start),
            Some(b'[') => Token::new(TokenKind::LeftSquareBracket, "[", start),
            Some(b']') => Token::new(TokenKind::RightSquareBracket, "]", start),
            Some(b',') => Token::new(TokenKind::Comma, ",", start),
            Some(b';') => Token::new(TokenKind::Semicolon, ";", start),
            Some(b'\n') => Token::new(TokenKind::NewLine, "<newline>", start),
            Some(b'+') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::AddAssign, "+=", start)
                } else if self.peek_char() == Some(b'+') {
                    self.read_char();
                    Token::new(TokenKind::Increment, "++", start)
                } else {
                    Token::new(TokenKind::Plus, "+", start)
                }
            }
            Some(b'-') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::SubtractAssign, "-=", start)
                } else if self.peek_char() == Some(b'-') {
                    self.read_char();
                    Token::new(TokenKind::Decrement, "--", start)
                } else {
                    Token::new(TokenKind::Minus, "-", start)
                }
            }
            // `**` is an alternate spelling of the power operator:
            // `**` maps to Caret and `**=` to PowerAssign, same as `^`/`^=`.
            Some(b'*') => {
                if self.peek_char() == Some(b'*') {
                    if self.peek_next_char() == Some(b'=') {
                        self.read_char();
                        self.read_char();
                        Token::new(TokenKind::PowerAssign, "**=", start)
                    } else {
                        self.read_char();
                        Token::new(TokenKind::Caret, "**", start)
                    }
                } else if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::MultiplyAssign, "*=", start)
                } else {
                    Token::new(TokenKind::Asterisk, "*", start)
                }
            }
            Some(b'%') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::ModuloAssign, "%=", start)
                } else {
                    Token::new(TokenKind::Percent, "%", start)
                }
            }
            Some(b'^') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::PowerAssign, "^=", start)
                } else {
                    Token::new(TokenKind::Caret, "^", start)
                }
            }
            Some(b'!') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::NotEqual, "!=", start)
                } else if self.peek_char() == Some(b'~') {
                    self.read_char();
                    Token::new(TokenKind::NoMatch, "!~", start)
                } else {
                    Token::new(TokenKind::ExclamationMark, "!", start)
                }
            }
            Some(b'>') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::GreaterThanOrEqual, ">=", start)
                } else if self.peek_char() == Some(b'>') {
                    self.read_char();
                    Token::new(TokenKind::Append, ">>", start)
                } else {
                    Token::new(TokenKind::GreaterThan, ">", start)
                }
            }
            Some(b'<') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::LessThanOrEqual, "<=", start)
                } else {
                    Token::new(TokenKind::LessThan, "<", start)
                }
            }
            Some(b'|') => {
                if self.peek_char() == Some(b'|') {
                    self.read_char();
                    Token::new(TokenKind::Or, "||", start)
                } else {
                    Token::new(TokenKind::Pipe, "|", start)
                }
            }
            Some(b'?') => Token::new(TokenKind::QuestionMark, "?", start),
            Some(b':') => Token::new(TokenKind::Colon, ":", start),
            Some(b'~') => Token::new(TokenKind::Tilde, "~", start),
            Some(b'$') => Token::new(TokenKind::DollarSign, "$", start),
            Some(b'=') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::Equal, "==", start)
                } else {
                    Token::new(TokenKind::Assign, "=", start)
                }
            }
            Some(b'/') => {
                if allow_regex {
                    self.read_regex()
                } else if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::DivideAssign, "/=", start)
                } else {
                    Token::new(TokenKind::Division, "/", start)
                }
            }
            // A lone `&` is illegal: only `&&` exists.
            Some(b'&') => {
                if self.peek_char() == Some(b'&') {
                    self.read_char();
                    Token::new(TokenKind::And, "&&", start)
                } else {
                    self.illegal_token(
                        LexErrorKind::LoneAmpersand,
                        start,
                        "<illegal>",
                        &self.input[start..self.read_position],
                    )
                }
            }
            // Line continuation: `\` followed by LF or CRLF collapses to a
            // NewLine token. A backslash followed by anything else is illegal.
            Some(b'\\') => {
                if self.peek_char() == Some(b'\n') {
                    self.read_char();
                    Token::new(TokenKind::NewLine, "<newline>", start)
                } else if self.peek_char() == Some(b'\r') && self.peek_next_char() == Some(b'\n') {
                    self.read_char();
                    self.read_char();
                    Token::new(TokenKind::NewLine, "<newline>", start)
                } else {
                    self.illegal_token(
                        LexErrorKind::InvalidLineContinuation,
                        start,
                        "<illegal>",
                        &self.input[start..self.read_position],
                    )
                }
            }
            Some(b'"') => self.read_string(),
            ch if is_ascii_alphabetic(ch) || ch == Some(b'_') => self.read_identifier(),
            ch if is_digit(ch) => self.read_number(),
            // A dot followed by a digit starts a number like `.75`;
            // a bare dot falls through to the unsupported-character arm.
            Some(b'.')
                if self
                    .peek_char()
                    .is_some_and(|arg0: u8| is_digit(Some(arg0))) =>
            {
                self.read_number()
            }
            // Return early: don't advance, so Eof is yielded repeatedly.
            None => return Token::new(TokenKind::Eof, "", start),
            // Any other byte is unsupported.
            _ => self.illegal_token(
                LexErrorKind::UnsupportedCharacter,
                start,
                "<illegal>",
                &self.input[start..self.read_position],
            ),
        };

        // Step past the last byte of the token just produced.
        self.read_char();

        token
    }

    /// Advances one byte: `ch` becomes the byte at `read_position` (or
    /// `None` past the end), `position` catches up to it, and
    /// `read_position` moves one further.
    fn read_char(&mut self) {
        if self.read_position >= self.input.len() {
            self.ch = None;
        } else {
            self.ch = Some(self.input.as_bytes()[self.read_position]);
        }
        self.position = self.read_position;
        self.read_position += 1;
    }

    /// Reads a word starting at `self.position` (the caller guarantees the
    /// first byte is a letter or `_`), consuming `[A-Za-z0-9_]` bytes, then
    /// classifies it as a keyword, a built-in function, or a plain
    /// identifier. Rewinds one byte per `next_token_impl`'s invariant.
    fn read_identifier(&mut self) -> Token<'a> {
        let position = self.position;
        while is_ascii_alphabetic(self.ch) || is_digit(self.ch) || self.ch == Some(b'_') {
            self.read_char();
        }
        let literal = &self.input[position..self.position];

        let token = if let Some(token_kind) = lookup_keyword(literal) {
            Token::new(token_kind, literal, position)
        } else if let Some(token_kind) = lookup_functions(literal) {
            Token::new(token_kind, literal, position)
        } else {
            Token::new(TokenKind::Identifier, literal, position)
        };

        self.rewind_one();
        token
    }

    /// Reads a numeric literal starting at `self.ch`: decimal integers,
    /// decimals with a fractional part (a leading `.` is allowed), hex
    /// (`0x…`), and scientific notation (`1E2`, `12e-2`, `.75e+1`).
    ///
    /// On success, leaves `self.ch` on the literal's last byte (see
    /// `next_token_impl`'s invariant).
    fn read_number(&mut self) -> Token<'a> {
        let position = self.position;
        // Set once at least one digit is consumed; a "number" with no
        // digits at all (e.g. a bare `.`) is reported as invalid.
        let mut got_digit = false;

        // `0x`/`0X` with no hex digit after it: the `0` alone is the
        // number. Return *without* advancing, so the `x…` is re-scanned
        // as an identifier by the next call.
        if self.ch == Some(b'0')
            && matches!(self.peek_char(), Some(b'x') | Some(b'X'))
            && !is_hex_digit(self.peek_next_char())
        {
            let literal = &self.input[position..position + 1];
            return Token::new(TokenKind::Number, literal, position);
        }

        if self.ch != Some(b'.') {
            got_digit = true;

            // Hexadecimal literal: consume `0x` plus all hex digits.
            if self.ch == Some(b'0')
                && matches!(self.peek_char(), Some(b'x') | Some(b'X'))
                && is_hex_digit(self.peek_next_char())
            {
                self.read_char();
                self.read_char();
                while matches!(
                    self.ch,
                    Some(b'0'..=b'9') | Some(b'a'..=b'f') | Some(b'A'..=b'F')
                ) {
                    self.read_char();
                }

                let literal = &self.input[position..self.position];
                // Validate by parsing the digits as u64; a value that
                // overflows u64 is reported as an invalid number.
                match u64::from_str_radix(&literal[2..], 16) {
                    Ok(_) => {
                        let token = Token::new(TokenKind::Number, literal, position);
                        self.rewind_one();
                        return token;
                    }
                    Err(_) => {
                        // Give back the byte following the literal (if
                        // any) so it is re-scanned as the next token.
                        if self.ch.is_some() {
                            self.rewind_one();
                        }
                        return self.illegal_token(
                            LexErrorKind::InvalidNumber,
                            position,
                            "<illegal>",
                            literal,
                        );
                    }
                }
            }
            // Integer part.
            while is_digit(self.ch) {
                self.read_char();
            }

            if self.ch == Some(b'.') {
                self.read_char();
            }
        } else {
            // Literal starts with `.` (e.g. `.75`): skip the dot; the
            // fractional-part loop below must then supply the digits.
            self.read_char();
        }

        // Fractional part (also the whole number when it began with `.`).
        while is_digit(self.ch) {
            got_digit = true;

            self.read_char();
        }

        // Optional exponent: only consumed when a digit actually follows
        // `e`/`E` (optionally after a sign); otherwise the `e` is left
        // in place to be scanned as the start of the next token.
        if matches!(self.ch, Some(b'e') | Some(b'E')) {
            let exponent_sign = self.peek_char();
            let exponent_digit = if matches!(exponent_sign, Some(b'+') | Some(b'-')) {
                self.peek_next_char()
            } else {
                exponent_sign
            };

            if is_digit(exponent_digit) {
                self.read_char();
                if matches!(self.ch, Some(b'+') | Some(b'-')) {
                    self.read_char();
                }
                while is_digit(self.ch) {
                    self.read_char();
                }
            }
        }

        if !got_digit {
            let literal = &self.input[position..self.position];
            return self.illegal_token(LexErrorKind::InvalidNumber, position, "<illegal>", literal);
        }

        let literal = &self.input[position..self.position];

        let token = Token::new(TokenKind::Number, literal, position);
        self.rewind_one();
        token
    }

    /// Reads a string literal. Called with `self.ch` on the opening `"`;
    /// the returned literal excludes both quotes.
    ///
    /// A backslash escapes the following byte (so `\"` does not end the
    /// string). If the input ends before a closing quote, an
    /// `UnterminatedString` diagnostic is recorded and the `Illegal`
    /// token carries the partial contents as its literal.
    fn read_string(&mut self) -> Token<'a> {
        // Skip the opening quote; `position` marks the first content byte.
        self.read_char();
        let position = self.position;
        // True when the current byte is preceded by an unescaped backslash.
        let mut escaped = false;

        while let Some(ch) = self.ch {
            if !escaped && ch == b'"' {
                break;
            }
            escaped = !escaped && ch == b'\\';
            self.read_char();
        }

        let literal = &self.input[position..self.position];

        if self.ch != Some(b'"') {
            return self.illegal_token(
                LexErrorKind::UnterminatedString,
                position,
                literal,
                literal,
            );
        };

        Token::new(TokenKind::String, literal, position)
    }

    /// Reads a regex literal. Called with `self.ch` on the opening `/`;
    /// the returned literal excludes both delimiters.
    ///
    /// A backslash escapes the following byte (so `\/` does not end the
    /// regex). A raw newline or end of input before the closing `/`
    /// records an `UnterminatedRegex` diagnostic; on a newline the lexer
    /// rewinds one byte so the `\n` is still emitted as a NewLine token.
    fn read_regex(&mut self) -> Token<'a> {
        // Skip the opening slash; `position` marks the first content byte.
        self.read_char();
        let position = self.position;
        // True when the current byte is preceded by an unescaped backslash.
        let mut escaped = false;

        while let Some(ch) = self.ch {
            if !escaped && ch == b'/' {
                break;
            }
            if !escaped && ch == b'\n' {
                break;
            }
            escaped = !escaped && ch == b'\\';

            self.read_char();
        }

        let literal = &self.input[position..self.position];

        if self.ch != Some(b'/') {
            if self.ch == Some(b'\n') {
                self.rewind_one();
            }
            return self.illegal_token(LexErrorKind::UnterminatedRegex, position, literal, literal);
        }

        Token::new(TokenKind::Regex, literal, position)
    }

    /// Consumes spaces, tabs, and carriage returns. `\n` is not skipped —
    /// it becomes a NewLine token.
    fn skip_whitespace(&mut self) {
        while is_whitespace(self.ch) {
            self.read_char();
        }
    }

    /// If positioned on `#`, consumes bytes up to (but not including) the
    /// terminating newline or end of input.
    fn skip_comment(&mut self) {
        if Some(b'#') == self.ch {
            while self.ch != Some(b'\n') && self.ch.is_some() {
                self.read_char();
            }
        }
    }

    /// The byte `read_char` would load next, without advancing.
    fn peek_char(&self) -> Option<u8> {
        if self.read_position >= self.input.len() {
            None
        } else {
            Some(self.input.as_bytes()[self.read_position])
        }
    }

    /// The byte one past `peek_char`, without advancing.
    fn peek_next_char(&self) -> Option<u8> {
        let next = self.read_position + 1;
        if next >= self.input.len() {
            None
        } else {
            Some(self.input.as_bytes()[next])
        }
    }

    /// Steps back one byte: afterwards `ch` holds the previous byte and
    /// the next `read_char` re-loads the byte that was current. The
    /// multi-byte readers use this to compensate for `next_token_impl`'s
    /// unconditional trailing `read_char`. No-op when already at the
    /// first byte.
    fn rewind_one(&mut self) {
        if self.position == 0 {
            return;
        }
        self.read_position = self.position;
        self.position -= 1;
        self.ch = Some(self.input.as_bytes()[self.position]);
    }

    /// Records a diagnostic (`kind`, `diagnostic_literal`, `start`) and
    /// returns an `Illegal` token carrying `token_literal`. The two
    /// literals differ when the token should surface a placeholder
    /// (`"<illegal>"`) while the diagnostic keeps the offending text.
    fn illegal_token(
        &mut self,
        kind: LexErrorKind,
        start: usize,
        token_literal: &'a str,
        diagnostic_literal: &'a str,
    ) -> Token<'a> {
        self.errors.push(LexError {
            kind,
            literal: diagnostic_literal,
            start,
        });
        Token::new(TokenKind::Illegal, token_literal, start)
    }
}
486
/// True when `ch` holds an ASCII letter (`a-z` / `A-Z`); false for any
/// other byte and for `None` (end of input).
fn is_ascii_alphabetic(ch: Option<u8>) -> bool {
    ch.is_some_and(|byte| byte.is_ascii_alphabetic())
}
493
/// True for the inter-token whitespace bytes the lexer skips: space,
/// tab, and carriage return. `\n` is deliberately excluded — the lexer
/// emits it as a NewLine token. `None` (end of input) is not whitespace.
fn is_whitespace(ch: Option<u8>) -> bool {
    matches!(ch, Some(b' ' | b'\t' | b'\r'))
}
500
/// True when `ch` holds an ASCII decimal digit (`0-9`); false otherwise
/// and for `None` (end of input).
fn is_digit(ch: Option<u8>) -> bool {
    matches!(ch, Some(b'0'..=b'9'))
}
507
/// True when `ch` holds an ASCII hexadecimal digit (`0-9`, `a-f`,
/// `A-F`); false otherwise and for `None` (end of input).
fn is_hex_digit(ch: Option<u8>) -> bool {
    ch.is_some_and(|byte| byte.is_ascii_hexdigit())
}
514
515#[cfg(test)]
516mod tests {
517 use super::*;
518
519 fn assert_token(token: Token<'_>, kind: TokenKind, literal: &str) {
520 assert_eq!(kind, token.kind);
521 assert_eq!(literal, token.literal);
522 }
523
524 fn assert_lex_error(error: LexError<'_>, kind: LexErrorKind, literal: &str, start: usize) {
525 assert_eq!(kind, error.kind);
526 assert_eq!(literal, error.literal);
527 assert_eq!(start, error.start);
528 }
529
530 #[test]
531 fn empty_input_returns_eof_token() {
532 let input = "";
533 let mut lexer = Lexer::new(input);
534
535 let token = lexer.next_token();
536
537 assert_token(token, TokenKind::Eof, "");
538 }
539
540 #[test]
541 fn next_left_curly_brace_token() {
542 let expected_token = Token::new(TokenKind::LeftCurlyBrace, "{", 0);
543 let input = "{";
544 let mut lexer = Lexer::new(input);
545
546 let token = lexer.next_token();
547
548 assert_eq!(expected_token, token);
549 }
550
551 #[test]
552 fn next_right_curly_brace_token() {
553 let input = "}";
554 let mut lexer = Lexer::new(input);
555
556 let token = lexer.next_token();
557
558 assert_token(token, TokenKind::RightCurlyBrace, "}");
559 }
560
561 #[test]
562 fn next_pipe_token() {
563 let input = "|";
564 let mut lexer = Lexer::new(input);
565
566 let token = lexer.next_token();
567
568 assert_token(token, TokenKind::Pipe, "|");
569 }
570
571 #[test]
572 fn next_one_character_token() {
573 let input = "{}()[],;\n+-*/%^!><|?:~$=";
574 let mut lexer = Lexer::new(input);
575 let expected_tokens = vec![
576 (TokenKind::LeftCurlyBrace, "{"),
577 (TokenKind::RightCurlyBrace, "}"),
578 (TokenKind::LeftParen, "("),
579 (TokenKind::RightParen, ")"),
580 (TokenKind::LeftSquareBracket, "["),
581 (TokenKind::RightSquareBracket, "]"),
582 (TokenKind::Comma, ","),
583 (TokenKind::Semicolon, ";"),
584 (TokenKind::NewLine, "<newline>"),
585 (TokenKind::Plus, "+"),
586 (TokenKind::Minus, "-"),
587 (TokenKind::Asterisk, "*"),
588 (TokenKind::Division, "/"),
589 (TokenKind::Percent, "%"),
590 (TokenKind::Caret, "^"),
591 (TokenKind::ExclamationMark, "!"),
592 (TokenKind::GreaterThan, ">"),
593 (TokenKind::LessThan, "<"),
594 (TokenKind::Pipe, "|"),
595 (TokenKind::QuestionMark, "?"),
596 (TokenKind::Colon, ":"),
597 (TokenKind::Tilde, "~"),
598 (TokenKind::DollarSign, "$"),
599 (TokenKind::Assign, "="),
600 (TokenKind::Eof, ""),
601 ];
602
603 for (expected_kind, expected_literal) in expected_tokens {
604 let token = lexer.next_token();
605 assert_token(token, expected_kind, expected_literal);
606 }
607 }
608
609 #[test]
610 fn next_while_token() {
611 let expected_token = Token::new(TokenKind::While, "while", 1);
612 let input = " while";
613 let mut lexer = Lexer::new(input);
614
615 let token = lexer.next_token();
616
617 assert_eq!(expected_token, token);
618 }
619
620 #[test]
621 fn next_identifier_token() {
622 let input = "BEGIN END break continue delete do else exit for function if in next print printf return while";
623 let mut lexer = Lexer::new(input);
624
625 let expected_tokens = vec![
626 (TokenKind::Begin, "BEGIN"),
627 (TokenKind::End, "END"),
628 (TokenKind::Break, "break"),
629 (TokenKind::Continue, "continue"),
630 (TokenKind::Delete, "delete"),
631 (TokenKind::Do, "do"),
632 (TokenKind::Else, "else"),
633 (TokenKind::Exit, "exit"),
634 (TokenKind::For, "for"),
635 (TokenKind::Function, "function"),
636 (TokenKind::If, "if"),
637 (TokenKind::In, "in"),
638 (TokenKind::Next, "next"),
639 (TokenKind::Print, "print"),
640 (TokenKind::Printf, "printf"),
641 (TokenKind::Return, "return"),
642 (TokenKind::While, "while"),
643 (TokenKind::Eof, ""),
644 ];
645
646 for (expected_kind, expected_literal) in expected_tokens {
647 let token = lexer.next_token();
648 assert_token(token, expected_kind, expected_literal);
649 }
650 }
651
652 #[test]
653 fn next_number_token() {
654 let input = "123 4567 890 42.0 .75 0.001";
655 let mut lexer = Lexer::new(input);
656
657 let expected_tokens = vec![
658 (TokenKind::Number, "123"),
659 (TokenKind::Number, "4567"),
660 (TokenKind::Number, "890"),
661 (TokenKind::Number, "42.0"),
662 (TokenKind::Number, ".75"),
663 (TokenKind::Number, "0.001"),
664 (TokenKind::Eof, ""),
665 ];
666
667 for (expected_kind, expected_literal) in expected_tokens {
668 let token = lexer.next_token();
669 assert_token(token, expected_kind, expected_literal);
670 }
671 }
672
673 #[test]
674 fn hex_number_token() {
675 let input = "0xAA 0xaa 0xFEED 0xBEAF";
676 let mut lexer = Lexer::new(input);
677
678 let expected_tokens = vec![
679 (TokenKind::Number, "0xAA"),
680 (TokenKind::Number, "0xaa"),
681 (TokenKind::Number, "0xFEED"),
682 (TokenKind::Number, "0xBEAF"),
683 (TokenKind::Eof, ""),
684 ];
685
686 for (expected_kind, expected_literal) in expected_tokens {
687 let token = lexer.next_token();
688 assert_token(token, expected_kind, expected_literal);
689 }
690 }
691
692 #[test]
693 fn invalid_hex_number_token() {
694 let input = "0xG1 5x03";
695 let mut lexer = Lexer::new(input);
696
697 let expected_tokens = vec![
698 (TokenKind::Number, "0"),
699 (TokenKind::Identifier, "xG1"),
700 (TokenKind::Number, "5"),
701 (TokenKind::Identifier, "x03"),
702 (TokenKind::Eof, ""),
703 ];
704
705 for (expected_kind, expected_literal) in expected_tokens {
706 let token = lexer.next_token();
707 assert_token(token, expected_kind, expected_literal);
708 }
709 }
710
711 #[test]
712 fn next_or_token() {
713 let expected_token = Token::new(TokenKind::Or, "||", 0);
714 let input = "||";
715 let mut lexer = Lexer::new(input);
716
717 let token = lexer.next_token();
718
719 assert_eq!(expected_token, token);
720 }
721
722 #[test]
723 fn next_two_character_token() {
724 let input = "+= -= *= /= %= ^= **= ** || && !~ == <= >= != ++ -- >>";
725 let mut lexer = Lexer::new(input);
726
727 let expected_tokens = vec![
728 (TokenKind::AddAssign, "+="),
729 (TokenKind::SubtractAssign, "-="),
730 (TokenKind::MultiplyAssign, "*="),
731 (TokenKind::DivideAssign, "/="),
732 (TokenKind::ModuloAssign, "%="),
733 (TokenKind::PowerAssign, "^="),
734 (TokenKind::PowerAssign, "**="),
735 (TokenKind::Caret, "**"),
736 (TokenKind::Or, "||"),
737 (TokenKind::And, "&&"),
738 (TokenKind::NoMatch, "!~"),
739 (TokenKind::Equal, "=="),
740 (TokenKind::LessThanOrEqual, "<="),
741 (TokenKind::GreaterThanOrEqual, ">="),
742 (TokenKind::NotEqual, "!="),
743 (TokenKind::Increment, "++"),
744 (TokenKind::Decrement, "--"),
745 (TokenKind::Append, ">>"),
746 (TokenKind::Eof, ""),
747 ];
748
749 for (expected_kind, expected_literal) in expected_tokens {
750 let token = lexer.next_token();
751 assert_token(token, expected_kind, expected_literal);
752 }
753 }
754
755 #[test]
756 fn consume_comment() {
757 let input = "# This is a comment\n123";
758 let mut lexer = Lexer::new(input);
759
760 let expected_tokens = vec![
761 (TokenKind::NewLine, "<newline>"),
762 (TokenKind::Number, "123"),
763 (TokenKind::Eof, ""),
764 ];
765
766 for (expected_kind, expected_literal) in expected_tokens {
767 let token = lexer.next_token();
768 assert_token(token, expected_kind, expected_literal);
769 }
770 }
771
772 #[test]
773 fn expect_newline_after_backslash() {
774 let input = "123 \\\n456";
775 let mut lexer = Lexer::new(input);
776
777 let expected_tokens = vec![
778 (TokenKind::Number, "123"),
779 (TokenKind::NewLine, "<newline>"),
780 (TokenKind::Number, "456"),
781 (TokenKind::Eof, ""),
782 ];
783 for (expected_kind, expected_literal) in expected_tokens {
784 let token = lexer.next_token();
785 assert_token(token, expected_kind, expected_literal);
786 }
787 }
788
789 #[test]
790 fn scientific_number_token() {
791 let input = "1E2 12e-2 .75e+1";
792 let mut lexer = Lexer::new(input);
793
794 let expected_tokens = vec![
795 (TokenKind::Number, "1E2"),
796 (TokenKind::Number, "12e-2"),
797 (TokenKind::Number, ".75e+1"),
798 (TokenKind::Eof, ""),
799 ];
800
801 for (expected_kind, expected_literal) in expected_tokens {
802 let token = lexer.next_token();
803 assert_token(token, expected_kind, expected_literal);
804 }
805 }
806
807 #[test]
808 fn expect_newline_after_backslash_with_crlf() {
809 let input = "123 \\\r\n456";
810 let mut lexer = Lexer::new(input);
811
812 let expected_tokens = vec![
813 (TokenKind::Number, "123"),
814 (TokenKind::NewLine, "<newline>"),
815 (TokenKind::Number, "456"),
816 (TokenKind::Eof, ""),
817 ];
818 for (expected_kind, expected_literal) in expected_tokens {
819 let token = lexer.next_token();
820 assert_token(token, expected_kind, expected_literal);
821 }
822 }
823
824 #[test]
825 fn backslash_without_newline_is_illegal() {
826 let input = "123 \\ 456";
827 let mut lexer = Lexer::new(input);
828 let expected_tokens = vec![
829 (TokenKind::Number, "123"),
830 (TokenKind::Illegal, "<illegal>"),
831 (TokenKind::Number, "456"),
832 (TokenKind::Eof, ""),
833 ];
834
835 for (expected_kind, expected_literal) in expected_tokens {
836 let token = lexer.next_token();
837 assert_token(token, expected_kind, expected_literal);
838 }
839 }
840
841 #[test]
842 fn read_string_token() {
843 let input = r#""Hello, World!" 123 "Hello, again!";"#;
844 let mut lexer = Lexer::new(input);
845 let expected_tokens = vec![
846 (TokenKind::String, "Hello, World!"),
847 (TokenKind::Number, "123"),
848 (TokenKind::String, "Hello, again!"),
849 (TokenKind::Semicolon, ";"),
850 (TokenKind::Eof, ""),
851 ];
852
853 for (expected_kind, expected_literal) in expected_tokens {
854 let token = lexer.next_token();
855 assert_token(token, expected_kind, expected_literal);
856 }
857 }
858
859 #[test]
860 fn read_string_token_with_escaped_quote() {
861 let input = r#""\"""#;
862 let mut lexer = Lexer::new(input);
863
864 let token = lexer.next_token();
865 assert_token(token, TokenKind::String, r#"\""#);
866
867 let token = lexer.next_token();
868 assert_token(token, TokenKind::Eof, "");
869 }
870
871 #[test]
872 fn read_regex_token_when_allowed() {
873 let input = r"/foo\//";
874 let mut lexer = Lexer::new(input);
875
876 let token = lexer.next_token_regex_aware();
877
878 assert_token(token, TokenKind::Regex, r"foo\/");
879 }
880
881 #[test]
882 fn slash_is_division_when_regex_not_allowed() {
883 let input = "/foo/";
884 let mut lexer = Lexer::new(input);
885
886 let token = lexer.next_token();
887
888 assert_token(token, TokenKind::Division, "/");
889 }
890
891 #[test]
892 fn unterminated_string_token() {
893 let input = r#""This is an unterminated string"#;
894 let mut lexer = Lexer::new(input);
895
896 let token = lexer.next_token();
897 assert_token(token, TokenKind::Illegal, "This is an unterminated string");
898 }
899
900 #[test]
901 fn lone_ampersand_is_illegal() {
902 let input = "&";
903 let mut lexer = Lexer::new(input);
904
905 let token = lexer.next_token();
906
907 assert_token(token, TokenKind::Illegal, "<illegal>");
908 assert_eq!(1, lexer.errors().len());
909 assert_lex_error(lexer.errors()[0], LexErrorKind::LoneAmpersand, "&", 0);
910 }
911
912 #[test]
913 fn unsupported_character_is_illegal() {
914 let input = "@";
915 let mut lexer = Lexer::new(input);
916
917 let token = lexer.next_token();
918
919 assert_token(token, TokenKind::Illegal, "<illegal>");
920 assert_eq!(1, lexer.errors().len());
921 assert_lex_error(
922 lexer.errors()[0],
923 LexErrorKind::UnsupportedCharacter,
924 "@",
925 0,
926 );
927 }
928
929 #[test]
930 fn unterminated_regex_token_is_illegal() {
931 let input = r"/foo";
932 let mut lexer = Lexer::new(input);
933
934 let token = lexer.next_token_regex_aware();
935
936 assert_token(token, TokenKind::Illegal, "foo");
937 assert_eq!(1, lexer.errors().len());
938 assert_lex_error(lexer.errors()[0], LexErrorKind::UnterminatedRegex, "foo", 1);
939 }
940
941 #[test]
942 fn bare_dot_is_illegal() {
943 let input = ".";
944 let mut lexer = Lexer::new(input);
945
946 let token = lexer.next_token();
947
948 assert_token(token, TokenKind::Illegal, "<illegal>");
949 assert_eq!(1, lexer.errors().len());
950 assert_lex_error(
951 lexer.errors()[0],
952 LexErrorKind::UnsupportedCharacter,
953 ".",
954 0,
955 );
956 }
957
958 #[test]
959 fn backslash_without_newline_records_diagnostic() {
960 let input = "\\";
961 let mut lexer = Lexer::new(input);
962
963 let token = lexer.next_token();
964
965 assert_token(token, TokenKind::Illegal, "<illegal>");
966 assert_eq!(1, lexer.errors().len());
967 assert_lex_error(
968 lexer.errors()[0],
969 LexErrorKind::InvalidLineContinuation,
970 "\\",
971 0,
972 );
973 }
974
975 #[test]
976 fn has_errors_tracks_whether_diagnostics_were_recorded() {
977 let mut clean_lexer = Lexer::new("123");
978 assert!(!clean_lexer.has_errors());
979
980 let token = clean_lexer.next_token();
981 assert_token(token, TokenKind::Number, "123");
982 assert!(!clean_lexer.has_errors());
983
984 let mut error_lexer = Lexer::new("@");
985 assert!(!error_lexer.has_errors());
986
987 let token = error_lexer.next_token();
988 assert_token(token, TokenKind::Illegal, "<illegal>");
989 assert!(error_lexer.has_errors());
990 }
991
992 #[test]
993 fn read_number_without_any_digits_is_invalid_number() {
994 let mut lexer = Lexer::new(".");
995
996 let token = lexer.read_number();
997
998 assert_token(token, TokenKind::Illegal, "<illegal>");
999 assert!(lexer.has_errors());
1000 assert_eq!(1, lexer.errors().len());
1001 assert_lex_error(lexer.errors()[0], LexErrorKind::InvalidNumber, ".", 0);
1002 }
1003
1004 #[test]
1005 fn peek_next_char_returns_none_at_end_of_input() {
1006 let lexer = Lexer::new("a");
1007
1008 assert_eq!(None, lexer.peek_next_char());
1009 }
1010
1011 #[test]
1012 fn rewind_one_is_noop_when_position_is_zero() {
1013 let mut lexer = Lexer::new("a");
1014
1015 lexer.rewind_one();
1016
1017 assert_eq!(0, lexer.position);
1018 assert_eq!(1, lexer.read_position);
1019 assert_eq!(Some(b'a'), lexer.ch);
1020 }
1021
1022 #[test]
1023 fn unterminated_string_records_diagnostic() {
1024 let input = r#""unterminated"#;
1025 let mut lexer = Lexer::new(input);
1026
1027 let token = lexer.next_token();
1028
1029 assert_token(token, TokenKind::Illegal, "unterminated");
1030 assert_eq!(1, lexer.errors().len());
1031 assert_lex_error(
1032 lexer.errors()[0],
1033 LexErrorKind::UnterminatedString,
1034 "unterminated",
1035 1,
1036 );
1037 }
1038
1039 #[test]
1040 fn overflowing_hex_number_records_diagnostic_and_preserves_next_token() {
1041 let input = "0x10000000000000000z";
1042 let mut lexer = Lexer::new(input);
1043
1044 let token = lexer.next_token();
1045 assert_token(token, TokenKind::Illegal, "<illegal>");
1046
1047 let token = lexer.next_token();
1048 assert_token(token, TokenKind::Identifier, "z");
1049
1050 assert_eq!(1, lexer.errors().len());
1051 assert_lex_error(
1052 lexer.errors()[0],
1053 LexErrorKind::InvalidNumber,
1054 "0x10000000000000000",
1055 0,
1056 );
1057 }
1058
1059 #[test]
1060 fn unterminated_regex_before_newline_preserves_newline_token() {
1061 let input = "/foo\n123";
1062 let mut lexer = Lexer::new(input);
1063
1064 let token = lexer.next_token_regex_aware();
1065 assert_token(token, TokenKind::Illegal, "foo");
1066
1067 let token = lexer.next_token_regex_aware();
1068 assert_token(token, TokenKind::NewLine, "<newline>");
1069
1070 let token = lexer.next_token();
1071 assert_token(token, TokenKind::Number, "123");
1072
1073 assert_eq!(1, lexer.errors().len());
1074 assert_lex_error(lexer.errors()[0], LexErrorKind::UnterminatedRegex, "foo", 1);
1075 }
1076
1077 #[test]
1078 fn built_in_functions() {
1079 let input = "atan2 close cos exp gsub index int length log match rand sin split sprintf sqrt srand sub substr system tolower toupper";
1080 let mut lexer = Lexer::new(input);
1081 let expected_tokens = vec![
1082 (TokenKind::Atan2, "atan2"),
1083 (TokenKind::Close, "close"),
1084 (TokenKind::Cos, "cos"),
1085 (TokenKind::Exp, "exp"),
1086 (TokenKind::Gsub, "gsub"),
1087 (TokenKind::Index, "index"),
1088 (TokenKind::Int, "int"),
1089 (TokenKind::Length, "length"),
1090 (TokenKind::Log, "log"),
1091 (TokenKind::Match, "match"),
1092 (TokenKind::Rand, "rand"),
1093 (TokenKind::Sin, "sin"),
1094 (TokenKind::Split, "split"),
1095 (TokenKind::Sprintf, "sprintf"),
1096 (TokenKind::Sqrt, "sqrt"),
1097 (TokenKind::Srand, "srand"),
1098 (TokenKind::Sub, "sub"),
1099 (TokenKind::Substr, "substr"),
1100 (TokenKind::System, "system"),
1101 (TokenKind::ToLower, "tolower"),
1102 (TokenKind::ToUpper, "toupper"),
1103 (TokenKind::Eof, ""),
1104 ];
1105
1106 for (expected_kind, expected_literal) in expected_tokens {
1107 let token = lexer.next_token();
1108 assert_token(token, expected_kind, expected_literal);
1109 }
1110 }
1111
1112 #[test]
1113 fn test_identifiers() {
1114 let input = "my_variable, anotherVar _privateVar var123";
1115 let mut lexer = Lexer::new(input);
1116
1117 let expected_tokens = vec![
1118 (TokenKind::Identifier, "my_variable"),
1119 (TokenKind::Comma, ","),
1120 (TokenKind::Identifier, "anotherVar"),
1121 (TokenKind::Identifier, "_privateVar"),
1122 (TokenKind::Identifier, "var123"),
1123 (TokenKind::Eof, ""),
1124 ];
1125
1126 for (expected_kind, expected_literal) in expected_tokens {
1127 let token = lexer.next_token();
1128 assert_token(token, expected_kind, expected_literal);
1129 }
1130 }
1131
1132 #[test]
1133 fn is_ascii_alphabetic_lowercase() {
1134 assert!(is_ascii_alphabetic(Some(b'a')));
1135 assert!(is_ascii_alphabetic(Some(b'z')));
1136 assert!(is_ascii_alphabetic(Some(b'm')));
1137 }
1138
1139 #[test]
1140 fn is_ascii_alphabetic_uppercase() {
1141 assert!(is_ascii_alphabetic(Some(b'A')));
1142 assert!(is_ascii_alphabetic(Some(b'Z')));
1143 assert!(is_ascii_alphabetic(Some(b'M')));
1144 }
1145
1146 #[test]
1147 fn is_ascii_alphabetic_digits() {
1148 assert!(!is_ascii_alphabetic(Some(b'0')));
1149 assert!(!is_ascii_alphabetic(Some(b'5')));
1150 assert!(!is_ascii_alphabetic(Some(b'9')));
1151 }
1152
1153 #[test]
1154 fn is_ascii_alphabetic_special_chars() {
1155 assert!(!is_ascii_alphabetic(Some(b'!')));
1156 assert!(!is_ascii_alphabetic(Some(b' ')));
1157 assert!(!is_ascii_alphabetic(Some(b'{')));
1158 assert!(!is_ascii_alphabetic(Some(b'=')));
1159 }
1160
1161 #[test]
1162 fn is_ascii_alphabetic_none() {
1163 assert!(!is_ascii_alphabetic(None));
1164 }
1165
1166 #[test]
1167 fn is_whitespace_space() {
1168 assert!(is_whitespace(Some(b' ')), "space is considered whitespace");
1169 assert!(is_whitespace(Some(b'\t')), "tab is considered whitespace");
1170 assert!(
1171 is_whitespace(Some(b'\r')),
1172 "carriage return is considered whitespace"
1173 );
1174 }
1175
1176 #[test]
1177 fn is_whitespace_special_chars() {
1178 assert!(!is_whitespace(Some(b'!')));
1179 assert!(!is_whitespace(Some(b'{')));
1180 assert!(!is_whitespace(Some(b'=')));
1181 }
1182
1183 #[test]
1184 fn is_whitespace_none() {
1185 assert!(!is_whitespace(None));
1186 }
1187
1188 #[test]
1189 fn is_digit_valid() {
1190 assert!(is_digit(Some(b'0')));
1191 assert!(is_digit(Some(b'5')));
1192 assert!(is_digit(Some(b'9')));
1193 }
1194
1195 #[test]
1196 fn is_digit_invalid() {
1197 assert!(!is_digit(Some(b'a')));
1198 assert!(!is_digit(Some(b'z')));
1199 assert!(!is_digit(Some(b'A')));
1200 assert!(!is_digit(Some(b'Z')));
1201 assert!(!is_digit(Some(b'!')));
1202 assert!(!is_digit(Some(b' ')));
1203 assert!(!is_digit(Some(b'{')));
1204 assert!(!is_digit(Some(b'=')));
1205 }
1206
1207 #[test]
1208 fn is_digit_none() {
1209 assert!(!is_digit(None));
1210 }
1211
1212 #[test]
1213 fn is_hex_digit_valid() {
1214 assert!(is_hex_digit(Some(b'0')));
1215 assert!(is_hex_digit(Some(b'5')));
1216 assert!(is_hex_digit(Some(b'9')));
1217 assert!(is_hex_digit(Some(b'a')));
1218 assert!(is_hex_digit(Some(b'f')));
1219 assert!(is_hex_digit(Some(b'A')));
1220 assert!(is_hex_digit(Some(b'F')));
1221 }
1222
1223 #[test]
1224 fn is_hex_digit_invalid() {
1225 assert!(!is_hex_digit(Some(b'g')));
1226 assert!(!is_hex_digit(Some(b'z')));
1227 assert!(!is_hex_digit(Some(b'G')));
1228 assert!(!is_hex_digit(Some(b'Z')));
1229 assert!(!is_hex_digit(Some(b'!')));
1230 assert!(!is_hex_digit(Some(b' ')));
1231 assert!(!is_hex_digit(Some(b'{')));
1232 assert!(!is_hex_digit(Some(b'=')));
1233 assert!(!is_hex_digit(None));
1234 }
1235}