use crate::token::{Token, TokenKind, lookup_functions, lookup_keyword};

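/// A byte-oriented lexer for AWK-style source text.
///
/// `Token`s borrow their literals from the input string, so lexing is
/// zero-copy. `position` is the index of the byte currently held in `ch`,
/// `read_position` is one past it, and `ch` is `None` once the input is
/// exhausted.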
#[derive(Debug)]
pub struct Lexer<'a> {
    input: &'a str,
    position: usize,
    read_position: usize,
    ch: Option<u8>,
}

impl<'a> Lexer<'a> {
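    /// Creates a lexer over `src`, primed on the first byte.
    ///
    /// A minimal usage sketch (imports elided, hence `ignore`):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new("x += 1");
    /// assert_eq!(lexer.next_token().kind, TokenKind::Identifier);
    /// assert_eq!(lexer.next_token().kind, TokenKind::AddAssign);
    /// assert_eq!(lexer.next_token().kind, TokenKind::Number);
    /// assert_eq!(lexer.next_token().kind, TokenKind::Eof);
    /// ```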
    pub fn new(src: &'a str) -> Self {
        let mut lexer = Lexer {
            input: src,
            position: 0,
            read_position: 0,
            ch: None,
        };

        lexer.read_char();
        lexer
    }

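    /// Returns the next token, treating `/` as division (`/` or `/=`).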
    pub fn next_token(&mut self) -> Token<'a> {
        self.next_token_impl(false)
    }

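    /// Returns the next token, treating `/` as the start of a regex literal
    /// rather than division. Intended for positions where a regex literal
    /// is grammatically possible.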
    pub fn next_token_regex_aware(&mut self) -> Token<'a> {
        self.next_token_impl(true)
    }

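    /// Shared implementation behind both public entry points. Every match
    /// arm leaves `self.ch` on the last byte of its token; the single
    /// `read_char` at the bottom then advances past it. Readers that scan
    /// ahead (identifiers, numbers) overshoot by one byte and compensate
    /// with `rewind_one`.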
    fn next_token_impl(&mut self, allow_regex: bool) -> Token<'a> {
        self.skip_whitespace();
        self.skip_comment();

        let start = self.position;
        let token = match self.ch {
            Some(b'{') => Token::new(TokenKind::LeftCurlyBrace, "{", start),
            Some(b'}') => Token::new(TokenKind::RightCurlyBrace, "}", start),
            Some(b'(') => Token::new(TokenKind::LeftParen, "(", start),
            Some(b')') => Token::new(TokenKind::RightParen, ")", start),
            Some(b'[') => Token::new(TokenKind::LeftSquareBracket, "[", start),
            Some(b']') => Token::new(TokenKind::RightSquareBracket, "]", start),
            Some(b',') => Token::new(TokenKind::Comma, ",", start),
            Some(b';') => Token::new(TokenKind::Semicolon, ";", start),
            Some(b'\n') => Token::new(TokenKind::NewLine, "<newline>", start),
            Some(b'+') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::AddAssign, "+=", start)
                } else if self.peek_char() == Some(b'+') {
                    self.read_char();
                    Token::new(TokenKind::Increment, "++", start)
                } else {
                    Token::new(TokenKind::Plus, "+", start)
                }
            }
            Some(b'-') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::SubtractAssign, "-=", start)
                } else if self.peek_char() == Some(b'-') {
                    self.read_char();
                    Token::new(TokenKind::Decrement, "--", start)
                } else {
                    Token::new(TokenKind::Minus, "-", start)
                }
            }
            Some(b'*') => {
                if self.peek_char() == Some(b'*') {
                    if self.peek_next_char() == Some(b'=') {
                        self.read_char();
                        self.read_char();
                        Token::new(TokenKind::PowerAssign, "**=", start)
                    } else {
                        self.read_char();
                        Token::new(TokenKind::Caret, "**", start)
                    }
                } else if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::MultiplyAssign, "*=", start)
                } else {
                    Token::new(TokenKind::Asterisk, "*", start)
                }
            }
            Some(b'%') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::ModuloAssign, "%=", start)
                } else {
                    Token::new(TokenKind::Percent, "%", start)
                }
            }
            Some(b'^') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::PowerAssign, "^=", start)
                } else {
                    Token::new(TokenKind::Caret, "^", start)
                }
            }
            Some(b'!') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::NotEqual, "!=", start)
                } else if self.peek_char() == Some(b'~') {
                    self.read_char();
                    Token::new(TokenKind::NoMatch, "!~", start)
                } else {
                    Token::new(TokenKind::ExclamationMark, "!", start)
                }
            }
            Some(b'>') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::GreaterThanOrEqual, ">=", start)
                } else if self.peek_char() == Some(b'>') {
                    self.read_char();
                    Token::new(TokenKind::Append, ">>", start)
                } else {
                    Token::new(TokenKind::GreaterThan, ">", start)
                }
            }
            Some(b'<') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::LessThanOrEqual, "<=", start)
                } else {
                    Token::new(TokenKind::LessThan, "<", start)
                }
            }
            Some(b'|') => {
                if self.peek_char() == Some(b'|') {
                    self.read_char();
                    Token::new(TokenKind::Or, "||", start)
                } else {
                    Token::new(TokenKind::Pipe, "|", start)
                }
            }
            Some(b'?') => Token::new(TokenKind::QuestionMark, "?", start),
            Some(b':') => Token::new(TokenKind::Colon, ":", start),
            Some(b'~') => Token::new(TokenKind::Tilde, "~", start),
            Some(b'$') => Token::new(TokenKind::DollarSign, "$", start),
            Some(b'=') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::Equal, "==", start)
                } else {
                    Token::new(TokenKind::Assign, "=", start)
                }
            }
            Some(b'/') => {
                if allow_regex {
                    self.read_regex()
                } else if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token::new(TokenKind::DivideAssign, "/=", start)
                } else {
                    Token::new(TokenKind::Division, "/", start)
                }
            }
            Some(b'&') => {
                if self.peek_char() == Some(b'&') {
                    self.read_char();
                    Token::new(TokenKind::And, "&&", start)
                } else {
                    // A lone `&` is not a valid token.
                    Token::new(TokenKind::Illegal, "<illegal>", start)
                }
            }
            Some(b'\\') => {
                // A backslash is only accepted immediately before a line
                // break (`\n` or `\r\n`); anything else is illegal.
                if self.peek_char() == Some(b'\n') {
                    self.read_char();
                    Token::new(TokenKind::NewLine, "<newline>", start)
                } else if self.peek_char() == Some(b'\r')
                    && self.peek_next_char() == Some(b'\n')
                {
                    self.read_char();
                    self.read_char();
                    Token::new(TokenKind::NewLine, "<newline>", start)
                } else {
                    Token::new(TokenKind::Illegal, "<illegal>", start)
                }
            }
            Some(b'"') => self.read_string(),
            ch if is_ascii_alphabetic(ch) || ch == Some(b'_') => self.read_identifier(),
            ch if is_digit(ch) => self.read_number(),
            Some(b'.') if is_digit(self.peek_char()) => self.read_number(),
            None => return Token::new(TokenKind::Eof, "", start),
            _ => Token::new(TokenKind::Illegal, "<illegal>", start),
        };

        self.read_char();

        token
    }

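    /// Advances one byte: `ch` becomes the byte at `read_position`, or
    /// `None` at end of input.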
    fn read_char(&mut self) {
        if self.read_position >= self.input.len() {
            self.ch = None;
        } else {
            self.ch = Some(self.input.as_bytes()[self.read_position]);
        }
        self.position = self.read_position;
        self.read_position += 1;
    }

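    /// Reads an identifier (`[A-Za-z_][A-Za-z0-9_]*`) and classifies it as a
    /// keyword, a built-in function name, or a plain identifier.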
    fn read_identifier(&mut self) -> Token<'a> {
        let position = self.position;
        while is_ascii_alphabetic(self.ch) || is_digit(self.ch) || self.ch == Some(b'_') {
            self.read_char();
        }
        let literal = &self.input[position..self.position];

        let token = if let Some(token_kind) = lookup_keyword(literal) {
            Token::new(token_kind, literal, position)
        } else if let Some(token_kind) = lookup_functions(literal) {
            Token::new(token_kind, literal, position)
        } else {
            Token::new(TokenKind::Identifier, literal, position)
        };

        self.rewind_one();
        token
    }

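    /// Reads a numeric literal: decimal integers, hex (`0x…`), fractions
    /// with a leading or embedded dot (`.75`, `42.0`), and exponent forms
    /// (`1E2`, `12e-2`). Returns an `Illegal` token if no digit was
    /// consumed or if a hex literal overflows `u64`.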
    fn read_number(&mut self) -> Token<'a> {
        let position = self.position;
        let mut got_digit = false;

        // A bare `0x`/`0X` with no hex digit after it is not a hex literal;
        // lex just the leading `0` and leave the `x` for the next token.
        if self.ch == Some(b'0')
            && matches!(self.peek_char(), Some(b'x') | Some(b'X'))
            && !is_hex_digit(self.peek_next_char())
        {
            let literal = &self.input[position..position + 1];
            return Token::new(TokenKind::Number, literal, position);
        }

        if self.ch != Some(b'.') {
            got_digit = true;

            if self.ch == Some(b'0')
                && matches!(self.peek_char(), Some(b'x') | Some(b'X'))
                && is_hex_digit(self.peek_next_char())
            {
                self.read_char();
                self.read_char();

                while matches!(
                    self.ch,
                    Some(b'0'..=b'9') | Some(b'a'..=b'f') | Some(b'A'..=b'F')
                ) {
                    self.read_char();
                }

                let literal = &self.input[position..self.position];
                match u64::from_str_radix(&literal[2..], 16) {
                    Ok(_) => {
                        let token = Token::new(TokenKind::Number, literal, position);
                        self.rewind_one();
                        return token;
                    }
                    Err(_) => {
                        // The literal does not fit in a u64; reject it.
                        return Token::new(TokenKind::Illegal, "<illegal>", position);
                    }
                }
            }

            while is_digit(self.ch) {
                self.read_char();
            }

            if self.ch == Some(b'.') {
                self.read_char();
            }
        } else {
            self.read_char();
        }

        // Fractional digits (also the only digits of a leading-dot literal
        // such as `.75`).
        while is_digit(self.ch) {
            got_digit = true;

            self.read_char();
        }

        // Optional exponent: only consume `e`/`E` if a digit actually
        // follows the (optional) sign, so `12e` lexes as `12` then `e`.
        if matches!(self.ch, Some(b'e') | Some(b'E')) {
            let exponent_sign = self.peek_char();
            let exponent_digit = if matches!(exponent_sign, Some(b'+') | Some(b'-')) {
                self.peek_next_char()
            } else {
                exponent_sign
            };

            if is_digit(exponent_digit) {
                self.read_char();
                if matches!(self.ch, Some(b'+') | Some(b'-')) {
                    self.read_char();
                }
                while is_digit(self.ch) {
                    self.read_char();
                }
            }
        }

        if !got_digit {
            return Token::new(TokenKind::Illegal, "<illegal>", position);
        }

        let literal = &self.input[position..self.position];

        let token = Token::new(TokenKind::Number, literal, position);
        self.rewind_one();
        token
    }

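    /// Reads a string literal after an opening `"`, honoring backslash
    /// escapes. The returned literal excludes the quotes; an unterminated
    /// string yields an `Illegal` token carrying the partial contents.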
    fn read_string(&mut self) -> Token<'a> {
        self.read_char();

        let position = self.position;
        let mut escaped = false;

        while let Some(ch) = self.ch {
            if !escaped && ch == b'"' {
                break;
            }
            // A backslash escapes the next byte; two backslashes cancel out.
            escaped = !escaped && ch == b'\\';
            self.read_char();
        }

        let literal = &self.input[position..self.position];

        if self.ch != Some(b'"') {
            return Token::new(TokenKind::Illegal, literal, position);
        }

        Token::new(TokenKind::String, literal, position)
    }

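    /// Reads a regex literal after an opening `/`, honoring backslash
    /// escapes. A newline or end of input before the closing `/` yields an
    /// `Illegal` token carrying the partial pattern.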
    fn read_regex(&mut self) -> Token<'a> {
        self.read_char();

        let position = self.position;
        let mut escaped = false;

        while let Some(ch) = self.ch {
            if !escaped && ch == b'/' {
                break;
            }
            if !escaped && ch == b'\n' {
                break;
            }
            escaped = !escaped && ch == b'\\';

            self.read_char();
        }

        let literal = &self.input[position..self.position];

        if self.ch != Some(b'/') {
            return Token::new(TokenKind::Illegal, literal, position);
        }

        Token::new(TokenKind::Regex, literal, position)
    }

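    /// Skips spaces, tabs, and carriage returns. Newlines are significant
    /// (they become `NewLine` tokens), so they are not skipped here.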
    fn skip_whitespace(&mut self) {
        while is_whitespace(self.ch) {
            self.read_char();
        }
    }

    fn skip_comment(&mut self) {
        if Some(b'#') == self.ch {
            while self.ch != Some(b'\n') && self.ch.is_some() {
                self.read_char();
            }
        }
    }

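    /// Looks one byte ahead of `ch` without consuming anything;
    /// `peek_next_char` looks two bytes ahead.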
    fn peek_char(&self) -> Option<u8> {
        if self.read_position >= self.input.len() {
            None
        } else {
            Some(self.input.as_bytes()[self.read_position])
        }
    }

    fn peek_next_char(&self) -> Option<u8> {
        let next = self.read_position + 1;
        if next >= self.input.len() {
            None
        } else {
            Some(self.input.as_bytes()[next])
        }
    }

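    /// Steps the cursor back one byte. Scanners that overshoot their token
    /// call this so the unconditional `read_char` at the end of
    /// `next_token_impl` lands on the first unconsumed byte.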
    fn rewind_one(&mut self) {
        if self.position == 0 {
            return;
        }
        self.read_position = self.position;
        self.position -= 1;
        self.ch = Some(self.input.as_bytes()[self.position]);
    }
}

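// Byte-classification helpers over the lexer's `Option<u8>` cursor; `None`
// (end of input) never matches.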
fn is_ascii_alphabetic(ch: Option<u8>) -> bool {
    ch.is_some_and(|byte| byte.is_ascii_alphabetic())
}

fn is_whitespace(ch: Option<u8>) -> bool {
    // `\n` is deliberately excluded: newlines are significant tokens.
    ch.is_some_and(|byte| byte == b' ' || byte == b'\t' || byte == b'\r')
}

fn is_digit(ch: Option<u8>) -> bool {
    ch.is_some_and(|byte| byte.is_ascii_digit())
}

fn is_hex_digit(ch: Option<u8>) -> bool {
    ch.is_some_and(|byte| byte.is_ascii_hexdigit())
}

#[cfg(test)]
mod tests {
    use super::*;

    fn assert_token(token: Token<'_>, kind: TokenKind, literal: &str) {
        assert_eq!(kind, token.kind);
        assert_eq!(literal, token.literal);
    }

    #[test]
    fn empty_input_returns_eof_token() {
        let input = "";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_token(token, TokenKind::Eof, "");
    }

    #[test]
    fn next_left_curly_brace_token() {
        let expected_token = Token::new(TokenKind::LeftCurlyBrace, "{", 0);
        let input = "{";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected_token, token);
    }

    #[test]
    fn next_right_curly_brace_token() {
        let input = "}";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_token(token, TokenKind::RightCurlyBrace, "}");
    }

    #[test]
    fn next_pipe_token() {
        let input = "|";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_token(token, TokenKind::Pipe, "|");
    }

    #[test]
    fn next_one_character_token() {
        let input = "{}()[],;\n+-*/%^!><|?:~$=";
        let mut lexer = Lexer::new(input);
        let expected_tokens = vec![
            (TokenKind::LeftCurlyBrace, "{"),
            (TokenKind::RightCurlyBrace, "}"),
            (TokenKind::LeftParen, "("),
            (TokenKind::RightParen, ")"),
            (TokenKind::LeftSquareBracket, "["),
            (TokenKind::RightSquareBracket, "]"),
            (TokenKind::Comma, ","),
            (TokenKind::Semicolon, ";"),
            (TokenKind::NewLine, "<newline>"),
            (TokenKind::Plus, "+"),
            (TokenKind::Minus, "-"),
            (TokenKind::Asterisk, "*"),
            (TokenKind::Division, "/"),
            (TokenKind::Percent, "%"),
            (TokenKind::Caret, "^"),
            (TokenKind::ExclamationMark, "!"),
            (TokenKind::GreaterThan, ">"),
            (TokenKind::LessThan, "<"),
            (TokenKind::Pipe, "|"),
            (TokenKind::QuestionMark, "?"),
            (TokenKind::Colon, ":"),
            (TokenKind::Tilde, "~"),
            (TokenKind::DollarSign, "$"),
            (TokenKind::Assign, "="),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn next_while_token() {
        let expected_token = Token::new(TokenKind::While, "while", 1);
        let input = " while";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected_token, token);
    }

    #[test]
    fn next_keyword_token() {
        let input = "BEGIN END break continue delete do else exit for function if in next print printf return while";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Begin, "BEGIN"),
            (TokenKind::End, "END"),
            (TokenKind::Break, "break"),
            (TokenKind::Continue, "continue"),
            (TokenKind::Delete, "delete"),
            (TokenKind::Do, "do"),
            (TokenKind::Else, "else"),
            (TokenKind::Exit, "exit"),
            (TokenKind::For, "for"),
            (TokenKind::Function, "function"),
            (TokenKind::If, "if"),
            (TokenKind::In, "in"),
            (TokenKind::Next, "next"),
            (TokenKind::Print, "print"),
            (TokenKind::Printf, "printf"),
            (TokenKind::Return, "return"),
            (TokenKind::While, "while"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn next_number_token() {
        let input = "123 4567 890 42.0 .75 0.001";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Number, "123"),
            (TokenKind::Number, "4567"),
            (TokenKind::Number, "890"),
            (TokenKind::Number, "42.0"),
            (TokenKind::Number, ".75"),
            (TokenKind::Number, "0.001"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn hex_number_token() {
        let input = "0xAA 0xaa 0xFEED 0xBEAF";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Number, "0xAA"),
            (TokenKind::Number, "0xaa"),
            (TokenKind::Number, "0xFEED"),
            (TokenKind::Number, "0xBEAF"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn invalid_hex_number_token() {
        let input = "0xG1 5x03";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Number, "0"),
            (TokenKind::Identifier, "xG1"),
            (TokenKind::Number, "5"),
            (TokenKind::Identifier, "x03"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn next_or_token() {
        let expected_token = Token::new(TokenKind::Or, "||", 0);
        let input = "||";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected_token, token);
    }

    #[test]
    fn next_two_character_token() {
        let input = "+= -= *= /= %= ^= **= ** || && !~ == <= >= != ++ -- >>";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::AddAssign, "+="),
            (TokenKind::SubtractAssign, "-="),
            (TokenKind::MultiplyAssign, "*="),
            (TokenKind::DivideAssign, "/="),
            (TokenKind::ModuloAssign, "%="),
            (TokenKind::PowerAssign, "^="),
            (TokenKind::PowerAssign, "**="),
            (TokenKind::Caret, "**"),
            (TokenKind::Or, "||"),
            (TokenKind::And, "&&"),
            (TokenKind::NoMatch, "!~"),
            (TokenKind::Equal, "=="),
            (TokenKind::LessThanOrEqual, "<="),
            (TokenKind::GreaterThanOrEqual, ">="),
            (TokenKind::NotEqual, "!="),
            (TokenKind::Increment, "++"),
            (TokenKind::Decrement, "--"),
            (TokenKind::Append, ">>"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn consume_comment() {
        let input = "# This is a comment\n123";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::NewLine, "<newline>"),
            (TokenKind::Number, "123"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn expect_newline_after_backslash() {
        let input = "123 \\\n456";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Number, "123"),
            (TokenKind::NewLine, "<newline>"),
            (TokenKind::Number, "456"),
            (TokenKind::Eof, ""),
        ];
        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn scientific_number_token() {
        let input = "1E2 12e-2 .75e+1";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Number, "1E2"),
            (TokenKind::Number, "12e-2"),
            (TokenKind::Number, ".75e+1"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn expect_newline_after_backslash_with_crlf() {
        let input = "123 \\\r\n456";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Number, "123"),
            (TokenKind::NewLine, "<newline>"),
            (TokenKind::Number, "456"),
            (TokenKind::Eof, ""),
        ];
        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn backslash_without_newline_is_illegal() {
        let input = "123 \\ 456";
        let mut lexer = Lexer::new(input);
        let expected_tokens = vec![
            (TokenKind::Number, "123"),
            (TokenKind::Illegal, "<illegal>"),
            (TokenKind::Number, "456"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn read_string_token() {
        let input = r#""Hello, World!" 123 "Hello, again!";"#;
        let mut lexer = Lexer::new(input);
        let expected_tokens = vec![
            (TokenKind::String, "Hello, World!"),
            (TokenKind::Number, "123"),
            (TokenKind::String, "Hello, again!"),
            (TokenKind::Semicolon, ";"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn read_string_token_with_escaped_quote() {
        let input = r#""\"""#;
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();
        assert_token(token, TokenKind::String, r#"\""#);

        let token = lexer.next_token();
        assert_token(token, TokenKind::Eof, "");
    }

    #[test]
    fn read_regex_token_when_allowed() {
        let input = r"/foo\//";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token_regex_aware();

        assert_token(token, TokenKind::Regex, r"foo\/");
    }

    #[test]
    fn slash_is_division_when_regex_not_allowed() {
        let input = "/foo/";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_token(token, TokenKind::Division, "/");
    }

    #[test]
    fn unterminated_string_token() {
        let input = r#""This is an unterminated string"#;
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();
        assert_token(token, TokenKind::Illegal, "This is an unterminated string");
    }

    #[test]
    fn unsupported_character_is_illegal() {
        let input = "@";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_token(token, TokenKind::Illegal, "<illegal>");
    }

    #[test]
    fn unterminated_regex_token_is_illegal() {
        let input = r"/foo";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token_regex_aware();

        assert_token(token, TokenKind::Illegal, "foo");
    }

    #[test]
    fn bare_dot_is_illegal() {
        let input = ".";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_token(token, TokenKind::Illegal, "<illegal>");
    }

    #[test]
    fn built_in_functions() {
        let input = "atan2 close cos exp gsub index int length log match rand sin split sprintf sqrt srand sub substr system tolower toupper";
        let mut lexer = Lexer::new(input);
        let expected_tokens = vec![
            (TokenKind::Atan2, "atan2"),
            (TokenKind::Close, "close"),
            (TokenKind::Cos, "cos"),
            (TokenKind::Exp, "exp"),
            (TokenKind::Gsub, "gsub"),
            (TokenKind::Index, "index"),
            (TokenKind::Int, "int"),
            (TokenKind::Length, "length"),
            (TokenKind::Log, "log"),
            (TokenKind::Match, "match"),
            (TokenKind::Rand, "rand"),
            (TokenKind::Sin, "sin"),
            (TokenKind::Split, "split"),
            (TokenKind::Sprintf, "sprintf"),
            (TokenKind::Sqrt, "sqrt"),
            (TokenKind::Srand, "srand"),
            (TokenKind::Sub, "sub"),
            (TokenKind::Substr, "substr"),
            (TokenKind::System, "system"),
            (TokenKind::ToLower, "tolower"),
            (TokenKind::ToUpper, "toupper"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn identifier_tokens() {
        let input = "my_variable, anotherVar _privateVar var123";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            (TokenKind::Identifier, "my_variable"),
            (TokenKind::Comma, ","),
            (TokenKind::Identifier, "anotherVar"),
            (TokenKind::Identifier, "_privateVar"),
            (TokenKind::Identifier, "var123"),
            (TokenKind::Eof, ""),
        ];

        for (expected_kind, expected_literal) in expected_tokens {
            let token = lexer.next_token();
            assert_token(token, expected_kind, expected_literal);
        }
    }

    #[test]
    fn is_ascii_alphabetic_lowercase() {
        assert!(is_ascii_alphabetic(Some(b'a')));
        assert!(is_ascii_alphabetic(Some(b'z')));
        assert!(is_ascii_alphabetic(Some(b'm')));
    }

    #[test]
    fn is_ascii_alphabetic_uppercase() {
        assert!(is_ascii_alphabetic(Some(b'A')));
        assert!(is_ascii_alphabetic(Some(b'Z')));
        assert!(is_ascii_alphabetic(Some(b'M')));
    }

    #[test]
    fn is_ascii_alphabetic_digits() {
        assert!(!is_ascii_alphabetic(Some(b'0')));
        assert!(!is_ascii_alphabetic(Some(b'5')));
        assert!(!is_ascii_alphabetic(Some(b'9')));
    }

    #[test]
    fn is_ascii_alphabetic_special_chars() {
        assert!(!is_ascii_alphabetic(Some(b'!')));
        assert!(!is_ascii_alphabetic(Some(b' ')));
        assert!(!is_ascii_alphabetic(Some(b'{')));
        assert!(!is_ascii_alphabetic(Some(b'=')));
    }

    #[test]
    fn is_ascii_alphabetic_none() {
        assert!(!is_ascii_alphabetic(None));
    }

    #[test]
    fn is_whitespace_space() {
        assert!(is_whitespace(Some(b' ')), "space is considered whitespace");
        assert!(is_whitespace(Some(b'\t')), "tab is considered whitespace");
        assert!(
            is_whitespace(Some(b'\r')),
            "carriage return is considered whitespace"
        );
    }

    #[test]
    fn is_whitespace_special_chars() {
        assert!(!is_whitespace(Some(b'!')));
        assert!(!is_whitespace(Some(b'{')));
        assert!(!is_whitespace(Some(b'=')));
    }

    #[test]
    fn is_whitespace_none() {
        assert!(!is_whitespace(None));
    }

    #[test]
    fn is_digit_valid() {
        assert!(is_digit(Some(b'0')));
        assert!(is_digit(Some(b'5')));
        assert!(is_digit(Some(b'9')));
    }

    #[test]
    fn is_digit_invalid() {
        assert!(!is_digit(Some(b'a')));
        assert!(!is_digit(Some(b'z')));
        assert!(!is_digit(Some(b'A')));
        assert!(!is_digit(Some(b'Z')));
        assert!(!is_digit(Some(b'!')));
        assert!(!is_digit(Some(b' ')));
        assert!(!is_digit(Some(b'{')));
        assert!(!is_digit(Some(b'=')));
    }

    #[test]
    fn is_digit_none() {
        assert!(!is_digit(None));
    }

    #[test]
    fn is_hex_digit_valid() {
        assert!(is_hex_digit(Some(b'0')));
        assert!(is_hex_digit(Some(b'5')));
        assert!(is_hex_digit(Some(b'9')));
        assert!(is_hex_digit(Some(b'a')));
        assert!(is_hex_digit(Some(b'f')));
        assert!(is_hex_digit(Some(b'A')));
        assert!(is_hex_digit(Some(b'F')));
    }

    #[test]
    fn is_hex_digit_invalid() {
        assert!(!is_hex_digit(Some(b'g')));
        assert!(!is_hex_digit(Some(b'z')));
        assert!(!is_hex_digit(Some(b'G')));
        assert!(!is_hex_digit(Some(b'Z')));
        assert!(!is_hex_digit(Some(b'!')));
        assert!(!is_hex_digit(Some(b' ')));
        assert!(!is_hex_digit(Some(b'{')));
        assert!(!is_hex_digit(Some(b'=')));
        assert!(!is_hex_digit(None));
    }
}