1mod tokens;
2
3pub use tokens::{Token, TokenKind, keyword_to_token};
4
5use crate::error::{Error, Result};
6
/// Streaming lexer for AWK-style source text.
///
/// Produces tokens one at a time via `next_token` or all at once via
/// `tokenize`. Tracks line/column for error reporting and remembers whether
/// the previous token can end an expression, which disambiguates `/` as
/// division versus the start of a regex literal.
pub struct Lexer<'a> {
    // Full input text; number and identifier lexemes are sliced out of it.
    source: &'a str,
    // Cursor of (byte_offset, char) pairs over `source`.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    // 1-based line of the next unconsumed character.
    line: usize,
    // 1-based column of the next unconsumed character.
    column: usize,
    // True when the previously emitted token can end an expression
    // (identifier, number, `)`, ...). Used by the `/` disambiguation.
    last_token_produces_value: bool,
}
15
16impl<'a> Lexer<'a> {
17 pub fn new(source: &'a str) -> Self {
18 Self {
19 source,
20 chars: source.char_indices().peekable(),
21 line: 1,
22 column: 1,
23 last_token_produces_value: false,
24 }
25 }
26
27 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
29 let estimated_tokens = self.source.len() / 4 + 1;
31 let mut tokens = Vec::with_capacity(estimated_tokens.min(1024));
32
33 loop {
34 let token = self.next_token()?;
35 let is_eof = matches!(token.kind, TokenKind::Eof);
36 tokens.push(token);
37 if is_eof {
38 break;
39 }
40 }
41 Ok(tokens)
42 }
43
44 pub fn next_token(&mut self) -> Result<Token> {
46 self.skip_whitespace_and_comments();
47
48 let (line, col) = (self.line, self.column);
49
50 let Some((_pos, ch)) = self.peek_char() else {
51 return Ok(Token::new(TokenKind::Eof, line, col));
52 };
53
54 let token = match ch {
55 '\n' => {
57 self.advance();
58 Token::new(TokenKind::Newline, line, col)
59 }
60
61 '"' => self.scan_string()?,
63
64 '/' => {
66 if self.last_token_produces_value {
67 self.advance();
68 if self.peek_char_is('=') {
69 self.advance();
70 Token::new(TokenKind::SlashAssign, line, col)
71 } else {
72 Token::new(TokenKind::Slash, line, col)
73 }
74 } else {
75 self.scan_regex()?
76 }
77 }
78
79 '0'..='9' | '.' if ch == '.' && self.peek_next_is_digit() => self.scan_number()?,
81 '0'..='9' => self.scan_number()?,
82
83 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
85
86 '+' => {
88 self.advance();
89 if self.peek_char_is('+') {
90 self.advance();
91 Token::new(TokenKind::Increment, line, col)
92 } else if self.peek_char_is('=') {
93 self.advance();
94 Token::new(TokenKind::PlusAssign, line, col)
95 } else {
96 Token::new(TokenKind::Plus, line, col)
97 }
98 }
99 '-' => {
100 self.advance();
101 if self.peek_char_is('-') {
102 self.advance();
103 Token::new(TokenKind::Decrement, line, col)
104 } else if self.peek_char_is('=') {
105 self.advance();
106 Token::new(TokenKind::MinusAssign, line, col)
107 } else {
108 Token::new(TokenKind::Minus, line, col)
109 }
110 }
111 '*' => {
112 self.advance();
113 if self.peek_char_is('=') {
114 self.advance();
115 Token::new(TokenKind::StarAssign, line, col)
116 } else {
117 Token::new(TokenKind::Star, line, col)
118 }
119 }
120 '%' => {
121 self.advance();
122 if self.peek_char_is('=') {
123 self.advance();
124 Token::new(TokenKind::PercentAssign, line, col)
125 } else {
126 Token::new(TokenKind::Percent, line, col)
127 }
128 }
129 '^' => {
130 self.advance();
131 if self.peek_char_is('=') {
132 self.advance();
133 Token::new(TokenKind::CaretAssign, line, col)
134 } else {
135 Token::new(TokenKind::Caret, line, col)
136 }
137 }
138 '<' => {
139 self.advance();
140 if self.peek_char_is('=') {
141 self.advance();
142 Token::new(TokenKind::LessEqual, line, col)
143 } else {
144 Token::new(TokenKind::Less, line, col)
145 }
146 }
147 '>' => {
148 self.advance();
149 if self.peek_char_is('=') {
150 self.advance();
151 Token::new(TokenKind::GreaterEqual, line, col)
152 } else if self.peek_char_is('>') {
153 self.advance();
154 Token::new(TokenKind::Append, line, col)
155 } else {
156 Token::new(TokenKind::Greater, line, col)
157 }
158 }
159 '=' => {
160 self.advance();
161 if self.peek_char_is('=') {
162 self.advance();
163 Token::new(TokenKind::Equal, line, col)
164 } else {
165 Token::new(TokenKind::Assign, line, col)
166 }
167 }
168 '!' => {
169 self.advance();
170 if self.peek_char_is('=') {
171 self.advance();
172 Token::new(TokenKind::NotEqual, line, col)
173 } else if self.peek_char_is('~') {
174 self.advance();
175 Token::new(TokenKind::NotMatch, line, col)
176 } else {
177 Token::new(TokenKind::Not, line, col)
178 }
179 }
180 '~' => {
181 self.advance();
182 Token::new(TokenKind::Match, line, col)
183 }
184 '&' => {
185 self.advance();
186 if self.peek_char_is('&') {
187 self.advance();
188 Token::new(TokenKind::And, line, col)
189 } else {
190 return Err(Error::lexer(
191 "unexpected '&', did you mean '&&'?",
192 line,
193 col,
194 ));
195 }
196 }
197 '|' => {
198 self.advance();
199 if self.peek_char_is('|') {
200 self.advance();
201 Token::new(TokenKind::Or, line, col)
202 } else {
203 Token::new(TokenKind::Pipe, line, col)
204 }
205 }
206 '$' => {
207 self.advance();
208 Token::new(TokenKind::Dollar, line, col)
209 }
210 '?' => {
211 self.advance();
212 Token::new(TokenKind::Question, line, col)
213 }
214 ':' => {
215 self.advance();
216 Token::new(TokenKind::Colon, line, col)
217 }
218 '(' => {
219 self.advance();
220 Token::new(TokenKind::LeftParen, line, col)
221 }
222 ')' => {
223 self.advance();
224 Token::new(TokenKind::RightParen, line, col)
225 }
226 '{' => {
227 self.advance();
228 Token::new(TokenKind::LeftBrace, line, col)
229 }
230 '}' => {
231 self.advance();
232 Token::new(TokenKind::RightBrace, line, col)
233 }
234 '[' => {
235 self.advance();
236 Token::new(TokenKind::LeftBracket, line, col)
237 }
238 ']' => {
239 self.advance();
240 Token::new(TokenKind::RightBracket, line, col)
241 }
242 ';' => {
243 self.advance();
244 Token::new(TokenKind::Semicolon, line, col)
245 }
246 ',' => {
247 self.advance();
248 Token::new(TokenKind::Comma, line, col)
249 }
250
251 _ => {
252 return Err(Error::lexer(
253 format!("unexpected character '{}'", ch),
254 line,
255 col,
256 ));
257 }
258 };
259
260 self.last_token_produces_value = token.kind.produces_value();
261 Ok(token)
262 }
263
    /// Returns the next (byte_offset, char) pair without consuming it.
    fn peek_char(&mut self) -> Option<(usize, char)> {
        self.chars.peek().copied()
    }
267
268 fn peek_char_is(&mut self, expected: char) -> bool {
269 self.chars
270 .peek()
271 .map(|(_, c)| *c == expected)
272 .unwrap_or(false)
273 }
274
275 fn peek_next_is_digit(&self) -> bool {
276 let mut chars = self.chars.clone();
277 chars.next(); chars
279 .next()
280 .map(|(_, c)| c.is_ascii_digit())
281 .unwrap_or(false)
282 }
283
284 fn advance(&mut self) -> Option<(usize, char)> {
285 let result = self.chars.next();
286 if let Some((_, ch)) = result {
287 if ch == '\n' {
288 self.line += 1;
289 self.column = 1;
290 } else {
291 self.column += 1;
292 }
293 }
294 result
295 }
296
297 fn skip_whitespace_and_comments(&mut self) {
298 loop {
299 match self.peek_char() {
300 Some((_, ' ' | '\t' | '\r')) => {
301 self.advance();
302 }
303 Some((_, '\\')) => {
304 let mut chars = self.chars.clone();
306 chars.next();
307 if chars.peek().map(|(_, c)| *c == '\n').unwrap_or(false) {
308 self.advance(); self.advance(); } else {
311 break;
312 }
313 }
314 Some((_, '#')) => {
315 while let Some((_, ch)) = self.peek_char() {
317 if ch == '\n' {
318 break;
319 }
320 self.advance();
321 }
322 }
323 _ => break,
324 }
325 }
326 }
327
    /// Scans a double-quoted string literal; the opening `"` has not yet been
    /// consumed. Escape sequences are decoded into the returned value.
    ///
    /// Supported escapes: \n \t \r \b \f \a \v \\ \" \/ plus \xHH (up to two
    /// hex digits) and \NNN (up to three octal digits). An unknown escape
    /// drops the backslash and keeps the character (so "\q" becomes "q").
    ///
    /// # Errors
    /// Returns a lexer error for an unterminated string or a raw newline
    /// inside the literal.
    fn scan_string(&mut self) -> Result<Token> {
        let (line, col) = (self.line, self.column);
        // Consume the opening quote.
        self.advance();
        let mut value = String::new();

        loop {
            match self.advance() {
                Some((_, '"')) => break,
                Some((_, '\\')) => {
                    // The backslash is consumed; decide by the character that
                    // follows (peeked, so each branch advances explicitly).
                    match self.peek_char() {
                        Some((_, 'n')) => {
                            self.advance();
                            value.push('\n');
                        }
                        Some((_, 't')) => {
                            self.advance();
                            value.push('\t');
                        }
                        Some((_, 'r')) => {
                            self.advance();
                            value.push('\r');
                        }
                        Some((_, 'b')) => {
                            self.advance();
                            value.push('\x08'); // backspace
                        }
                        Some((_, 'f')) => {
                            self.advance();
                            value.push('\x0C'); // form feed
                        }
                        Some((_, 'a')) => {
                            self.advance();
                            value.push('\x07'); // bell
                        }
                        Some((_, 'v')) => {
                            self.advance();
                            value.push('\x0B'); // vertical tab
                        }
                        Some((_, '\\')) => {
                            self.advance();
                            value.push('\\');
                        }
                        Some((_, '"')) => {
                            self.advance();
                            value.push('"');
                        }
                        Some((_, '/')) => {
                            self.advance();
                            value.push('/');
                        }
                        Some((_, 'x')) => {
                            self.advance();
                            let hex = self.read_hex_digits(2);
                            // Empty/invalid hex falls back to the literal
                            // text "\x" + whatever digits were read.
                            // NOTE(review): bytes 0x80..=0xFF map via
                            // `b as char` to Latin-1 code points, not raw
                            // bytes — confirm that is intended.
                            if let Some(ch) = u8::from_str_radix(&hex, 16).ok().map(|b| b as char) {
                                value.push(ch);
                            } else {
                                value.push_str("\\x");
                                value.push_str(&hex);
                            }
                        }
                        Some((_, c)) if c.is_ascii_digit() && c != '8' && c != '9' => {
                            // Octal escape. The first digit has NOT been
                            // consumed yet; read_octal_digits picks it up.
                            let octal = self.read_octal_digits(3);
                            // Values above 0o377 overflow u8, so the parse
                            // fails and the literal text "\NNN" is kept.
                            if let Some(ch) = u8::from_str_radix(&octal, 8).ok().map(|b| b as char)
                            {
                                value.push(ch);
                            } else {
                                value.push('\\');
                                value.push_str(&octal);
                            }
                        }
                        Some((_, c)) => {
                            // Unknown escape: keep the character, drop the
                            // backslash. This branch also lets "\<newline>"
                            // embed a raw newline in the value.
                            self.advance();
                            value.push(c);
                        }
                        None => {
                            return Err(Error::lexer("unterminated string", line, col));
                        }
                    }
                }
                Some((_, '\n')) => {
                    // A bare newline cannot appear inside a string literal.
                    return Err(Error::lexer(
                        "unterminated string (newline in string)",
                        line,
                        col,
                    ));
                }
                Some((_, ch)) => value.push(ch),
                None => {
                    return Err(Error::lexer("unterminated string", line, col));
                }
            }
        }

        Ok(Token::new(TokenKind::String(value), line, col))
    }
428
429 fn read_hex_digits(&mut self, max_count: usize) -> String {
430 let mut result = String::new();
431 for _ in 0..max_count {
432 match self.peek_char() {
433 Some((_, c)) if c.is_ascii_hexdigit() => {
434 self.advance();
435 result.push(c);
436 }
437 _ => break,
438 }
439 }
440 result
441 }
442
443 fn read_octal_digits(&mut self, max_count: usize) -> String {
444 let mut result = String::new();
445 for _ in 0..max_count {
446 match self.peek_char() {
447 Some((_, c)) if ('0'..='7').contains(&c) => {
448 self.advance();
449 result.push(c);
450 }
451 _ => break,
452 }
453 }
454 result
455 }
456
457 fn scan_regex(&mut self) -> Result<Token> {
458 let (line, col) = (self.line, self.column);
459 self.advance(); let mut pattern = String::new();
462
463 loop {
464 match self.advance() {
465 Some((_, '/')) => break,
466 Some((_, '\\')) => {
467 pattern.push('\\');
469 if let Some((_, ch)) = self.advance() {
470 pattern.push(ch);
471 } else {
472 return Err(Error::lexer("unterminated regex", line, col));
473 }
474 }
475 Some((_, '\n')) => {
476 return Err(Error::lexer(
477 "unterminated regex (newline in regex)",
478 line,
479 col,
480 ));
481 }
482 Some((_, ch)) => pattern.push(ch),
483 None => {
484 return Err(Error::lexer("unterminated regex", line, col));
485 }
486 }
487 }
488
489 Ok(Token::new(TokenKind::Regex(pattern), line, col))
490 }
491
492 fn scan_number(&mut self) -> Result<Token> {
493 let (line, col) = (self.line, self.column);
494 let start_pos = self.chars.peek().map(|(pos, _)| *pos).unwrap_or(0);
495 let mut end_pos = start_pos;
496
497 while let Some((pos, ch)) = self.peek_char() {
499 if ch.is_ascii_digit() {
500 end_pos = pos + 1;
501 self.advance();
502 } else {
503 break;
504 }
505 }
506
507 if self.peek_char_is('.') {
509 self.advance();
510 end_pos += 1;
511
512 while let Some((pos, ch)) = self.peek_char() {
513 if ch.is_ascii_digit() {
514 end_pos = pos + 1;
515 self.advance();
516 } else {
517 break;
518 }
519 }
520 }
521
522 if let Some((_, 'e' | 'E')) = self.peek_char() {
524 self.advance();
525 end_pos += 1;
526
527 if let Some((_, '+' | '-')) = self.peek_char() {
528 self.advance();
529 end_pos += 1;
530 }
531
532 while let Some((pos, ch)) = self.peek_char() {
533 if ch.is_ascii_digit() {
534 end_pos = pos + 1;
535 self.advance();
536 } else {
537 break;
538 }
539 }
540 }
541
542 let number_str = &self.source[start_pos..end_pos];
543 let value: f64 = number_str
544 .parse()
545 .map_err(|_| Error::lexer(format!("invalid number '{}'", number_str), line, col))?;
546
547 Ok(Token::new(TokenKind::Number(value), line, col))
548 }
549
550 fn scan_identifier(&mut self) -> Result<Token> {
551 let (line, col) = (self.line, self.column);
552 let start_pos = self.chars.peek().map(|(pos, _)| *pos).unwrap_or(0);
553 let mut end_pos = start_pos;
554
555 while let Some((pos, ch)) = self.peek_char() {
556 if ch.is_ascii_alphanumeric() || ch == '_' {
557 end_pos = pos + 1;
558 self.advance();
559 } else {
560 break;
561 }
562 }
563
564 let ident = &self.source[start_pos..end_pos];
565
566 let kind =
568 keyword_to_token(ident).unwrap_or_else(|| TokenKind::Identifier(ident.to_string()));
569
570 Ok(Token::new(kind, line, col))
571 }
572}
573
// End-to-end lexer tests. Most tests tokenize a small input and assert on
// token kinds by index; remember that Newline and Eof tokens occupy slots,
// which is why indices sometimes skip by more than one.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokens() {
        let mut lexer = Lexer::new("x + y - z * w / v % u");
        let tokens = lexer.tokenize().unwrap();

        // Operators sit at odd indices between the identifiers.
        assert!(matches!(tokens[1].kind, TokenKind::Plus));
        assert!(matches!(tokens[3].kind, TokenKind::Minus));
        assert!(matches!(tokens[5].kind, TokenKind::Star));
        assert!(matches!(tokens[7].kind, TokenKind::Slash));
        assert!(matches!(tokens[9].kind, TokenKind::Percent));
    }

    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("BEGIN END if else while for print");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].kind, TokenKind::Begin));
        assert!(matches!(tokens[1].kind, TokenKind::End));
        assert!(matches!(tokens[2].kind, TokenKind::If));
        assert!(matches!(tokens[3].kind, TokenKind::Else));
        assert!(matches!(tokens[4].kind, TokenKind::While));
        assert!(matches!(tokens[5].kind, TokenKind::For));
        assert!(matches!(tokens[6].kind, TokenKind::Print));
    }

    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 2.75 1e10 2.5e-3");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].kind, TokenKind::Number(n) if n == 42.0));
        assert!(matches!(tokens[1].kind, TokenKind::Number(n) if (n - 2.75).abs() < 0.001));
        assert!(matches!(tokens[2].kind, TokenKind::Number(n) if n == 1e10));
        assert!(matches!(tokens[3].kind, TokenKind::Number(n) if (n - 2.5e-3).abs() < 0.0001));
    }

    #[test]
    fn test_strings() {
        let mut lexer = Lexer::new(r#""hello" "world\n""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello"));
        assert!(matches!(&tokens[1].kind, TokenKind::String(s) if s == "world\n"));
    }

    #[test]
    fn test_regex_vs_division() {
        // '/' after an identifier is division...
        let mut lexer = Lexer::new("x / 2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Slash));

        // ...but at expression start it opens a regex literal.
        let mut lexer = Lexer::new("/pattern/");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Regex(s) if s == "pattern"));
    }

    #[test]
    fn test_line_tracking() {
        let mut lexer = Lexer::new("a\nb\nc");
        let tokens = lexer.tokenize().unwrap();

        // Newline tokens occupy the odd indices.
        assert_eq!(tokens[0].location.line, 1);
        assert_eq!(tokens[2].location.line, 2);
        assert_eq!(tokens[4].location.line, 3);
    }

    #[test]
    fn test_comparison_operators() {
        let mut lexer = Lexer::new("< <= > >= == !=");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Less));
        assert!(matches!(tokens[1].kind, TokenKind::LessEqual));
        assert!(matches!(tokens[2].kind, TokenKind::Greater));
        assert!(matches!(tokens[3].kind, TokenKind::GreaterEqual));
        assert!(matches!(tokens[4].kind, TokenKind::Equal));
        assert!(matches!(tokens[5].kind, TokenKind::NotEqual));
    }

    #[test]
    fn test_logical_operators() {
        let mut lexer = Lexer::new("&& || !");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::And));
        assert!(matches!(tokens[1].kind, TokenKind::Or));
        assert!(matches!(tokens[2].kind, TokenKind::Not));
    }

    #[test]
    fn test_regex_match_operators() {
        let mut lexer = Lexer::new("x ~ y x !~ y");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Match));
        assert!(matches!(tokens[4].kind, TokenKind::NotMatch));
    }

    #[test]
    fn test_assignment_operators() {
        let mut lexer = Lexer::new("x = 1 x += 1 x -= 1 x *= 1 x /= 1 x %= 1 x ^= 1");
        let tokens = lexer.tokenize().unwrap();
        // Each "x OP 1" group is three tokens, so operators land at 1, 4, 7...
        assert!(matches!(tokens[1].kind, TokenKind::Assign));
        assert!(matches!(tokens[4].kind, TokenKind::PlusAssign));
        assert!(matches!(tokens[7].kind, TokenKind::MinusAssign));
        assert!(matches!(tokens[10].kind, TokenKind::StarAssign));
        assert!(matches!(tokens[13].kind, TokenKind::SlashAssign));
        assert!(matches!(tokens[16].kind, TokenKind::PercentAssign));
        assert!(matches!(tokens[19].kind, TokenKind::CaretAssign));
    }

    #[test]
    fn test_increment_decrement() {
        let mut lexer = Lexer::new("++ --");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Increment));
        assert!(matches!(tokens[1].kind, TokenKind::Decrement));
    }

    #[test]
    fn test_delimiters() {
        let mut lexer = Lexer::new("( ) { } [ ] ; ,");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LeftParen));
        assert!(matches!(tokens[1].kind, TokenKind::RightParen));
        assert!(matches!(tokens[2].kind, TokenKind::LeftBrace));
        assert!(matches!(tokens[3].kind, TokenKind::RightBrace));
        assert!(matches!(tokens[4].kind, TokenKind::LeftBracket));
        assert!(matches!(tokens[5].kind, TokenKind::RightBracket));
        assert!(matches!(tokens[6].kind, TokenKind::Semicolon));
        assert!(matches!(tokens[7].kind, TokenKind::Comma));
    }

    #[test]
    fn test_special_operators() {
        let mut lexer = Lexer::new("$ ? : | >>");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Dollar));
        assert!(matches!(tokens[1].kind, TokenKind::Question));
        assert!(matches!(tokens[2].kind, TokenKind::Colon));
        assert!(matches!(tokens[3].kind, TokenKind::Pipe));
        assert!(matches!(tokens[4].kind, TokenKind::Append));
    }

    #[test]
    fn test_exponent() {
        let mut lexer = Lexer::new("x ^ 2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Caret));
    }

    #[test]
    fn test_comments() {
        // The comment is skipped but the newline that ends it is kept.
        let mut lexer = Lexer::new("x # this is a comment\ny");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Identifier(s) if s == "x"));
        assert!(matches!(tokens[1].kind, TokenKind::Newline));
        assert!(matches!(&tokens[2].kind, TokenKind::Identifier(s) if s == "y"));
    }

    #[test]
    fn test_line_continuation() {
        // Backslash-newline joins lines: no Newline token is emitted.
        let mut lexer = Lexer::new("x \\\ny");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Identifier(s) if s == "x"));
        assert!(matches!(&tokens[1].kind, TokenKind::Identifier(s) if s == "y"));
    }

    #[test]
    fn test_string_escapes() {
        let mut lexer = Lexer::new(r#""\t\r\n\b\f\a\v\\\"\/""#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::String(s) = &tokens[0].kind {
            assert!(s.contains('\t'));
            assert!(s.contains('\r'));
            assert!(s.contains('\n'));
            assert!(s.contains('\\'));
            assert!(s.contains('"'));
            assert!(s.contains('/'));
        } else {
            panic!("Expected string token");
        }
    }

    #[test]
    fn test_hex_escape() {
        let mut lexer = Lexer::new(r#""\x41\x42""#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::String(s) = &tokens[0].kind {
            assert_eq!(s, "AB");
        } else {
            panic!("Expected string token");
        }
    }

    #[test]
    fn test_octal_escape() {
        let mut lexer = Lexer::new(r#""\101\102""#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::String(s) = &tokens[0].kind {
            assert_eq!(s, "AB");
        } else {
            panic!("Expected string token");
        }
    }

    #[test]
    fn test_regex_with_escapes() {
        // Escaped '/' must not terminate the regex and keeps its backslash.
        let mut lexer = Lexer::new(r#"/a\/b/"#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Regex(s) = &tokens[0].kind {
            assert!(s.contains("\\/"));
        } else {
            panic!("Expected regex token");
        }
    }

    #[test]
    fn test_more_keywords() {
        let mut lexer = Lexer::new(
            "do break continue function return delete exit next nextfile getline printf in BEGINFILE ENDFILE",
        );
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Do));
        assert!(matches!(tokens[1].kind, TokenKind::Break));
        assert!(matches!(tokens[2].kind, TokenKind::Continue));
        assert!(matches!(tokens[3].kind, TokenKind::Function));
        assert!(matches!(tokens[4].kind, TokenKind::Return));
        assert!(matches!(tokens[5].kind, TokenKind::Delete));
        assert!(matches!(tokens[6].kind, TokenKind::Exit));
        assert!(matches!(tokens[7].kind, TokenKind::Next));
        assert!(matches!(tokens[8].kind, TokenKind::Nextfile));
        assert!(matches!(tokens[9].kind, TokenKind::Getline));
        assert!(matches!(tokens[10].kind, TokenKind::Printf));
        assert!(matches!(tokens[11].kind, TokenKind::In));
        assert!(matches!(tokens[12].kind, TokenKind::BeginFile));
        assert!(matches!(tokens[13].kind, TokenKind::EndFile));
    }

    #[test]
    fn test_number_with_exponent() {
        let mut lexer = Lexer::new("1e+5 1E-5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Number(n) if n == 1e5));
        assert!(matches!(tokens[1].kind, TokenKind::Number(n) if n == 1e-5));
    }

    #[test]
    fn test_decimal_starting_with_dot() {
        let mut lexer = Lexer::new(".5 .123");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Number(n) if (n - 0.5).abs() < 0.001));
        assert!(matches!(tokens[1].kind, TokenKind::Number(n) if (n - 0.123).abs() < 0.001));
    }

    #[test]
    fn test_unexpected_character_error() {
        let mut lexer = Lexer::new("@");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_unterminated_string_error() {
        let mut lexer = Lexer::new("\"unterminated");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_unterminated_regex_error() {
        let mut lexer = Lexer::new("/unterminated");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_string_with_newline_error() {
        let mut lexer = Lexer::new("\"hello\nworld\"");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_single_ampersand_error() {
        let mut lexer = Lexer::new("& ");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_eof_token() {
        // Empty input still yields a single Eof token.
        let mut lexer = Lexer::new("");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_regex_with_escapes_complete() {
        let mut lexer = Lexer::new(r#"/\d+\.\d*/"#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Regex(s) = &tokens[0].kind {
            assert!(s.contains(r"\d"));
        } else {
            panic!("Expected regex token");
        }
    }

    #[test]
    fn test_string_unknown_escape() {
        // Unknown escapes drop the backslash and keep the character.
        let mut lexer = Lexer::new(r#""\q""#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::String(s) = &tokens[0].kind {
            assert_eq!(s, "q");
        } else {
            panic!("Expected string token");
        }
    }

    #[test]
    fn test_number_leading_dot() {
        let mut lexer = Lexer::new(".123 .5e2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Number(n) if (n - 0.123).abs() < 0.001));
        assert!(matches!(tokens[1].kind, TokenKind::Number(n) if n == 50.0));
    }

    #[test]
    fn test_number_exponent_variations() {
        let mut lexer = Lexer::new("1e5 1E5 1e+5 1e-5 1.5e10");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Number(n) if n == 1e5));
        assert!(matches!(tokens[1].kind, TokenKind::Number(n) if n == 1e5));
        assert!(matches!(tokens[2].kind, TokenKind::Number(n) if n == 1e5));
        assert!(matches!(tokens[3].kind, TokenKind::Number(n) if n == 1e-5));
        assert!(matches!(tokens[4].kind, TokenKind::Number(n) if n == 1.5e10));
    }

    #[test]
    fn test_regex_after_comma() {
        // '(' and ',' do not produce values, so '/' after them starts a regex.
        let mut lexer = Lexer::new("gsub(/a/, /b/)");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[2].kind, TokenKind::Regex(s) if s == "a"));
        assert!(matches!(&tokens[4].kind, TokenKind::Regex(s) if s == "b"));
    }

    #[test]
    fn test_regex_after_operators() {
        // '~' and '&&' do not produce values either.
        let mut lexer = Lexer::new("x ~ /a/ && /b/");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[2].kind, TokenKind::Regex(s) if s == "a"));
        assert!(matches!(&tokens[4].kind, TokenKind::Regex(s) if s == "b"));
    }

    #[test]
    fn test_multiple_newlines() {
        let mut lexer = Lexer::new("a\n\n\nb");
        let tokens = lexer.tokenize().unwrap();
        let newline_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Newline))
            .count();
        assert!(newline_count >= 2);
    }

    #[test]
    fn test_comment_at_end() {
        // A comment with no trailing newline runs straight into Eof.
        let mut lexer = Lexer::new("x # comment at end");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Identifier(s) if s == "x"));
        assert!(matches!(tokens[1].kind, TokenKind::Eof));
    }

    #[test]
    fn test_identifier_with_underscore() {
        let mut lexer = Lexer::new("_var var_name my_func_2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Identifier(s) if s == "_var"));
        assert!(matches!(&tokens[1].kind, TokenKind::Identifier(s) if s == "var_name"));
        assert!(matches!(&tokens[2].kind, TokenKind::Identifier(s) if s == "my_func_2"));
    }

    #[test]
    fn test_string_with_all_escapes() {
        let mut lexer = Lexer::new(r#""\b\f""#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::String(s) = &tokens[0].kind {
            assert!(s.contains('\x08')); // backspace
            assert!(s.contains('\x0C')); // form feed
        } else {
            panic!("Expected string token");
        }
    }

    #[test]
    fn test_invalid_hex_escape() {
        // "\xGG" has no valid hex digits, so the literal text is kept.
        let mut lexer = Lexer::new(r#""\xGG""#);
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::String(s) = &tokens[0].kind {
            assert!(s.contains("GG") || s.contains("x"));
        } else {
            panic!("Expected string token");
        }
    }

    #[test]
    fn test_single_pipe() {
        let mut lexer = Lexer::new("a | b");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Pipe));
    }

    #[test]
    fn test_double_pipe() {
        let mut lexer = Lexer::new("a || b");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Or));
    }

    #[test]
    fn test_colon() {
        let mut lexer = Lexer::new("a ? b : c");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Question));
        assert!(matches!(tokens[3].kind, TokenKind::Colon));
    }

    #[test]
    fn test_caret() {
        let mut lexer = Lexer::new("x ^ 2 x ^= 3");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Caret));
        assert!(matches!(tokens[4].kind, TokenKind::CaretAssign));
    }

    #[test]
    fn test_all_assignment_types() {
        let mut lexer = Lexer::new("a += 1 b -= 1 c *= 1 d %= 1");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::PlusAssign));
        assert!(matches!(tokens[4].kind, TokenKind::MinusAssign));
        assert!(matches!(tokens[7].kind, TokenKind::StarAssign));
        assert!(matches!(tokens[10].kind, TokenKind::PercentAssign));
    }
}