1use crate::error::{ErrorKind, PerlError, PerlResult};
2use crate::token::{keyword_or_ident, Token};
3
/// Sentinel standing in for an escaped `\$` inside double-quoted strings.
/// U+E000 is a Unicode private-use code point, so it cannot collide with
/// real source text; a later interpolation pass can restore it as a literal
/// `$` instead of treating it as a variable sigil.
pub const LITERAL_DOLLAR_IN_DQUOTE: char = '\u{E000}';
7
8fn parse_unicode_name(name: &str) -> Option<char> {
10 if let Some(hex) = name.strip_prefix("U+") {
11 let val = u32::from_str_radix(hex, 16).ok()?;
12 char::from_u32(val)
13 } else {
14 unicode_names2::character(name)
15 }
16}
17
/// Characters accepted as trailing modifier flags after a `/.../` pattern
/// (match/substitution/transliteration flags handled by this lexer).
const REGEX_FLAG_CHARS: &str = "gimsxecor";
20
/// Hand-rolled lexer over Perl source text.
///
/// The input is stored as a `Vec<char>` so the scanner gets O(1) indexed
/// lookahead (`peek`/`peek_at`) without dealing with UTF-8 byte offsets.
pub struct Lexer {
    /// Entire source as individual chars.
    input: Vec<char>,
    /// Index of the next unread char in `input`.
    pos: usize,
    /// Current 1-based line number, used in error reporting.
    pub line: usize,
    /// Whether the previous token could end a term/expression; used to
    /// disambiguate context-sensitive tokens (`/` division vs. regex,
    /// `%` modulo vs. hash sigil, `<<` shift vs. heredoc).
    last_was_term: bool,
    /// File name attached to lexer errors (defaults to "-e").
    error_file: String,
    /// NOTE(review): public counter, presumably bumped by the parser to
    /// suppress `m//`-style lexing in certain contexts — usage is outside
    /// this chunk; confirm at call sites.
    pub suppress_m_regex: u32,
}
35
36impl Lexer {
    /// Convenience constructor for one-liner/eval-style input; errors are
    /// reported against the pseudo-filename "-e" (mirroring perl's -e switch).
    pub fn new(input: &str) -> Self {
        Self::new_with_file(input, "-e")
    }
40
41 pub fn new_with_file(input: &str, file: impl Into<String>) -> Self {
42 Self {
43 input: input.chars().collect(),
44 pos: 0,
45 line: 1,
46 last_was_term: false,
47 error_file: file.into(),
48 suppress_m_regex: 0,
49 }
50 }
51
    /// Wraps `message` in a `PerlError` of kind `Syntax`, tagged with the
    /// given line number and this lexer's source file name.
    fn syntax_err(&self, message: impl Into<String>, line: usize) -> PerlError {
        PerlError::new(ErrorKind::Syntax, message, line, self.error_file.clone())
    }
55
    /// Returns the next unread char without consuming it.
    fn peek(&self) -> Option<char> {
        self.input.get(self.pos).copied()
    }
59
    /// Returns the char `offset` positions ahead of the cursor (0 == `peek`)
    /// without consuming anything.
    fn peek_at(&self, offset: usize) -> Option<char> {
        self.input.get(self.pos + offset).copied()
    }
63
64 fn at_line_start_for_pod(&self, eq_pos: usize) -> bool {
67 let mut i = eq_pos;
68 while i > 0 {
69 i -= 1;
70 let c = self.input[i];
71 if c == '\n' {
72 return true;
73 }
74 if !c.is_whitespace() {
75 return false;
76 }
77 }
78 true
79 }
80
81 fn advance(&mut self) -> Option<char> {
82 let ch = self.input.get(self.pos).copied();
83 if let Some(c) = ch {
84 if c == '\n' {
85 self.line += 1;
86 }
87 self.pos += 1;
88 }
89 ch
90 }
91
92 fn skip_whitespace_and_comments(&mut self) {
93 while self.pos < self.input.len() {
94 let ch = self.input[self.pos];
95 if ch == '#' {
96 while self.pos < self.input.len() && self.input[self.pos] != '\n' {
98 self.pos += 1;
99 }
100 } else if ch == '\\' && self.peek_at(1) == Some('\n') {
101 self.pos += 2;
104 } else if ch.is_whitespace() {
105 if ch == '\n' {
106 self.line += 1;
107 }
108 self.pos += 1;
109 } else {
110 break;
111 }
112 }
113 }
114
115 fn skip_whitespace_only(&mut self) {
118 while self.pos < self.input.len() {
119 let ch = self.input[self.pos];
120 if ch.is_whitespace() {
121 if ch == '\n' {
122 self.line += 1;
123 }
124 self.pos += 1;
125 } else {
126 break;
127 }
128 }
129 }
130
131 fn read_while(&mut self, pred: impl Fn(char) -> bool) -> String {
132 let mut s = String::new();
133 while let Some(ch) = self.peek() {
134 if pred(ch) {
135 s.push(ch);
136 self.advance();
137 } else {
138 break;
139 }
140 }
141 s
142 }
143
    /// Lexes a numeric literal at the current position.
    ///
    /// Supports `0x`/`0X` hex, `0b`/`0B` binary, leading-zero octal, plain
    /// integers, and floats with a fractional part and/or exponent. `_`
    /// digit separators are accepted everywhere and stripped before parsing.
    /// Returns `Token::Integer` or `Token::Float`.
    fn read_number(&mut self) -> PerlResult<Token> {
        let start = self.pos; // span start, for re-slicing the raw text below
        let mut is_float = false;
        let mut is_hex = false;
        let mut is_oct = false;
        let mut is_bin = false;

        // Classify a leading-zero prefix (a bare "0" stays a plain integer).
        if self.peek() == Some('0') {
            match self.peek_at(1) {
                Some('x') | Some('X') => {
                    is_hex = true;
                    self.advance();
                    self.advance();
                }
                Some('b') | Some('B') => {
                    is_bin = true;
                    self.advance();
                    self.advance();
                }
                Some(c) if c.is_ascii_digit() => {
                    is_oct = true; // e.g. 0755; digits read in the common path below
                }
                _ => {}
            }
        }

        if is_hex {
            let digits = self.read_while(|c| c.is_ascii_hexdigit() || c == '_');
            let clean: String = digits.chars().filter(|&c| c != '_').collect();
            // An empty digit run ("0x") fails from_str_radix and errors here.
            let val = i64::from_str_radix(&clean, 16)
                .map_err(|_| self.syntax_err("Invalid hex literal", self.line))?;
            return Ok(Token::Integer(val));
        }
        if is_bin {
            let digits = self.read_while(|c| c == '0' || c == '1' || c == '_');
            let clean: String = digits.chars().filter(|&c| c != '_').collect();
            let val = i64::from_str_radix(&clean, 2)
                .map_err(|_| self.syntax_err("Invalid binary literal", self.line))?;
            return Ok(Token::Integer(val));
        }

        let _int_part = self.read_while(|c| c.is_ascii_digit() || c == '_');
        // A '.' only starts a fraction when a digit follows, so range
        // operators such as `1..10` are not swallowed into the number.
        if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
            is_float = true;
            self.advance();
            let _frac = self.read_while(|c| c.is_ascii_digit() || c == '_');
        }
        // Optional exponent with an optional sign forces float parsing.
        if let Some('e') | Some('E') = self.peek() {
            is_float = true;
            self.advance();
            if let Some('+') | Some('-') = self.peek() {
                self.advance();
            }
            let _exp = self.read_while(|c| c.is_ascii_digit() || c == '_');
        }

        // Re-slice the consumed span and strip separators before parsing.
        let raw: String = self.input[start..self.pos].iter().collect();
        let clean: String = raw.chars().filter(|&c| c != '_').collect();

        if is_float {
            let val: f64 = clean
                .parse()
                .map_err(|_| self.syntax_err("Invalid float literal", self.line))?;
            Ok(Token::Float(val))
        } else if is_oct && clean.starts_with('0') && clean.len() > 1 {
            // Leading-zero octal: drop the '0' and parse base 8 (digits 8/9
            // fail from_str_radix and surface as this error).
            let val = i64::from_str_radix(&clean[1..], 8)
                .map_err(|_| self.syntax_err("Invalid octal literal", self.line))?;
            Ok(Token::Integer(val))
        } else {
            let val: i64 = clean
                .parse()
                .map_err(|_| self.syntax_err("Invalid integer literal", self.line))?;
            Ok(Token::Integer(val))
        }
    }
221
222 fn read_single_quoted_string(&mut self) -> PerlResult<Token> {
223 self.advance(); let mut s = String::new();
225 loop {
226 match self.advance() {
227 Some('\\') => match self.peek() {
228 Some('\\') => {
229 s.push('\\');
230 self.advance();
231 }
232 Some('\'') => {
233 s.push('\'');
234 self.advance();
235 }
236 _ => s.push('\\'),
237 },
238 Some('\'') => break,
239 Some(c) => s.push(c),
240 None => return Err(self.syntax_err("Unterminated single-quoted string", self.line)),
241 }
242 }
243 Ok(Token::SingleString(s))
244 }
245
    /// Lexes a `"..."` literal (cursor still on the opening quote). Escape
    /// expansion is shared with backtick strings via `read_escaped_until`;
    /// variable interpolation is left for a later phase.
    fn read_double_quoted_string(&mut self) -> PerlResult<Token> {
        self.advance(); // consume the opening '"'
        let s = self.read_escaped_until('"')?;
        Ok(Token::DoubleString(s))
    }
251
    /// Reads chars until the unescaped terminator `term`, expanding
    /// double-quote-style backslash escapes along the way. The opening
    /// delimiter must already be consumed; the closing one is consumed here.
    ///
    /// `\$` is replaced by the `LITERAL_DOLLAR_IN_DQUOTE` sentinel so a
    /// later interpolation pass won't treat it as a variable sigil.
    /// Unrecognized escapes are kept verbatim as `\` + char — presumably for
    /// that later pass (NOTE(review): confirm against the interpolator).
    fn read_escaped_until(&mut self, term: char) -> PerlResult<String> {
        let mut s = String::new();
        loop {
            match self.advance() {
                Some('\\') => match self.advance() {
                    Some('n') => s.push('\n'),
                    Some('t') => s.push('\t'),
                    Some('r') => s.push('\r'),
                    Some('\\') => s.push('\\'),
                    // \NNN octal escape: first digit consumed, up to two more.
                    Some(c @ '0'..='7') => {
                        let mut oct = String::new();
                        oct.push(c);
                        for _ in 0..2 {
                            match self.peek() {
                                Some(d) if ('0'..='7').contains(&d) => {
                                    oct.push(self.advance().unwrap());
                                }
                                _ => break,
                            }
                        }
                        // 1-3 octal digits always fit a u32, hence the unwrap.
                        let val = u32::from_str_radix(&oct, 8).unwrap();
                        let ch = char::from_u32(val)
                            .ok_or_else(|| self.syntax_err("Invalid octal escape", self.line))?;
                        s.push(ch);
                    }
                    Some('a') => s.push('\x07'), // bell
                    Some('b') => s.push('\x08'), // backspace
                    Some('f') => s.push('\x0C'), // form feed
                    Some('e') => s.push('\x1B'), // escape
                    // Literal dollar: stash the sentinel (see const above).
                    Some('$') => s.push(LITERAL_DOLLAR_IN_DQUOTE),
                    // \cX control char, e.g. \cA -> 0x01.
                    // NOTE(review): a non-ASCII X is truncated by `as u8`
                    // before the XOR — confirm that's acceptable.
                    Some('c') => {
                        let ch = self
                            .advance()
                            .ok_or_else(|| self.syntax_err("Unterminated \\c escape", self.line))?;
                        s.push(char::from(ch.to_ascii_uppercase() as u8 ^ 0x40));
                    }
                    // \o{NNN}: braced octal code point.
                    Some('o') if self.peek() == Some('{') => {
                        self.advance(); // consume '{'
                        let oct = self.read_while(|c| c != '}');
                        if self.peek() != Some('}') {
                            return Err(
                                self.syntax_err("Unterminated \\o{...} in string", self.line)
                            );
                        }
                        self.advance(); // consume '}'
                        if oct.is_empty() {
                            return Err(self.syntax_err("Empty \\o{} in string", self.line));
                        }
                        let val = u32::from_str_radix(&oct, 8).map_err(|_| {
                            self.syntax_err("Invalid octal digits in \\o{...}", self.line)
                        })?;
                        let c = char::from_u32(val).ok_or_else(|| {
                            self.syntax_err("Invalid Unicode scalar value in \\o{...}", self.line)
                        })?;
                        s.push(c);
                    }
                    // \u{NNNN}: braced hex code point.
                    Some('u') if self.peek() == Some('{') => {
                        self.advance(); // consume '{'
                        let hex = self.read_while(|c| c != '}');
                        if self.peek() != Some('}') {
                            return Err(
                                self.syntax_err("Unterminated \\u{...} in string", self.line)
                            );
                        }
                        self.advance(); // consume '}'
                        if hex.is_empty() {
                            return Err(self.syntax_err("Empty \\u{} in string", self.line));
                        }
                        let val = u32::from_str_radix(&hex, 16).map_err(|_| {
                            self.syntax_err("Invalid hex digits in \\u{...}", self.line)
                        })?;
                        let c = char::from_u32(val).ok_or_else(|| {
                            self.syntax_err("Invalid Unicode scalar value in \\u{...}", self.line)
                        })?;
                        s.push(c);
                    }
                    // \N{NAME} or \N{U+XXXX}: named/numbered Unicode char.
                    Some('N') if self.peek() == Some('{') => {
                        self.advance(); // consume '{'
                        let name = self.read_while(|c| c != '}');
                        if self.peek() != Some('}') {
                            return Err(
                                self.syntax_err("Unterminated \\N{...} in string", self.line)
                            );
                        }
                        self.advance(); // consume '}'
                        if name.is_empty() {
                            return Err(self.syntax_err("Empty \\N{} in string", self.line));
                        }
                        let c = parse_unicode_name(&name).ok_or_else(|| {
                            self.syntax_err(
                                format!("Unknown Unicode character name: {name}"),
                                self.line,
                            )
                        })?;
                        s.push(c);
                    }
                    // \x{...} braced hex, or bare \xNN (0-2 hex digits).
                    Some('x') => {
                        if self.peek() == Some('{') {
                            self.advance(); // consume '{'
                            let hex = self.read_while(|c| c != '}');
                            if self.peek() != Some('}') {
                                return Err(
                                    self.syntax_err("Unterminated \\x{...} in string", self.line)
                                );
                            }
                            self.advance(); // consume '}'
                            if hex.is_empty() {
                                return Err(self.syntax_err("Empty \\x{} in string", self.line));
                            }
                            let val = u32::from_str_radix(&hex, 16).map_err(|_| {
                                self.syntax_err("Invalid hex digits in \\x{...}", self.line)
                            })?;
                            let c = char::from_u32(val).ok_or_else(|| {
                                self.syntax_err(
                                    "Invalid Unicode scalar value in \\x{...}",
                                    self.line,
                                )
                            })?;
                            s.push(c);
                        } else {
                            let mut hex = String::new();
                            for _ in 0..2 {
                                match self.peek() {
                                    Some(c) if c.is_ascii_hexdigit() => {
                                        hex.push(self.advance().unwrap());
                                    }
                                    _ => break,
                                }
                            }
                            if hex.is_empty() {
                                // Bare \x with no digits is NUL (Perl behavior).
                                s.push('\0');
                            } else if let Ok(val) = u32::from_str_radix(&hex, 16) {
                                if let Some(c) = char::from_u32(val) {
                                    s.push(c);
                                } else {
                                    return Err(self.syntax_err(
                                        "Invalid code point in \\x escape",
                                        self.line,
                                    ));
                                }
                            }
                        }
                    }
                    // Escaped terminator: keep the bare delimiter.
                    Some(c) if c == term => s.push(c),
                    // Anything else: keep the backslash sequence verbatim.
                    Some(c) => {
                        s.push('\\');
                        s.push(c);
                    }
                    None => return Err(self.syntax_err("Unterminated string", self.line)),
                },
                Some(c) if c == term => break, // unescaped terminator: done
                Some(c) => s.push(c),
                None => return Err(self.syntax_err("Unterminated string", self.line)),
            }
        }
        Ok(s)
    }
411
412 fn read_q_qq_balanced_body(
415 &mut self,
416 open: char,
417 close: char,
418 is_qq: bool,
419 ) -> PerlResult<String> {
420 let mut s = String::new();
421 let mut depth: usize = 1;
422 loop {
423 match self.peek() {
424 Some('\\') => {
425 self.advance();
426 if is_qq {
427 match self.advance() {
428 Some('n') => s.push('\n'),
429 Some('t') => s.push('\t'),
430 Some('r') => s.push('\r'),
431 Some('\\') => s.push('\\'),
432 Some(c @ '0'..='7') => {
433 let mut oct = String::new();
434 oct.push(c);
435 for _ in 0..2 {
436 match self.peek() {
437 Some(d) if ('0'..='7').contains(&d) => {
438 oct.push(self.advance().unwrap());
439 }
440 _ => break,
441 }
442 }
443 let val = u32::from_str_radix(&oct, 8).unwrap();
444 let ch = char::from_u32(val).ok_or_else(|| {
445 self.syntax_err("Invalid octal escape", self.line)
446 })?;
447 s.push(ch);
448 }
449 Some('a') => s.push('\x07'),
450 Some('b') => s.push('\x08'),
451 Some('f') => s.push('\x0C'),
452 Some('e') => s.push('\x1B'),
453 Some('$') => s.push(LITERAL_DOLLAR_IN_DQUOTE),
454 Some('c') => {
455 let ch = self.advance().ok_or_else(|| {
456 self.syntax_err("Unterminated \\c escape", self.line)
457 })?;
458 s.push(char::from(ch.to_ascii_uppercase() as u8 ^ 0x40));
459 }
460 Some('o') if self.peek() == Some('{') => {
461 self.advance();
462 let oct = self.read_while(|c| c != '}');
463 if self.peek() != Some('}') {
464 return Err(self.syntax_err(
465 "Unterminated \\o{...} in qq string",
466 self.line,
467 ));
468 }
469 self.advance();
470 if oct.is_empty() {
471 return Err(
472 self.syntax_err("Empty \\o{} in qq string", self.line)
473 );
474 }
475 let val = u32::from_str_radix(&oct, 8).map_err(|_| {
476 self.syntax_err("Invalid octal digits in \\o{...}", self.line)
477 })?;
478 let c = char::from_u32(val).ok_or_else(|| {
479 self.syntax_err(
480 "Invalid Unicode scalar value in \\o{...}",
481 self.line,
482 )
483 })?;
484 s.push(c);
485 }
486 Some('u') if self.peek() == Some('{') => {
487 self.advance();
488 let hex = self.read_while(|c| c != '}');
489 if self.peek() != Some('}') {
490 return Err(self.syntax_err(
491 "Unterminated \\u{...} in qq string",
492 self.line,
493 ));
494 }
495 self.advance();
496 if hex.is_empty() {
497 return Err(
498 self.syntax_err("Empty \\u{} in qq string", self.line)
499 );
500 }
501 let val = u32::from_str_radix(&hex, 16).map_err(|_| {
502 self.syntax_err("Invalid hex digits in \\u{...}", self.line)
503 })?;
504 let c = char::from_u32(val).ok_or_else(|| {
505 self.syntax_err(
506 "Invalid Unicode scalar value in \\u{...}",
507 self.line,
508 )
509 })?;
510 s.push(c);
511 }
512 Some('N') if self.peek() == Some('{') => {
513 self.advance();
514 let name = self.read_while(|c| c != '}');
515 if self.peek() != Some('}') {
516 return Err(self.syntax_err(
517 "Unterminated \\N{...} in qq string",
518 self.line,
519 ));
520 }
521 self.advance();
522 if name.is_empty() {
523 return Err(
524 self.syntax_err("Empty \\N{} in qq string", self.line)
525 );
526 }
527 let c = parse_unicode_name(&name).ok_or_else(|| {
528 self.syntax_err(
529 format!("Unknown Unicode character name: {name}"),
530 self.line,
531 )
532 })?;
533 s.push(c);
534 }
535 Some('x') => {
536 if self.peek() == Some('{') {
537 self.advance();
538 let hex = self.read_while(|c| c != '}');
539 if self.peek() != Some('}') {
540 return Err(self.syntax_err(
541 "Unterminated \\x{...} in qq string",
542 self.line,
543 ));
544 }
545 self.advance();
546 if hex.is_empty() {
547 return Err(
548 self.syntax_err("Empty \\x{} in qq string", self.line)
549 );
550 }
551 let val = u32::from_str_radix(&hex, 16).map_err(|_| {
552 self.syntax_err("Invalid hex digits in \\x{...}", self.line)
553 })?;
554 let c = char::from_u32(val).ok_or_else(|| {
555 self.syntax_err(
556 "Invalid Unicode scalar value in \\x{...}",
557 self.line,
558 )
559 })?;
560 s.push(c);
561 } else {
562 let mut hex = String::new();
563 for _ in 0..2 {
564 match self.peek() {
565 Some(c) if c.is_ascii_hexdigit() => {
566 hex.push(self.advance().unwrap());
567 }
568 _ => break,
569 }
570 }
571 if hex.is_empty() {
572 s.push('\0');
573 } else if let Ok(val) = u32::from_str_radix(&hex, 16) {
574 if let Some(c) = char::from_u32(val) {
575 s.push(c);
576 } else {
577 return Err(self.syntax_err(
578 "Invalid code point in \\x escape",
579 self.line,
580 ));
581 }
582 }
583 }
584 }
585 Some(c) if c == close && depth == 1 => s.push(close),
586 Some(c) => {
587 s.push('\\');
588 s.push(c);
589 }
590 None => {
591 return Err(
592 self.syntax_err("Unterminated qq(...) string", self.line)
593 );
594 }
595 }
596 } else {
597 match self.advance() {
598 Some(c) if c == close && depth == 1 => s.push(close),
599 Some(c) => {
600 s.push('\\');
601 s.push(c);
602 }
603 None => {
604 return Err(
605 self.syntax_err("Unterminated q(...) string", self.line)
606 );
607 }
608 }
609 }
610 }
611 Some(c) if c == open => {
612 self.advance();
613 depth += 1;
614 s.push(open);
615 }
616 Some(c) if c == close => {
617 self.advance();
618 if depth == 1 {
619 break;
620 }
621 depth -= 1;
622 s.push(close);
623 }
624 Some(c) => {
625 self.advance();
626 s.push(c);
627 }
628 None => {
629 return Err(self.syntax_err("Unterminated q/qq bracketed string", self.line));
630 }
631 }
632 }
633 Ok(s)
634 }
635
636 fn read_regex(&mut self) -> PerlResult<Token> {
637 self.advance(); let mut pattern = String::new();
639 loop {
640 match self.advance() {
641 Some('\\') => {
642 pattern.push('\\');
643 if let Some(c) = self.advance() {
644 pattern.push(c);
645 }
646 }
647 Some('/') => break,
648 Some(c) => pattern.push(c),
649 None => return Err(self.syntax_err("Unterminated regex", self.line)),
650 }
651 }
652 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
653 Ok(Token::Regex(pattern, flags, '/'))
654 }
655
    /// Lexes a `qw` word list: `qw( a b c )`, `qw[...]`, `qw{...}`, `qw<...>`
    /// or any single-char delimiter (e.g. `qw/a b/`). Returns `Token::QW`
    /// with the whitespace-separated words.
    ///
    /// Bracketing delimiters allow balanced nested pairs inside words; with a
    /// non-bracket delimiter, words simply cannot contain the delimiter
    /// (no escaping is supported in either form).
    fn read_qw(&mut self) -> PerlResult<Token> {
        self.skip_whitespace_only();
        let open = self
            .advance()
            .ok_or_else(|| self.syntax_err("Expected delimiter after qw", self.line))?;
        let close = match open {
            '(' => ')',
            '[' => ']',
            '{' => '}',
            '<' => '>',
            c => c, // non-bracket delimiter closes itself
        };
        let mut words = Vec::new();
        if matches!(open, '(' | '[' | '{' | '<') {
            // Bracketed form: track nesting so words may contain balanced
            // open/close pairs of the same bracket kind.
            let mut depth: usize = 1;
            let mut buf = String::new();
            loop {
                match self.peek() {
                    None => {
                        return Err(self.syntax_err("Unterminated qw()", self.line));
                    }
                    // Top-level whitespace ends the current word.
                    Some(c) if depth == 1 && c.is_whitespace() => {
                        self.advance();
                        if !buf.is_empty() {
                            words.push(buf.clone());
                            buf.clear();
                        }
                        while self.peek().is_some_and(|c| c.is_whitespace()) {
                            self.advance();
                        }
                    }
                    // Top-level closer terminates the list.
                    Some(c) if c == close && depth == 1 => {
                        self.advance();
                        if !buf.is_empty() {
                            words.push(buf);
                        }
                        break;
                    }
                    Some(c) if c == open => {
                        depth += 1;
                        buf.push(self.advance().unwrap());
                    }
                    Some(c) if c == close => {
                        // Only reachable at depth >= 2 (depth-1 case above).
                        debug_assert!(depth >= 2);
                        depth -= 1;
                        buf.push(self.advance().unwrap());
                    }
                    Some(_) => {
                        buf.push(self.advance().unwrap());
                    }
                }
            }
            return Ok(Token::QW(words));
        }
        // Non-bracket delimiter: words are runs of non-space, non-delimiter
        // chars; the first bare delimiter ends the list.
        loop {
            while let Some(ch) = self.peek() {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }
            if self.peek() == Some(close) {
                self.advance();
                break;
            }
            if self.peek().is_none() {
                return Err(self.syntax_err("Unterminated qw()", self.line));
            }
            let word = self.read_while(|c| !c.is_whitespace() && c != close);
            if !word.is_empty() {
                words.push(word);
            }
        }
        Ok(Token::QW(words))
    }
737
    /// Reads the heredoc tag immediately after `<<`.
    /// Returns `(tag, interpolate, indented)` — see `read_heredoc_tag_inner`.
    fn read_heredoc_tag(&mut self) -> PerlResult<(String, bool, bool)> {
        self.read_heredoc_tag_inner(false)
    }
741
742 fn read_heredoc_tag_inner(&mut self, indented: bool) -> PerlResult<(String, bool, bool)> {
743 let quoted;
746 let tag;
747 match self.peek() {
748 Some('\'') => {
749 self.advance();
750 tag = self.read_while(|c| c != '\'');
751 self.advance(); quoted = false; }
754 Some('"') => {
755 self.advance();
756 tag = self.read_while(|c| c != '"');
757 self.advance();
758 quoted = true;
759 }
760 Some('~') => {
761 self.advance(); return self.read_heredoc_tag_inner(true); }
764 _ => {
765 tag = self.read_while(|c| c.is_alphanumeric() || c == '_');
766 quoted = true;
767 }
768 }
769 Ok((tag, quoted, indented))
770 }
771
    /// Collects heredoc body lines until a line whose trimmed content equals
    /// `tag`, returning the body with a trailing `\n` per line. With
    /// `indented` (`<<~TAG`), the terminator line's leading-whitespace width
    /// is stripped from the start of every body line.
    ///
    /// NOTE(review): the remainder of the line containing `<<TAG` is skipped
    /// outright, so any tokens after the tag on that line are discarded —
    /// confirm this single-heredoc-per-line limitation is intended.
    fn read_heredoc_body(&mut self, tag: &str, indented: bool) -> PerlResult<String> {
        let mut lines: Vec<String> = Vec::new();
        // Skip the rest of the current line; the body starts on the next one.
        while let Some(ch) = self.peek() {
            if ch == '\n' {
                self.advance();
                break;
            }
            self.advance();
        }
        let mut terminator_indent: Option<usize> = None;
        loop {
            let _line_start = self.pos;
            let line = self.read_while(|c| c != '\n');
            if line.trim() == tag {
                if indented {
                    // Leading-whitespace width of the terminator, in bytes.
                    terminator_indent = Some(line.len() - line.trim_start().len());
                }
                break;
            }
            lines.push(line);
            if self.peek() == Some('\n') {
                self.advance();
            } else if self.pos >= self.input.len() {
                return Err(self.syntax_err(
                    format!("Unterminated heredoc (looking for '{tag}')"),
                    self.line,
                ));
            }
        }
        // Consume the newline after the terminator line, if present.
        if self.peek() == Some('\n') {
            self.advance();
        }
        if indented {
            let strip = terminator_indent.unwrap_or(0);
            let mut body = String::new();
            for line in lines {
                // Strip at most the line's own leading whitespace.
                // NOTE(review): these are byte offsets; slicing could panic
                // on multibyte Unicode whitespace (e.g. U+00A0) — fine for
                // ASCII indents, confirm that's the only expected case.
                let ws_count = line.len() - line.trim_start().len();
                let to_strip = ws_count.min(strip);
                body.push_str(&line[to_strip..]);
                body.push('\n');
            }
            Ok(body)
        } else {
            let mut body = String::new();
            for line in lines {
                body.push_str(&line);
                body.push('\n');
            }
            Ok(body)
        }
    }
830
    /// Reads a simple identifier: a run of alphanumerics and underscores
    /// (possibly empty if the cursor isn't on such a char).
    fn read_identifier(&mut self) -> String {
        self.read_while(|c| c.is_alphanumeric() || c == '_')
    }
834
835 fn read_package_qualified_identifier(&mut self) -> String {
837 let mut s = self.read_identifier();
838 while self.peek() == Some(':') && self.input.get(self.pos + 1) == Some(&':') {
839 self.advance();
840 self.advance();
841 s.push_str("::");
842 s.push_str(&self.read_identifier());
843 }
844 s
845 }
846
    /// Reads the body of a `format` declaration: everything after the
    /// current line, up to (and consuming) a line containing only `.`.
    /// Returns the body lines without their line endings; handles both
    /// `\n` and `\r\n` (and a bare `\r`) terminators.
    fn read_format_body(&mut self) -> PerlResult<Vec<String>> {
        // Skip trailing blanks after `format NAME =`, then its newline.
        while self.peek().is_some_and(|c| c == ' ' || c == '\t') {
            self.advance();
        }
        if self.peek() == Some('\n') {
            self.advance();
        }
        let mut lines = Vec::new();
        loop {
            // Collect one physical line, consuming its line ending.
            let mut line = String::new();
            while let Some(c) = self.peek() {
                if c == '\n' {
                    self.advance();
                    break;
                }
                if c == '\r' {
                    self.advance();
                    if self.peek() == Some('\n') {
                        self.advance();
                    }
                    break;
                }
                line.push(c);
                self.advance();
            }
            // A lone '.' (ignoring surrounding whitespace) ends the format.
            if line.trim() == "." {
                break;
            }
            lines.push(line);
            if self.peek().is_none() {
                return Err(self.syntax_err(
                    "Unterminated format (expected '.' on its own line before end of file)",
                    self.line,
                ));
            }
        }
        Ok(lines)
    }
886
    /// Reads the name part of a variable, with the sigil already consumed by
    /// the caller. Handles package-qualified names, `::`-rooted names,
    /// `^X` control-char variables, `${...}` braced names, `$#array`, and
    /// Perl's one-char punctuation variables. Returns an empty string when
    /// no valid name starts here.
    fn read_variable_name(&mut self) -> String {
        match self.peek() {
            // `$$_{` — consume the '$' and '_' and report the name "_",
            // leaving '{' for the caller.
            // NOTE(review): this makes `$$_{...}` lex like `$_{...}`; confirm
            // the parser compensates for the extra deref.
            Some('$')
                if self.input.get(self.pos + 1) == Some(&'_')
                    && self.input.get(self.pos + 2) == Some(&'{') =>
            {
                self.advance();
                self.advance();
                "_".to_string()
            }
            // `::`-rooted (main-package) name: `$::foo`, `$::Foo::bar`.
            Some(':') if self.input.get(self.pos + 1) == Some(&':') => {
                self.advance();
                self.advance();
                let mut s = "::".to_string();
                if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
                    s.push_str(&self.read_identifier());
                }
                while self.peek() == Some(':') && self.input.get(self.pos + 1) == Some(&':') {
                    self.advance();
                    self.advance();
                    s.push_str("::");
                    s.push_str(&self.read_identifier());
                }
                s
            }
            // Ordinary (possibly package-qualified) name.
            Some(c) if c.is_alphabetic() || c == '_' => {
                let ident = self.read_package_qualified_identifier();
                if ident == "_" {
                    // Special-case "_<..." style names (e.g. debugger file
                    // variables): fold any following '<' chars into the name.
                    let mut lts = String::new();
                    while self.peek() == Some('<') {
                        self.advance();
                        lts.push('<');
                    }
                    if !lts.is_empty() {
                        return format!("_{}", lts);
                    }
                }
                ident
            }
            // Control-char variables like $^W -> name "^W".
            Some('^') => {
                self.advance();
                if self.peek().is_some_and(|c| c.is_alphabetic()) {
                    let c2 = self.advance().unwrap();
                    format!("^{}", c2)
                } else {
                    "^".to_string()
                }
            }
            // Braced form ${name}: the raw brace contents become the name.
            Some('{') => {
                self.advance();
                let name = self.read_while(|c| c != '}');
                if self.peek() == Some('}') {
                    self.advance();
                }
                name
            }
            // `$#` — last-index sigil: `$#array` yields "#array", bare `$#`
            // yields "#".
            Some('#') => {
                self.advance();
                if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
                    let mut name = String::from("#");
                    name.push_str(&self.read_package_qualified_identifier());
                    name
                } else {
                    "#".to_string()
                }
            }
            // One-char punctuation variables ($!, $@, $0-$9, $_, $-, ...).
            Some(c) if "!@$&*+;',\"\\|?/<>.0123456789~%-=()[]{}".contains(c) => {
                self.advance();
                c.to_string()
            }
            _ => String::new(),
        }
    }
967
968 fn braced_body_symbolic_scalar_deref_name(body: &str) -> Option<&str> {
972 let body = body.trim();
973 let rest = body.strip_prefix('$')?;
974 if rest.is_empty() {
975 return None;
976 }
977 let mut chars = rest.chars();
978 let c0 = chars.next()?;
979 if !(c0.is_alphabetic() || c0 == '_') {
980 return None;
981 }
982 for c in chars {
983 if !(c.is_alphanumeric() || c == '_' || c == ':') {
984 return None;
985 }
986 }
987 Some(rest)
988 }
989
990 pub fn next_token(&mut self) -> PerlResult<Token> {
991 self.skip_whitespace_and_comments();
992
993 if self.pos >= self.input.len() {
994 return Ok(Token::Eof);
995 }
996
997 let ch = self.input[self.pos];
998 match ch {
999 '$' => {
1001 self.advance();
1002 if self.peek() == Some('$') {
1004 let is_dollar_under_brace = self.input.get(self.pos + 1) == Some(&'_')
1006 && self.input.get(self.pos + 2) == Some(&'{');
1007 if !is_dollar_under_brace {
1008 self.advance();
1009 if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
1010 let name = self.read_identifier();
1011 self.last_was_term = true;
1012 return Ok(Token::DerefScalarVar(name));
1013 }
1014 self.last_was_term = true;
1016 return Ok(Token::ScalarVar("$$".to_string()));
1017 }
1018 }
1019 let name = self.read_variable_name();
1020 if name.is_empty() {
1021 return Err(self.syntax_err("Expected variable name after $", self.line));
1022 }
1023 self.last_was_term = true;
1024 if let Some(tail) = Self::braced_body_symbolic_scalar_deref_name(&name) {
1025 return Ok(Token::DerefScalarVar(tail.to_string()));
1026 }
1027 Ok(Token::ScalarVar(name))
1028 }
1029 '@' => {
1030 self.advance();
1031 if self.peek() == Some('-') {
1032 self.advance();
1033 self.last_was_term = true;
1034 return Ok(Token::ArrayVar("-".to_string()));
1035 }
1036 if self.peek() == Some('+') {
1037 self.advance();
1038 self.last_was_term = true;
1039 return Ok(Token::ArrayVar("+".to_string()));
1040 }
1041 if self.peek() == Some('^')
1042 && self
1043 .input
1044 .get(self.pos + 1)
1045 .is_some_and(|c| c.is_alphabetic() || *c == '_')
1046 {
1047 self.advance();
1048 let name = format!("^{}", self.read_package_qualified_identifier());
1049 self.last_was_term = true;
1050 return Ok(Token::ArrayVar(name));
1051 }
1052 if self.peek() == Some('_') || self.peek().is_some_and(|c| c.is_alphabetic()) {
1053 let name = self.read_package_qualified_identifier();
1054 self.last_was_term = true;
1055 return Ok(Token::ArrayVar(name));
1056 }
1057 self.last_was_term = false;
1058 Ok(Token::ArrayAt)
1059 }
1060 '%' if !self.last_was_term => {
1061 self.advance();
1062 if self.peek() == Some('+') {
1064 self.advance();
1065 self.last_was_term = true;
1066 return Ok(Token::HashVar("+".to_string()));
1067 }
1068 if self.peek() == Some('^')
1069 && self
1070 .input
1071 .get(self.pos + 1)
1072 .is_some_and(|c| c.is_alphabetic() || *c == '_')
1073 {
1074 self.advance();
1075 let name = format!("^{}", self.read_package_qualified_identifier());
1076 self.last_was_term = true;
1077 return Ok(Token::HashVar(name));
1078 }
1079 if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
1080 let name = self.read_package_qualified_identifier();
1081 self.last_was_term = true;
1082 return Ok(Token::HashVar(name));
1083 }
1084 self.last_was_term = false;
1085 Ok(Token::HashPercent)
1086 }
1087
1088 '0'..='9' => {
1090 let tok = self.read_number()?;
1091 self.last_was_term = true;
1092 Ok(tok)
1093 }
1094
1095 '\'' => {
1097 let tok = self.read_single_quoted_string()?;
1098 self.last_was_term = true;
1099 Ok(tok)
1100 }
1101 '"' => {
1102 let tok = self.read_double_quoted_string()?;
1103 self.last_was_term = true;
1104 Ok(tok)
1105 }
1106
1107 '`' => {
1109 self.advance();
1110 let cmd = self.read_escaped_until('`')?;
1111 self.last_was_term = true;
1112 Ok(Token::BacktickString(cmd))
1113 }
1114
1115 '/' => {
1117 if !self.last_was_term {
1118 let tok = self.read_regex()?;
1119 self.last_was_term = true;
1120 return Ok(tok);
1121 }
1122 self.advance();
1123 if self.peek() == Some('=') {
1124 self.advance();
1125 self.last_was_term = false;
1126 return Ok(Token::DivAssign);
1127 }
1128 if self.peek() == Some('/') {
1129 self.advance();
1130 if self.peek() == Some('=') {
1131 self.advance();
1132 self.last_was_term = false;
1133 return Ok(Token::DefinedOrAssign);
1134 }
1135 self.last_was_term = false;
1136 return Ok(Token::DefinedOr);
1137 }
1138 self.last_was_term = false;
1139 Ok(Token::Slash)
1140 }
1141
1142 '+' => {
1144 self.advance();
1145 if self.peek() == Some('+') {
1146 self.advance();
1147 return Ok(Token::Increment);
1149 }
1150 if self.peek() == Some('=') {
1151 self.advance();
1152 self.last_was_term = false;
1153 return Ok(Token::PlusAssign);
1154 }
1155 self.last_was_term = false;
1156 Ok(Token::Plus)
1157 }
1158 '-' => {
1159 self.advance();
1160 if !self.last_was_term {
1162 if let Some(c) = self.peek() {
1163 if "efdlpSszrwxoRWXOBCTMAgut".contains(c)
1164 && self.peek_at(1).is_none_or(|n| {
1165 n.is_whitespace()
1166 || n == '$'
1167 || n == '\''
1168 || n == '"'
1169 || n == '('
1170 || n == ')'
1171 || n == '}'
1172 || n == ';'
1173 || n == ','
1174 })
1175 {
1176 self.advance();
1177 self.last_was_term = false;
1178 return Ok(Token::FileTest(c));
1179 }
1180 }
1181 }
1182 if self.peek() == Some('-') {
1183 self.advance();
1184 return Ok(Token::Decrement);
1185 }
1186 if self.peek() == Some('=') {
1187 self.advance();
1188 self.last_was_term = false;
1189 return Ok(Token::MinusAssign);
1190 }
1191 if self.peek() == Some('>') {
1192 self.advance();
1193 if self.peek() == Some('>') {
1194 self.advance();
1195 self.last_was_term = false;
1196 return Ok(Token::ThreadArrowLast);
1197 }
1198 self.last_was_term = false;
1199 return Ok(Token::Arrow);
1200 }
1201 self.last_was_term = false;
1202 Ok(Token::Minus)
1203 }
1204 '*' => {
1205 self.advance();
1206 if self.peek() == Some('*') {
1207 self.advance();
1208 if self.peek() == Some('=') {
1209 self.advance();
1210 self.last_was_term = false;
1211 return Ok(Token::PowAssign);
1212 }
1213 self.last_was_term = false;
1214 return Ok(Token::Power);
1215 }
1216 if self.peek() == Some('=') {
1217 self.advance();
1218 self.last_was_term = false;
1219 return Ok(Token::MulAssign);
1220 }
1221 self.last_was_term = false;
1222 Ok(Token::Star)
1223 }
1224 '%' => {
1225 self.advance();
1227 if self.peek() == Some('=') {
1228 self.advance();
1229 self.last_was_term = false;
1230 return Ok(Token::ModAssign);
1231 }
1232 self.last_was_term = false;
1233 Ok(Token::Percent)
1234 }
1235 '.' => {
1236 self.advance();
1237 if self.peek() == Some('.') {
1238 self.advance();
1239 if self.peek() == Some('.') {
1240 self.advance();
1241 self.last_was_term = false;
1242 return Ok(Token::RangeExclusive);
1243 }
1244 self.last_was_term = false;
1245 return Ok(Token::Range);
1246 }
1247 if self.peek() == Some('=') {
1248 self.advance();
1249 self.last_was_term = false;
1250 return Ok(Token::DotAssign);
1251 }
1252 self.last_was_term = false;
1253 Ok(Token::Dot)
1254 }
1255 '=' => {
1256 let eq_pos = self.pos;
1257 self.advance();
1258 if self.peek() == Some('=') {
1259 self.advance();
1260 self.last_was_term = false;
1261 return Ok(Token::NumEq);
1262 }
1263 if self.peek() == Some('~') {
1264 self.advance();
1265 self.last_was_term = false;
1266 return Ok(Token::BindMatch);
1267 }
1268 if self.peek() == Some('>') {
1269 self.advance();
1270 self.last_was_term = false;
1271 return Ok(Token::FatArrow);
1272 }
1273 if self.peek().is_some_and(|c| c.is_alphabetic())
1275 && self.at_line_start_for_pod(eq_pos)
1276 {
1277 loop {
1279 let line = self.read_while(|c| c != '\n');
1280 if self.peek() == Some('\n') {
1281 self.advance();
1282 }
1283 if line.starts_with("=cut") || self.pos >= self.input.len() {
1284 break;
1285 }
1286 }
1287 return self.next_token();
1288 }
1289 self.last_was_term = false;
1290 Ok(Token::Assign)
1291 }
1292 '!' => {
1293 self.advance();
1294 if self.peek() == Some('=') {
1295 self.advance();
1296 self.last_was_term = false;
1297 return Ok(Token::NumNe);
1298 }
1299 if self.peek() == Some('~') {
1300 self.advance();
1301 self.last_was_term = false;
1302 return Ok(Token::BindNotMatch);
1303 }
1304 self.last_was_term = false;
1305 Ok(Token::LogNot)
1306 }
1307 '<' => {
1308 self.advance();
1309 let after_lt = self.pos;
1310 if self.peek() == Some('$') {
1312 self.advance();
1313 let name = self.read_variable_name();
1314 if !name.is_empty() && self.peek() == Some('>') {
1315 self.advance();
1316 self.last_was_term = true;
1317 return Ok(Token::ReadLine(name));
1318 }
1319 self.pos = after_lt;
1320 }
1321 if self.peek() == Some('>') {
1323 self.advance();
1324 self.last_was_term = true;
1325 return Ok(Token::Diamond);
1326 }
1327 if self.peek().is_some_and(|c| c.is_uppercase()) {
1328 let name = self.read_identifier();
1329 if self.peek() == Some('>') {
1330 self.advance();
1331 self.last_was_term = true;
1332 return Ok(Token::ReadLine(name));
1333 }
1334 self.last_was_term = false;
1337 return Ok(Token::NumLt);
1338 }
1339 if self.peek() == Some('=') {
1340 self.advance();
1341 if self.peek() == Some('>') {
1342 self.advance();
1343 self.last_was_term = false;
1344 return Ok(Token::Spaceship);
1345 }
1346 self.last_was_term = false;
1347 return Ok(Token::NumLe);
1348 }
1349 if self.peek() == Some('<') {
1350 self.advance();
1351 if self.peek() == Some('=') {
1352 self.advance();
1353 self.last_was_term = false;
1354 return Ok(Token::ShiftLeftAssign);
1355 }
1356 if self.last_was_term {
1359 self.last_was_term = false;
1360 return Ok(Token::ShiftLeft);
1361 }
1362 let (tag, interpolate, indented) = self.read_heredoc_tag()?;
1363 let body = self.read_heredoc_body(&tag, indented)?;
1364 self.last_was_term = true;
1365 return Ok(Token::HereDoc(tag, body, interpolate));
1366 }
1367 self.last_was_term = false;
1368 Ok(Token::NumLt)
1369 }
1370 '>' => {
1371 self.advance();
1372 if self.peek() == Some('{') {
1373 self.advance();
1374 self.last_was_term = false;
1375 return Ok(Token::ArrowBrace);
1376 }
1377 if self.peek() == Some('=') {
1378 self.advance();
1379 self.last_was_term = false;
1380 return Ok(Token::NumGe);
1381 }
1382 if self.peek() == Some('>') {
1383 self.advance();
1384 if self.peek() == Some('=') {
1385 self.advance();
1386 self.last_was_term = false;
1387 return Ok(Token::ShiftRightAssign);
1388 }
1389 self.last_was_term = false;
1390 return Ok(Token::ShiftRight);
1391 }
1392 self.last_was_term = false;
1393 Ok(Token::NumGt)
1394 }
1395 '&' => {
1396 self.advance();
1397 if self.peek() == Some('&') {
1398 self.advance();
1399 if self.peek() == Some('=') {
1400 self.advance();
1401 self.last_was_term = false;
1402 return Ok(Token::AndAssign);
1403 }
1404 self.last_was_term = false;
1405 return Ok(Token::LogAnd);
1406 }
1407 if self.peek() == Some('=') {
1408 self.advance();
1409 self.last_was_term = false;
1410 return Ok(Token::BitAndAssign);
1411 }
1412 self.last_was_term = false;
1413 Ok(Token::BitAnd)
1414 }
1415 '|' => {
1416 self.advance();
1417 if self.peek() == Some('|') {
1418 self.advance();
1419 if self.peek() == Some('=') {
1420 self.advance();
1421 self.last_was_term = false;
1422 return Ok(Token::OrAssign);
1423 }
1424 self.last_was_term = false;
1425 return Ok(Token::LogOr);
1426 }
1427 if self.peek() == Some('=') {
1428 self.advance();
1429 self.last_was_term = false;
1430 return Ok(Token::BitOrAssign);
1431 }
1432 if self.peek() == Some('>') {
1433 self.advance();
1434 self.last_was_term = false;
1435 return Ok(Token::PipeForward);
1436 }
1437 self.last_was_term = false;
1438 Ok(Token::BitOr)
1439 }
1440 '^' => {
1441 self.advance();
1442 if self.peek() == Some('=') {
1443 self.advance();
1444 self.last_was_term = false;
1445 return Ok(Token::XorAssign);
1446 }
1447 self.last_was_term = false;
1448 Ok(Token::BitXor)
1449 }
1450 '~' => {
1451 self.advance();
1452 if self.peek() == Some('>') {
1453 self.advance();
1454 if self.peek() == Some('>') {
1455 self.advance();
1456 self.last_was_term = false;
1457 return Ok(Token::ThreadArrowLast);
1458 }
1459 self.last_was_term = false;
1460 return Ok(Token::ThreadArrow);
1461 }
1462 self.last_was_term = false;
1463 Ok(Token::BitNot)
1464 }
1465 '?' => {
1466 self.advance();
1467 self.last_was_term = false;
1468 Ok(Token::Question)
1469 }
1470 ':' => {
1471 self.advance();
1472 if self.peek() == Some(':') {
1473 self.advance();
1474 self.last_was_term = false;
1475 return Ok(Token::PackageSep);
1476 }
1477 self.last_was_term = false;
1478 Ok(Token::Colon)
1479 }
1480 '\\' => {
1481 self.advance();
1482 if self.peek() == Some('\n') {
1485 self.pos += 1; return self.next_token();
1487 }
1488 self.last_was_term = false;
1489 Ok(Token::Backslash)
1490 }
1491 ',' => {
1492 self.advance();
1493 self.last_was_term = false;
1494 Ok(Token::Comma)
1495 }
1496 ';' => {
1497 self.advance();
1498 self.last_was_term = false;
1499 Ok(Token::Semicolon)
1500 }
1501 '(' => {
1502 self.advance();
1503 self.last_was_term = false;
1504 Ok(Token::LParen)
1505 }
1506 ')' => {
1507 self.advance();
1508 self.last_was_term = true;
1509 Ok(Token::RParen)
1510 }
1511 '[' => {
1512 self.advance();
1513 self.last_was_term = false;
1514 Ok(Token::LBracket)
1515 }
1516 ']' => {
1517 self.advance();
1518 self.last_was_term = true;
1519 Ok(Token::RBracket)
1520 }
1521 '{' => {
1522 self.advance();
1523 self.last_was_term = false;
1524 Ok(Token::LBrace)
1525 }
1526 '}' => {
1527 self.advance();
1528 self.last_was_term = true;
1529 Ok(Token::RBrace)
1530 }
1531
1532 c if c.is_alphabetic() || c == '_' => {
1534 let ident = self.read_identifier();
1535
1536 match ident.as_str() {
1538 "format" => {
1539 self.skip_whitespace_and_comments();
1540 let fname = self.read_package_qualified_identifier();
1541 self.skip_whitespace_and_comments();
1542 if self.peek() != Some('=') {
1543 return Err(
1544 self.syntax_err("Expected '=' after format name", self.line)
1545 );
1546 }
1547 self.advance();
1548 let lines = self.read_format_body()?;
1549 self.last_was_term = false;
1550 return Ok(Token::FormatDecl { name: fname, lines });
1551 }
1552 "qw" => {
1553 let start_pos = self.pos;
1555 self.skip_whitespace_only();
1556 if let Some(c) = self.peek() {
1557 if c == '=' && self.peek_at(1) == Some('>') {
1558 self.pos = start_pos;
1559 self.last_was_term = true;
1560 return Ok(Token::Ident(ident));
1561 }
1562 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
1563 self.pos = start_pos;
1564 self.last_was_term = true;
1565 return Ok(Token::Ident(ident));
1566 }
1567 }
1568 self.pos = start_pos; let tok = self.read_qw()?;
1570 self.last_was_term = true;
1571 return Ok(tok);
1572 }
1573 "qq" | "q" => {
1574 let start_pos = self.pos;
1578 self.skip_whitespace_only();
1579 if let Some(c) = self.peek() {
1580 if c == '=' && self.peek_at(1) == Some('>') {
1582 self.pos = start_pos; self.last_was_term = true;
1584 return Ok(Token::Ident(ident));
1585 }
1586 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
1588 self.pos = start_pos;
1589 self.last_was_term = true;
1590 return Ok(Token::Ident(ident));
1591 }
1592 }
1593 let delim = self.advance().ok_or_else(|| {
1594 self.syntax_err("Expected delimiter after q/qq", self.line)
1595 })?;
1596 let close = match delim {
1597 '(' => ')',
1598 '[' => ']',
1599 '{' => '}',
1600 '<' => '>',
1601 c => c,
1602 };
1603 let s = if matches!(delim, '(' | '[' | '{' | '<') {
1604 self.read_q_qq_balanced_body(delim, close, ident == "qq")?
1605 } else {
1606 self.read_escaped_until(close)?
1607 };
1608 self.last_was_term = true;
1609 if ident == "qq" {
1610 return Ok(Token::DoubleString(s));
1611 }
1612 return Ok(Token::SingleString(s));
1613 }
1614 "qx" => {
1615 let start_pos = self.pos;
1617 self.skip_whitespace_only();
1618 if let Some(c) = self.peek() {
1619 if c == '=' && self.peek_at(1) == Some('>') {
1620 self.pos = start_pos;
1621 self.last_was_term = true;
1622 return Ok(Token::Ident(ident));
1623 }
1624 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
1625 self.pos = start_pos;
1626 self.last_was_term = true;
1627 return Ok(Token::Ident(ident));
1628 }
1629 }
1630 let delim = self.advance().ok_or_else(|| {
1631 self.syntax_err("Expected delimiter after qx", self.line)
1632 })?;
1633 let close = match delim {
1634 '(' => ')',
1635 '[' => ']',
1636 '{' => '}',
1637 '<' => '>',
1638 c => c,
1639 };
1640 let s = self.read_escaped_until(close)?;
1641 self.last_was_term = true;
1642 return Ok(Token::BacktickString(s));
1643 }
1644 "qr" => {
1645 let start_pos = self.pos;
1647 self.skip_whitespace_only();
1648 if let Some(c) = self.peek() {
1649 if c == '=' && self.peek_at(1) == Some('>') {
1650 self.pos = start_pos;
1651 self.last_was_term = true;
1652 return Ok(Token::Ident(ident));
1653 }
1654 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
1655 self.pos = start_pos;
1656 self.last_was_term = true;
1657 return Ok(Token::Ident(ident));
1658 }
1659 }
1660 let delim = self.advance().ok_or_else(|| {
1661 self.syntax_err("Expected delimiter after qr", self.line)
1662 })?;
1663 let close = match delim {
1664 '(' => ')',
1665 '[' => ']',
1666 '{' => '}',
1667 '<' => '>',
1668 c => c,
1669 };
1670 let pattern = self.read_escaped_until(close)?;
1671 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
1672 self.last_was_term = true;
1673 return Ok(Token::Regex(pattern, flags, delim));
1674 }
1675 "m" => {
1676 let start_pos = self.pos;
1679 self.skip_whitespace_only();
1680 if let Some(d) = self.peek() {
1681 if d == '=' && self.peek_at(1) == Some('>') {
1682 self.pos = start_pos;
1683 self.last_was_term = true;
1684 return Ok(Token::Ident(ident));
1685 }
1686 if matches!(d, ';' | ',' | ')' | ']' | '}' | '>' | ':' | '\n') {
1687 self.pos = start_pos;
1688 self.last_was_term = true;
1689 return Ok(Token::Ident(ident));
1690 }
1691 }
1692 self.pos = start_pos;
1693 if self.suppress_m_regex == 0 {
1696 if let Some(delim) = self.peek() {
1697 if !delim.is_alphanumeric() && delim != '_' {
1698 let saved_pos = self.pos;
1700 let saved_line = self.line;
1701 self.advance(); let close = match delim {
1703 '(' => ')',
1704 '[' => ']',
1705 '{' => '}',
1706 '<' => '>',
1707 c => c,
1708 };
1709 let mut pattern = String::new();
1710 let mut terminated = true;
1711 loop {
1712 match self.advance() {
1713 Some('\\') => {
1714 pattern.push('\\');
1715 if let Some(c) = self.advance() {
1716 pattern.push(c);
1717 }
1718 }
1719 Some(c) if c == close => break,
1720 Some(c) if c == '\n' && close == '/' => {
1721 terminated = false;
1723 break;
1724 }
1725 Some(c) => pattern.push(c),
1726 None => {
1727 return Err(self.syntax_err(
1728 "Search pattern not terminated",
1729 saved_line,
1730 ));
1731 }
1732 }
1733 }
1734 if terminated {
1735 let flags =
1736 self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
1737 self.last_was_term = true;
1738 return Ok(Token::Regex(pattern, flags, delim));
1739 }
1740 self.pos = saved_pos;
1742 self.line = saved_line;
1743 }
1744 }
1745 }
1746 self.last_was_term = true;
1748 return Ok(Token::Ident(ident));
1749 }
1750 "s" => {
1751 let start_pos = self.pos;
1754 self.skip_whitespace_only();
1755 if let Some(d) = self.peek() {
1756 if d == '=' && self.peek_at(1) == Some('>') {
1757 self.pos = start_pos;
1758 self.last_was_term = true;
1759 return Ok(Token::Ident(ident));
1760 }
1761 if matches!(d, ';' | ',' | ')' | ']' | '}' | '>' | ':' | '\n') {
1762 self.pos = start_pos;
1763 self.last_was_term = true;
1764 return Ok(Token::Ident(ident));
1765 }
1766 }
1767 self.pos = start_pos;
1768 if let Some(delim) = self.peek() {
1770 if !delim.is_alphanumeric() && delim != '_' && delim != ' ' {
1771 self.advance();
1772 let close = match delim {
1773 '(' => ')',
1774 '[' => ']',
1775 '{' => '}',
1776 '<' => '>',
1777 c => c,
1778 };
1779 let mut pattern = String::new();
1780 loop {
1781 match self.advance() {
1782 Some('\\') => {
1783 pattern.push('\\');
1784 if let Some(c) = self.advance() {
1785 pattern.push(c);
1786 }
1787 }
1788 Some(c) if c == close => break,
1789 Some(c) => pattern.push(c),
1790 None => {
1791 return Err(self.syntax_err(
1792 "Unterminated s/// pattern",
1793 self.line,
1794 ))
1795 }
1796 }
1797 }
1798 if "([{<".contains(delim) {
1800 self.skip_whitespace_only();
1801 let open2 = self.advance().unwrap_or(delim);
1802 let close = match open2 {
1803 '(' => ')',
1804 '[' => ']',
1805 '{' => '}',
1806 '<' => '>',
1807 c => c,
1808 };
1809 let replacement = self.read_escaped_until(close)?;
1810 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
1811 self.last_was_term = true;
1812 return Ok(Token::Ident(format!(
1815 "\x00s\x00{}\x00{}\x00{}\x00{}",
1816 pattern, replacement, flags, delim
1817 )));
1818 }
1819 let replacement = self.read_escaped_until(close)?;
1820 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
1821 self.last_was_term = true;
1822 return Ok(Token::Ident(format!(
1823 "\x00s\x00{}\x00{}\x00{}\x00{}",
1824 pattern, replacement, flags, delim
1825 )));
1826 }
1827 }
1828 self.last_was_term = true;
1829 return Ok(Token::Ident(ident));
1830 }
1831 "tr" | "y" => {
1832 if let Some(d) = self.peek() {
1835 if matches!(d, ';' | ',' | ')' | ']' | '}' | '>' | ':' | '\n') {
1836 self.last_was_term = true;
1837 return Ok(Token::Ident(ident));
1838 }
1839 } else {
1840 self.last_was_term = true;
1841 return Ok(Token::Ident(ident));
1842 }
1843 let start_pos = self.pos;
1845 self.skip_whitespace_only();
1846 if let Some(d) = self.peek() {
1847 if d == '=' && self.peek_at(1) != Some('=') {
1849 self.pos = start_pos;
1850 self.last_was_term = true;
1851 return Ok(Token::Ident(ident));
1852 }
1853 }
1854 self.pos = start_pos;
1855 if let Some(delim) = self.peek() {
1857 if !delim.is_alphanumeric() && delim != '_' && delim != ' ' {
1858 self.advance();
1859 let close = match delim {
1860 '(' => ')',
1861 '[' => ']',
1862 '{' => '}',
1863 '<' => '>',
1864 c => c,
1865 };
1866 let from = self.read_escaped_until(close)?;
1867 if "([{<".contains(delim) {
1869 self.skip_whitespace_only();
1870 self.advance(); }
1872 let to = self.read_escaped_until(close)?;
1873 let flags = self.read_while(|c| "cdsr".contains(c));
1874 self.last_was_term = true;
1875 return Ok(Token::Ident(format!(
1876 "\x00tr\x00{}\x00{}\x00{}\x00{}",
1877 from, to, flags, delim
1878 )));
1879 }
1880 }
1881 self.last_was_term = true;
1882 return Ok(Token::Ident(ident));
1883 }
1884 _ => {}
1885 }
1886
1887 let saved_pos2 = self.pos;
1889 self.skip_whitespace_and_comments();
1890 if self.peek() == Some('=') && self.peek_at(1) == Some('>') {
1891 self.pos = saved_pos2;
1892 self.last_was_term = true;
1893 return Ok(Token::Ident(ident));
1894 }
1895 self.pos = saved_pos2;
1896
1897 let tok = if ident == "x" && !self.last_was_term {
1901 Token::Ident("x".to_string())
1902 } else {
1903 keyword_or_ident(&ident)
1904 };
1905 self.last_was_term = match ident.as_str() {
1908 "my"
1911 | "mysync"
1912 | "frozen"
1913 | "const"
1914 | "typed"
1915 | "our"
1916 | "local"
1917 | "state"
1918 | "return"
1919 | "print"
1920 | "pr"
1921 | "say"
1922 | "p"
1923 | "die"
1924 | "warn"
1925 | "push"
1926 | "pop"
1927 | "shift"
1928 | "shuffle"
1929 | "chunked"
1930 | "windowed"
1931 | "unshift"
1932 | "splice"
1933 | "delete"
1934 | "exists"
1935 | "chomp"
1936 | "chop"
1937 | "defined"
1938 | "keys"
1939 | "values"
1940 | "each"
1941 | "sub"
1942 | "struct"
1943 | "if"
1944 | "unless"
1945 | "while"
1946 | "until"
1947 | "for"
1948 | "foreach"
1949 | "elsif"
1950 | "use"
1951 | "no"
1952 | "require"
1953 | "eval"
1954 | "do"
1955 | "map"
1956 | "maps"
1957 | "flat_maps"
1958 | "grep"
1959 | "greps"
1960 | "sort"
1961 | "all"
1962 | "any"
1963 | "none"
1964 | "take_while"
1965 | "drop_while"
1966 | "skip_while"
1967 | "skip"
1968 | "first_or"
1969 | "tap"
1970 | "peek"
1971 | "with_index"
1972 | "pmap"
1973 | "pflat_map"
1974 | "puniq"
1975 | "pfirst"
1976 | "pany"
1977 | "pmap_chunked"
1978 | "pipeline"
1979 | "pgrep"
1980 | "pfor"
1981 | "par_lines"
1982 | "par_walk"
1983 | "pwatch"
1984 | "watch"
1985 | "psort"
1986 | "reduce"
1987 | "fold"
1988 | "inject"
1989 | "first"
1990 | "detect"
1991 | "find"
1992 | "find_all"
1993 | "preduce"
1994 | "preduce_init"
1995 | "pmap_reduce"
1996 | "pcache"
1997 | "fan"
1998 | "fan_cap"
1999 | "pchannel"
2000 | "pselect"
2001 | "uniq"
2002 | "distinct"
2003 | "flatten"
2004 | "set"
2005 | "list_count"
2006 | "list_size"
2007 | "count"
2008 | "len"
2009 | "size"
2010 | "cnt"
2011 | "zip"
2012 | "async"
2013 | "trace"
2014 | "timer"
2015 | "await"
2016 | "slurp"
2017 | "capture"
2018 | "fetch_url"
2019 | "fetch"
2020 | "fetch_json"
2021 | "fetch_async"
2022 | "fetch_async_json"
2023 | "par_fetch"
2024 | "par_csv_read"
2025 | "par_pipeline"
2026 | "par_pipeline_stream"
2027 | "par_sed"
2028 | "join"
2029 | "json_encode"
2030 | "json_decode"
2031 | "json_jq"
2032 | "jwt_encode"
2033 | "jwt_decode"
2034 | "jwt_decode_unsafe"
2035 | "log_info"
2036 | "log_warn"
2037 | "log_error"
2038 | "log_debug"
2039 | "log_trace"
2040 | "log_json"
2041 | "log_level"
2042 | "sha256"
2043 | "sha1"
2044 | "md5"
2045 | "hmac_sha256"
2046 | "hmac"
2047 | "uuid"
2048 | "base64_encode"
2049 | "base64_decode"
2050 | "hex_encode"
2051 | "hex_decode"
2052 | "gzip"
2053 | "gunzip"
2054 | "zstd"
2055 | "zstd_decode"
2056 | "datetime_utc"
2057 | "datetime_from_epoch"
2058 | "datetime_parse_rfc3339"
2059 | "datetime_strftime"
2060 | "toml_decode"
2061 | "toml_encode"
2062 | "yaml_decode"
2063 | "yaml_encode"
2064 | "url_encode"
2065 | "url_decode"
2066 | "uri_escape"
2067 | "uri_unescape"
2068 | "split"
2069 | "reverse"
2070 | "reversed"
2071 | "not"
2072 | "ref"
2073 | "scalar"
2074 | "try"
2075 | "catch"
2076 | "finally"
2077 | "given"
2078 | "when"
2079 | "default"
2080 | "eval_timeout"
2081 | "tie"
2082 | "retry"
2083 | "rate_limit"
2084 | "every"
2085 | "gen"
2086 | "yield"
2087 | "match"
2088 | "filter"
2089 | "f"
2090 | "reject"
2091 | "collect"
2092 | "compact"
2093 | "concat"
2094 | "chain"
2095 | "min_by"
2096 | "max_by"
2097 | "sort_by"
2098 | "tally"
2099 | "find_index"
2100 | "each_with_index"
2101 | "fore"
2102 | "e"
2103 | "ep"
2104 | "flat_map"
2105 | "group_by"
2106 | "chunk_by"
2107 | "bench" => false,
2108 "thread" | "t" => !self.last_was_term,
2112 _ => matches!(tok, Token::Ident(_)),
2113 };
2114 Ok(tok)
2115 }
2116
2117 c => Err(self.syntax_err(format!("Unexpected character '{c}'"), self.line)),
2118 }
2119 }
2120
2121 pub fn tokenize(&mut self) -> PerlResult<Vec<(Token, usize)>> {
2123 let mut tokens = Vec::new();
2124 loop {
2125 self.skip_whitespace_and_comments();
2130 let line = self.line;
2131 let tok = self.next_token()?;
2132 if tok == Token::Eof {
2133 tokens.push((Token::Eof, line));
2134 break;
2135 }
2136 tokens.push((tok, line));
2137 }
2138 Ok(tokens)
2139 }
2140}
2141
2142#[cfg(test)]
2143mod tests {
2144 use super::*;
2145 use crate::token::Token;
2146
    // --- Token-stream shape and plain literals ---

    #[test]
    fn tokenize_empty_yields_eof() {
        // Even empty input yields exactly one token: Eof.
        let mut l = Lexer::new("");
        let t = l.tokenize().expect("tokenize");
        assert_eq!(t.len(), 1);
        assert!(matches!(t[0].0, Token::Eof));
    }

    #[test]
    fn tokenize_integer_literal() {
        let mut l = Lexer::new("42");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(42)));
    }

    #[test]
    fn tokenize_keyword_my_and_semicolon() {
        // `my` is surfaced as an Ident token; keyword classification is
        // not baked into a dedicated token at the lexer level here.
        let mut l = Lexer::new("my;");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Ident(ref s) if s == "my"));
        assert!(matches!(t[1].0, Token::Semicolon));
    }

    #[test]
    fn tokenize_skips_hash_line_comment() {
        // `#` comments are consumed up to (not including) the newline,
        // so the token after the comment still lexes normally.
        let mut l = Lexer::new("1#comment\n2");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(1)));
        assert!(matches!(t[1].0, Token::Integer(2)));
        assert!(matches!(t[2].0, Token::Eof));
    }

    #[test]
    fn tokenize_double_quoted_string_literal() {
        let mut l = Lexer::new(r#""hi""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "hi"));
    }

    // --- Escape handling inside double-quoted strings ---

    #[test]
    fn tokenize_double_string_escaped_sigils_are_literal() {
        // `\$` is rewritten to the private-use sentinel char
        // LITERAL_DOLLAR_IN_DQUOTE (U+E000) so a later interpolation
        // pass can distinguish it from a real `$` sigil.
        let mut l = Lexer::new(r#""my \$x""#);
        let t = l.tokenize().expect("tokenize");
        let want = format!("my {}x", LITERAL_DOLLAR_IN_DQUOTE);
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
    }

    #[test]
    fn tokenize_double_string_braced_hex_unicode_escape() {
        // Perl-style `\x{...}` braced hex escape.
        let mut l = Lexer::new(r#""\x{1215}""#);
        let t = l.tokenize().expect("tokenize");
        let want: String = ['\u{1215}'].into_iter().collect();
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
    }

    #[test]
    fn tokenize_double_string_braced_unicode_u_escape() {
        // `\u{...}` is also accepted (combining acute accent here).
        let mut l = Lexer::new(r#""\u{0301}""#);
        let t = l.tokenize().expect("tokenize");
        let want: String = ['\u{0301}'].into_iter().collect();
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
    }

    #[test]
    fn tokenize_double_string_braced_unicode_u_escape_multi() {
        // Consecutive `\u{...}` escapes, including a non-BMP codepoint.
        let mut l = Lexer::new(r#""\u{0041}\u{00E9}\u{1F600}""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "Aé😀"));
    }

    #[test]
    fn tokenize_double_string_octal_escape() {
        // `\101` = octal 0o101 = 65 = 'A'.
        let mut l = Lexer::new(r#""\101""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "A"));
    }

    #[test]
    fn tokenize_double_string_braced_octal_escape() {
        // Braced octal form `\o{...}`.
        let mut l = Lexer::new(r#""\o{101}""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "A"));
    }

    #[test]
    fn tokenize_double_string_control_char_escape() {
        // `\cA` is Control-A, i.e. byte 0x01.
        let mut l = Lexer::new(r#""\cA""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "\x01"));
    }

    #[test]
    fn tokenize_double_string_named_unicode_escape() {
        // `\N{NAME}` looks up the character by its Unicode name
        // (via unicode_names2 — see parse_unicode_name).
        let mut l = Lexer::new(r#""\N{SNOWMAN}""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "☃"));
    }

    #[test]
    fn tokenize_double_string_named_unicode_u_plus() {
        // `\N{U+hhhh}` gives the codepoint directly.
        let mut l = Lexer::new(r#""\N{U+2603}""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "☃"));
    }

    #[test]
    fn tokenize_double_string_unbraced_hex_two_digits() {
        // Unbraced `\xHH` consumes exactly two hex digits.
        let mut l = Lexer::new(r#""\x41""#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "A"));
    }
2260
    // --- Quote-like constructors (q, qq, qw, qr, m) and numeric literals ---

    #[test]
    fn tokenize_single_quoted_string_literal() {
        let mut l = Lexer::new("'x'");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::SingleString(ref s) if s == "x"));
    }

    #[test]
    fn tokenize_spaceship_operator() {
        // `<=>` must win over `<=` followed by `>`.
        let mut l = Lexer::new("1 <=> 2");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(1)));
        assert!(matches!(t[1].0, Token::Spaceship));
        assert!(matches!(t[2].0, Token::Integer(2)));
    }

    #[test]
    fn tokenize_m_regex_literal() {
        let mut l = Lexer::new("m/abc/");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Regex(ref p, ref f, _) if p == "abc" && f.is_empty()));
    }

    #[test]
    fn tokenize_q_brace_constructor() {
        // `{` opens a paired delimiter; body is single-quote semantics.
        let mut l = Lexer::new("q{lit}");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::SingleString(ref s) if s == "lit"));
    }

    #[test]
    fn tokenize_q_paren_balances_nested_parens_in_prototype() {
        // Paired delimiters track nesting, so inner `(` / `)` do not
        // terminate the literal early (read_q_qq_balanced_body).
        let mut l = Lexer::new("q(fn ($) { 1 })");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::SingleString(ref s) if s == "fn ($) { 1 }"));
    }

    #[test]
    fn tokenize_qw_paren_balances_nested_parens() {
        // qw() with a balanced inner paren yields one word, not a split.
        let mut l = Lexer::new("qw( (SV*)pWARN_ALL )");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::QW(ref w) if w.len() == 1 && w[0] == "(SV*)pWARN_ALL"));
    }

    #[test]
    fn tokenize_float_literal() {
        let mut l = Lexer::new("3.25");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Float(f) if (f - 3.25).abs() < f64::EPSILON));
    }

    #[test]
    fn tokenize_scientific_float() {
        // Exponent notation lexes as Float even with no decimal point.
        let mut l = Lexer::new("1e2");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Float(f) if (f - 100.0).abs() < 1e-9));
    }

    #[test]
    fn tokenize_hex_with_underscore_separators() {
        // Underscore digit separators are permitted right after `0x`.
        let mut l = Lexer::new("0x_FF");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(255)));
    }

    #[test]
    fn tokenize_qr_regex_with_flags() {
        let mut l = Lexer::new("qr/pat/i");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Regex(ref p, ref f, _) if p == "pat" && f == "i"));
    }

    #[test]
    fn tokenize_m_slash_includes_gc_flags() {
        // `g` and `c` are both in REGEX_FLAG_CHARS and must be captured.
        let mut l = Lexer::new("m/./gc");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(&t[0].0, Token::Regex(p, f, _) if p == "." && f == "gc"));
    }

    #[test]
    fn tokenize_m_hash_delimiter_includes_gc_flags() {
        // `#` directly after `m` is a regex delimiter, not a comment.
        let mut l = Lexer::new("m#\\w#gc");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(&t[0].0, Token::Regex(p, f, _) if p == r"\w" && f == "gc"));
    }

    #[test]
    fn tokenize_qr_slash_includes_gco_flags() {
        // `o` is also a recognized flag character.
        let mut l = Lexer::new("qr/x/gco");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(&t[0].0, Token::Regex(p, f, _) if p == "x" && f == "gco"));
    }

    #[test]
    fn tokenize_qw_hash_delimiter_not_line_comment() {
        // Same for qw: `qw#...#` is a word list, not `qw` + comment.
        let mut l = Lexer::new("qw# a b #;");
        let t = l.tokenize().expect("tokenize");
        assert!(
            matches!(&t[0].0, Token::QW(w) if w == &["a", "b"]),
            "first={:?}",
            t.first()
        );
    }

    #[test]
    fn tokenize_qq_hash_delimiter_single_line() {
        let mut l = Lexer::new("qq#x#;");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(&t[0].0, Token::DoubleString(s) if s == "x"));
    }

    #[test]
    fn tokenize_qr_hash_delimiter_text_balanced_preamble() {
        // Multi-line qr#...#x with inner parens/newlines: the whole body
        // up to the closing `#` is one pattern and `x` is its flag.
        let src = "qr#(\n [!=]~\n | split|grep|map\n | not|and|or|xor\n)#x";
        let mut l = Lexer::new(src);
        let t = l.tokenize().expect("tokenize");
        let Token::Regex(p, f, _) = &t[0].0 else {
            panic!("expected Regex, got {:?}", t[0].0);
        };
        let rest: Vec<_> = t.iter().skip(1).take(8).map(|x| &x.0).collect();
        assert!(f.contains('x'), "flags={f:?} pattern={p:?} rest={rest:?}");
        assert!(p.contains("[!=]~"), "{p:?}");
        assert!(p.contains("split|grep|map"), "{p:?}");
    }
2388
    // --- Numeric bases, file tests, and operator tokens ---

    #[test]
    fn tokenize_octal_integer_literal() {
        // A leading zero means octal: 010 == 8.
        let mut l = Lexer::new("010");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(8)));
    }

    #[test]
    fn tokenize_binary_integer_literal() {
        let mut l = Lexer::new("0b1010");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(10)));
    }

    #[test]
    fn tokenize_filetest_exists() {
        // `-e` followed by an operand lexes as a file-test operator,
        // not as minus + identifier.
        let mut l = Lexer::new("-e '.'");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::FileTest('e')));
        assert!(matches!(t[1].0, Token::SingleString(ref s) if s == "."));
    }

    #[test]
    fn tokenize_filetest_tty() {
        let mut l = Lexer::new("-t 'STDIN'");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::FileTest('t')));
        assert!(matches!(t[1].0, Token::SingleString(ref s) if s == "STDIN"));
    }

    #[test]
    fn tokenize_power_and_range_operators() {
        let mut l = Lexer::new("2 ** 3");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(2)));
        assert!(matches!(t[1].0, Token::Power));
        assert!(matches!(t[2].0, Token::Integer(3)));

        // `1..4` must lex as Integer Range Integer, not as floats.
        let mut l = Lexer::new("1..4");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(1)));
        assert!(matches!(t[1].0, Token::Range));
        assert!(matches!(t[2].0, Token::Integer(4)));
    }

    #[test]
    fn tokenize_numeric_equality_operators() {
        let mut l = Lexer::new("1 == 2");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(1)));
        assert!(matches!(t[1].0, Token::NumEq));
        assert!(matches!(t[2].0, Token::Integer(2)));

        let mut l = Lexer::new("3 != 4");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(3)));
        assert!(matches!(t[1].0, Token::NumNe));
        assert!(matches!(t[2].0, Token::Integer(4)));
    }

    #[test]
    fn tokenize_logical_and_or_plus_assign() {
        let mut l = Lexer::new("1 && 0");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(1)));
        assert!(matches!(t[1].0, Token::LogAnd));
        assert!(matches!(t[2].0, Token::Integer(0)));

        let mut l = Lexer::new("0 || 9");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(0)));
        assert!(matches!(t[1].0, Token::LogOr));
        assert!(matches!(t[2].0, Token::Integer(9)));

        let mut l = Lexer::new("n += 1");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Ident(ref s) if s == "n"));
        assert!(matches!(t[1].0, Token::PlusAssign));
        assert!(matches!(t[2].0, Token::Integer(1)));
    }

    #[test]
    fn tokenize_bitwise_and_operator() {
        let mut l = Lexer::new("3 & 5");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Integer(3)));
        assert!(matches!(t[1].0, Token::BitAnd));
        assert!(matches!(t[2].0, Token::Integer(5)));
    }

    #[test]
    fn tokenize_braced_caret_scalar_global_phase() {
        // `${^NAME}` is a caret special variable; the `^` is kept as
        // part of the variable name.
        let mut l = Lexer::new(r#"print ${^GLOBAL_PHASE}, "\n";"#);
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Ident(ref s) if s == "print"));
        assert!(matches!(t[1].0, Token::ScalarVar(ref s) if s == "^GLOBAL_PHASE"));
        assert!(matches!(t[2].0, Token::Comma));
        assert!(matches!(t[3].0, Token::DoubleString(ref s) if s == "\n"));
        assert!(matches!(t[4].0, Token::Semicolon));
    }

    #[test]
    fn tokenize_bitwise_or_and_assign() {
        let mut l = Lexer::new("$a |= $b");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "a"));
        assert!(matches!(t[1].0, Token::BitOrAssign));
        assert!(matches!(t[2].0, Token::ScalarVar(ref s) if s == "b"));

        let mut l = Lexer::new("$a &= $b");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[1].0, Token::BitAndAssign));
    }

    #[test]
    fn tokenize_division_and_modulo() {
        // After a term, `/` is division rather than a regex delimiter.
        let mut l = Lexer::new("7 / 2");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[1].0, Token::Slash));

        let mut l = Lexer::new("7 % 3");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[1].0, Token::Percent));
    }
2513
    // --- Punctuation, variable sigils, POD skipping, and braced derefs ---

    #[test]
    fn tokenize_comma_fat_arrow_and_semicolon() {
        let mut l = Lexer::new("a => 1;");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Ident(ref s) if s == "a"));
        assert!(matches!(t[1].0, Token::FatArrow));
        assert!(matches!(t[2].0, Token::Integer(1)));
        assert!(matches!(t[3].0, Token::Semicolon));
    }

    #[test]
    fn tokenize_minus_unary_vs_binary() {
        // `-` before a number with no preceding term is a Minus token;
        // the sign is not folded into the literal.
        let mut l = Lexer::new("- 5");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::Minus));
        assert!(matches!(t[1].0, Token::Integer(5)));
    }

    #[test]
    fn tokenize_dollar_scalar_sigil() {
        let mut l = Lexer::new("$foo");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "foo"));
    }

    #[test]
    fn tokenize_assign_not_pod_when_eq_not_line_start() {
        // `=` mid-line is plain assignment; POD detection requires the
        // `=` to sit at the start of a line (at_line_start_for_pod).
        let mut l = Lexer::new("$_=foo;");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "_"));
        assert!(matches!(t[1].0, Token::Assign));
        assert!(matches!(t[2].0, Token::Ident(ref s) if s == "foo"));
        assert!(matches!(t[3].0, Token::Semicolon));
    }

    #[test]
    fn tokenize_pod_equals_still_skipped_at_line_start() {
        // `=head1 ... =cut` at line start is a POD block: everything up
        // to and including the `=cut` line produces no tokens.
        let mut l = Lexer::new("=head1 NAME\ncode\n=cut\n$x;");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "x"));
        assert!(matches!(t[1].0, Token::Semicolon));
    }

    #[test]
    fn tokenize_at_array_sigil() {
        let mut l = Lexer::new("@arr");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ArrayVar(ref s) if s == "arr"));
    }

    #[test]
    fn tokenize_at_caret_capture_array() {
        // Caret specials also exist with the `@` sigil (@^CAPTURE).
        let mut l = Lexer::new("@^CAPTURE");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ArrayVar(ref s) if s == "^CAPTURE"));
    }

    #[test]
    fn tokenize_percent_caret_hook_hash() {
        let mut l = Lexer::new("%^HOOK");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::HashVar(ref s) if s == "^HOOK"));
    }

    #[test]
    fn tokenize_caret_letter_and_at_minus_plus() {
        // `$^I` (single-letter caret var) followed by the punctuation
        // arrays `@-` and `@+`.
        let mut l = Lexer::new("$^I@-@+");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "^I"));
        assert!(matches!(t[1].0, Token::ArrayVar(ref s) if s == "-"));
        assert!(matches!(t[2].0, Token::ArrayVar(ref s) if s == "+"));
    }

    #[test]
    fn tokenize_percent_hash_sigil() {
        let mut l = Lexer::new("%h");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::HashVar(ref s) if s == "h"));
    }

    #[test]
    fn tokenize_percent_plus_named_capture_hash() {
        // `%+` (named captures) rather than the modulo operator.
        let mut l = Lexer::new("%+");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::HashVar(ref s) if s == "+"));
    }

    #[test]
    fn tokenize_dollar_dollar_under_brace_is_not_pid() {
        // `$$_` must lex as a deref of `$_`, not as `$$` (the PID
        // variable) followed by `_`.
        let mut l = Lexer::new("$$_{$k}");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "_"));
        assert!(matches!(t[1].0, Token::LBrace));
    }

    #[test]
    fn tokenize_braced_scalar_deref_try_tiny() {
        // `${$name}` collapses to a single DerefScalarVar token.
        let mut l = Lexer::new("${$code_ref}");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DerefScalarVar(ref s) if s == "code_ref"));
    }

    #[test]
    fn tokenize_braced_scalar_deref_package_qualified() {
        // Package-qualified names survive inside the braced deref.
        let mut l = Lexer::new("${$Foo::bar}");
        let t = l.tokenize().expect("tokenize");
        assert!(matches!(t[0].0, Token::DerefScalarVar(ref s) if s == "Foo::bar"));
    }
2625
2626 #[test]
2627 fn tokenize_dollar_colon_stash_brace() {
2628 let mut l = Lexer::new("$::{$pack}");
2630 let t = l.tokenize().expect("tokenize");
2631 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "::"));
2632 assert!(matches!(t[1].0, Token::LBrace));
2633 }
2634
2635 #[test]
2636 fn tokenize_ampersand_then_ident_is_bitand_not_coderef() {
2637 let mut l = Lexer::new("&f");
2639 let t = l.tokenize().expect("tokenize");
2640 assert!(matches!(t[0].0, Token::BitAnd));
2641 assert!(matches!(t[1].0, Token::Ident(ref s) if s == "f"));
2642 }
2643
2644 #[test]
2645 fn tokenize_qq_paren_constructor() {
2646 let mut l = Lexer::new("qq(x y)");
2647 let t = l.tokenize().expect("tokenize");
2648 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "x y"));
2649 }
2650
2651 #[test]
2652 fn tokenize_qq_slash_escaped_dollar_is_literal() {
2653 let mut l = Lexer::new(r#"qq/my \$y/"#);
2654 let t = l.tokenize().expect("tokenize");
2655 let want = format!("my {}y", LITERAL_DOLLAR_IN_DQUOTE);
2656 assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
2657 }
2658
2659 #[test]
2660 fn tokenize_s_substitution_alternate_delimiter() {
2661 let mut l = Lexer::new("s#a#b#");
2662 let t = l.tokenize().expect("tokenize");
2663 assert!(matches!(t[0].0, Token::Ident(ref s) if s.starts_with("\x00s\x00")));
2664 }
2665
2666 #[test]
2667 fn tokenize_tr_slash_delimiter() {
2668 let mut l = Lexer::new("tr/a/b/");
2669 let t = l.tokenize().expect("tokenize");
2670 assert!(matches!(t[0].0, Token::Ident(ref s) if s.starts_with("\x00tr\x00")));
2671 }
2672
2673 #[test]
2674 fn tokenize_y_synonym_for_tr() {
2675 let mut l = Lexer::new("y/x/y/");
2676 let t = l.tokenize().expect("tokenize");
2677 assert!(matches!(t[0].0, Token::Ident(ref s) if s.starts_with("\x00tr\x00")));
2678 }
2679
2680 #[test]
2681 fn tokenize_less_equal_greater_relops() {
2682 let mut l = Lexer::new("1 <= 2");
2683 let t = l.tokenize().expect("tokenize");
2684 assert!(matches!(t[1].0, Token::NumLe));
2685
2686 let mut l = Lexer::new("3 >= 2");
2687 let t = l.tokenize().expect("tokenize");
2688 assert!(matches!(t[1].0, Token::NumGe));
2689
2690 let mut l = Lexer::new("1 < 2");
2691 let t = l.tokenize().expect("tokenize");
2692 assert!(matches!(t[1].0, Token::NumLt));
2693
2694 let mut l = Lexer::new("3 > 2");
2695 let t = l.tokenize().expect("tokenize");
2696 assert!(matches!(t[1].0, Token::NumGt));
2697 }
2698
2699 #[test]
2700 fn tokenize_readline_scalar_handle() {
2701 let mut l = Lexer::new("<$fh>");
2702 let t = l.tokenize().expect("tokenize");
2703 assert!(matches!(t[0].0, Token::ReadLine(ref s) if s == "fh"));
2704 }
2705
2706 #[test]
2707 fn tokenize_shift_right_and_shift_left_assign() {
2708 let mut l = Lexer::new("8 >> 1");
2709 let t = l.tokenize().expect("tokenize");
2710 assert!(matches!(t[1].0, Token::ShiftRight));
2711
2712 let mut l = Lexer::new("8 << 1");
2713 let t = l.tokenize().expect("tokenize");
2714 assert!(matches!(t[1].0, Token::ShiftLeft));
2715
2716 let mut l = Lexer::new("x <<= 3");
2717 let t = l.tokenize().expect("tokenize");
2718 assert!(matches!(t[1].0, Token::ShiftLeftAssign));
2719 }
2720
2721 #[test]
2722 fn tokenize_heredoc_after_print_not_shift() {
2723 let src = "print <<EOT\nhi\nEOT\n";
2724 let mut l = Lexer::new(src);
2725 let t = l.tokenize().expect("tokenize");
2726 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "print"));
2727 assert!(
2728 matches!(&t[1].0, Token::HereDoc(tag, body, interpolate) if tag == "EOT" && body == "hi\n" && *interpolate),
2729 "got {:?}",
2730 t[1].0
2731 );
2732 }
2733
2734 #[test]
2735 fn tokenize_bitwise_or_xor() {
2736 let mut l = Lexer::new("3 | 1");
2737 let t = l.tokenize().expect("tokenize");
2738 assert!(matches!(t[1].0, Token::BitOr));
2739
2740 let mut l = Lexer::new("3 ^ 1");
2741 let t = l.tokenize().expect("tokenize");
2742 assert!(matches!(t[1].0, Token::BitXor));
2743 }
2744
2745 #[test]
2746 fn tokenize_pipe_forward_vs_bitor_vs_logor() {
2747 let mut l = Lexer::new("1 |> f");
2749 let t = l.tokenize().expect("tokenize");
2750 assert!(matches!(t[1].0, Token::PipeForward), "got {:?}", t[1].0);
2751
2752 let mut l = Lexer::new("a | b || c |> d");
2754 let t = l.tokenize().expect("tokenize");
2755 let kinds: Vec<_> = t.iter().map(|(k, _)| k.clone()).collect();
2756 assert!(kinds.iter().any(|k| matches!(k, Token::BitOr)));
2757 assert!(kinds.iter().any(|k| matches!(k, Token::LogOr)));
2758 assert!(kinds.iter().any(|k| matches!(k, Token::PipeForward)));
2759 }
2760
2761 #[test]
2762 fn tokenize_compare_and_three_way_string_ops() {
2763 let mut l = Lexer::new("\"a\" cmp \"b\"");
2764 let t = l.tokenize().expect("tokenize");
2765 assert!(matches!(t[1].0, Token::StrCmp));
2766 }
2767
2768 #[test]
2769 fn tokenize_package_double_colon_splits_qualified_name() {
2770 let mut l = Lexer::new("Foo::Bar::baz");
2771 let t = l.tokenize().expect("tokenize");
2772 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "Foo"));
2773 assert!(matches!(t[1].0, Token::PackageSep));
2774 assert!(matches!(t[2].0, Token::Ident(ref s) if s == "Bar"));
2775 assert!(matches!(t[3].0, Token::PackageSep));
2776 assert!(matches!(t[4].0, Token::Ident(ref s) if s == "baz"));
2777 }
2778
2779 #[test]
2780 fn tokenize_pod_line_skipped_like_comment_prefix() {
2781 let mut l = Lexer::new("=pod\n=cut\n42");
2783 let t = l.tokenize().expect("tokenize");
2784 assert!(matches!(t[0].0, Token::Integer(42)));
2785 }
2786
2787 #[test]
2788 fn tokenize_underscore_in_identifier() {
2789 let mut l = Lexer::new("__PACKAGE__");
2790 let t = l.tokenize().expect("tokenize");
2791 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "__PACKAGE__"));
2792 }
2793
2794 #[test]
2796 fn tokenize_x_repeat_vs_sub_name() {
2797 let mut l = Lexer::new("3 x 4");
2798 let t = l.tokenize().expect("tokenize");
2799 assert!(matches!(t[1].0, Token::X));
2800
2801 let mut l = Lexer::new("sub x { 1 }");
2802 let t = l.tokenize().expect("tokenize");
2803 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "sub"));
2804 assert!(matches!(t[1].0, Token::Ident(ref s) if s == "x"));
2805 }
2806}