stryke/lexer.rs
1use crate::error::{ErrorKind, StrykeError, StrykeResult};
2use crate::token::{keyword_or_ident, Token};
3
/// Private-use character for a literal `$` inside double-quoted / `qq` strings (from `\$` in source).
/// The parser maps this to `$` without variable interpolation (CPAN `eval qq/…/` code generators).
/// The escape readers in this file push this sentinel instead of `$` when they see `\$`.
pub const LITERAL_DOLLAR_IN_DQUOTE: char = '\u{E000}';
/// Private-use character for a literal `@` inside double-quoted / `qq` strings (from `\@` in source).
/// Mirrors `LITERAL_DOLLAR_IN_DQUOTE`; suppresses array interpolation so `"\@x"` is the literal `@x`.
/// The escape readers in this file push this sentinel instead of `@` when they see `\@`.
pub const LITERAL_AT_IN_DQUOTE: char = '\u{E001}';
10
11/// Resolve `\N{U+XXXX}` hex codepoints and `\N{LATIN SMALL LETTER E}` Unicode character names.
12fn parse_unicode_name(name: &str) -> Option<char> {
13 if let Some(hex) = name.strip_prefix("U+") {
14 let val = u32::from_str_radix(hex, 16).ok()?;
15 char::from_u32(val)
16 } else {
17 unicode_names2::character(name)
18 }
19}
20
/// Flag letters after `m//`, `qr//`, etc. (`c` = `/gc`, `o` = compile once; CPAN uses both).
/// NOTE(review): presumably consulted when scanning the trailing flag run of a
/// regex-like operator — confirm at the use site, which is outside this part of the file.
const REGEX_FLAG_CHARS: &str = "gimsxecor";
23
/// Character-level scanner state: a cursor over the source text plus the
/// small amount of previous-token context needed to disambiguate
/// context-sensitive syntax.
pub struct Lexer {
    /// Source text as individual `char`s, so `pos` indexes characters rather than bytes.
    input: Vec<char>,
    /// Index into `input` of the next character to be read.
    pos: usize,
    /// Current line number, starting at 1. Incremented as newlines are
    /// consumed (a backslash-newline continuation is deliberately not counted).
    pub line: usize,
    /// Line where the most recently-returned token starts. Set by
    /// [`Self::next_token`] right after its leading
    /// `skip_whitespace_and_comments` call so the value reflects the
    /// **emitted** token's source position even when the call recursed
    /// through a POD / heredoc skip (which advances `self.line` many
    /// lines before producing the real token). Read by [`Self::tokenize`].
    pub token_start_line: usize,
    /// Tracks whether the last token was a term (value/variable/close-delim)
    /// to disambiguate `/` as division vs regex and `{` as hash-ref vs block.
    last_was_term: bool,
    /// Tracks whether the last token was a method-call arrow (`->`). After
    /// `->`, identifiers `s` / `tr` / `y` / `q` / `qq` / `qw` / `qr` / `m`
    /// are method names — never substitution / transliteration / quote-like
    /// operators. Without this gate, `$obj->y` followed by `,` would consume
    /// `, …, …` as a transliteration body.
    last_was_arrow: bool,
    /// Snapshot of [`Self::last_was_arrow`] taken at the start of each
    /// [`Self::next_token`] call so identifier-decoding logic can read the
    /// previous-token state without racing against its own writes.
    prev_arrow: bool,
    /// Source path for [`StrykeError`] (e.g. real script or required `.pm` path).
    error_file: String,
    /// When > 0, the lexer treats `m` followed by `/` as a plain identifier
    /// instead of `m//` regex syntax. Used in thread/pipeline stages where
    /// `/m/` should be a regex grep filter, not `m//`.
    pub suppress_m_regex: u32,
    /// Set true by [`Self::next_token`] right before returning a token that
    /// originated from a *bare* positional alias (`_`, `_0`, `_1`, …) — i.e.
    /// without a leading `$` sigil. Read by [`Self::tokenize`] to record the
    /// emitted token's index in [`Self::bare_positional_indices`]. Reset to
    /// `false` at the top of every `next_token` call.
    pub last_was_bare_positional: bool,
    /// Indices into the token vector returned by [`Self::tokenize`] for every
    /// bare-positional token. Used by the parser's `my $X = EXPR` rule to
    /// auto-wrap an RHS that contains free positional aliases into an
    /// implicit zero-arg coderef (so `my $f = _ * 2` ≡ `my $f = fn { _ * 2 }`).
    pub bare_positional_indices: std::collections::HashSet<usize>,
}
66
67impl Lexer {
    /// Build a lexer over `input`, using `-e` as the file name reported in
    /// errors. Delegates to [`Self::new_with_file`].
    pub fn new(input: &str) -> Self {
        Self::new_with_file(input, "-e")
    }
71
    /// Build a lexer over `input`, attributing any errors to `file`.
    ///
    /// The text is split into `char`s up front; the cursor starts at index 0
    /// on line 1 and every context flag starts cleared.
    pub fn new_with_file(input: &str, file: impl Into<String>) -> Self {
        Self {
            input: input.chars().collect(),
            pos: 0,
            line: 1,
            token_start_line: 1,
            last_was_term: false,
            last_was_arrow: false,
            prev_arrow: false,
            error_file: file.into(),
            suppress_m_regex: 0,
            last_was_bare_positional: false,
            bare_positional_indices: std::collections::HashSet::new(),
        }
    }
87
    /// Construct an [`ErrorKind::Syntax`] error carrying `message`, the given
    /// `line`, and a copy of this lexer's `error_file`.
    fn syntax_err(&self, message: impl Into<String>, line: usize) -> StrykeError {
        StrykeError::new(ErrorKind::Syntax, message, line, self.error_file.clone())
    }
91
92 /// Used by the `s` / `tr` / `y` lexer arms when the identifier is followed
93 /// by `,`. Returns `true` only when the rest of the statement looks like a
94 /// genuine `s,PAT,REPL,FLAGS` / `tr,FROM,TO,FLAGS` shape — at least 2 more
95 /// commas before the next statement terminator (`;`, newline, EOF, or
96 /// closing brace/paren/bracket from the enclosing context). Without this
97 /// gate, `struct Pt { x, y, z }` would consume `y, z }` as a transliteration
98 /// body, and `$obj->y, ...` would eat the rest of the call.
99 fn lookahead_is_comma_delim_subst(&self) -> bool {
100 let mut commas = 0usize;
101 let mut depth_paren = 0i32;
102 let mut depth_bracket = 0i32;
103 let mut depth_brace = 0i32;
104 let mut i = self.pos;
105 while i < self.input.len() {
106 let c = self.input[i];
107 match c {
108 '\\' => {
109 i += 2; // skip escaped char (regex backslash escapes are common in pat/repl)
110 continue;
111 }
112 '(' => depth_paren += 1,
113 ')' => {
114 if depth_paren == 0 {
115 break;
116 }
117 depth_paren -= 1;
118 }
119 '[' => depth_bracket += 1,
120 ']' => {
121 if depth_bracket == 0 {
122 break;
123 }
124 depth_bracket -= 1;
125 }
126 '{' => depth_brace += 1,
127 '}' => {
128 if depth_brace == 0 {
129 break;
130 }
131 depth_brace -= 1;
132 }
133 ';' | '\n' => break,
134 ',' if depth_paren == 0 && depth_bracket == 0 && depth_brace == 0 => {
135 commas += 1;
136 if commas >= 3 {
137 return true;
138 }
139 }
140 _ => {}
141 }
142 i += 1;
143 }
144 // Need 3 total commas: `s,P,R,F` / `tr,F,T,F` (FLAGS may be empty,
145 // but the third comma must still be present).
146 commas >= 3
147 }
148
    /// Return the character at the cursor without consuming it, or `None` at
    /// end of input.
    fn peek(&self) -> Option<char> {
        self.input.get(self.pos).copied()
    }
152
    /// Return the character `offset` positions past the cursor without
    /// consuming anything (`peek_at(0)` is the character at the cursor), or
    /// `None` when that index is past the end of input.
    fn peek_at(&self, offset: usize) -> Option<char> {
        self.input.get(self.pos + offset).copied()
    }
156
157 /// True when `=` at `eq_pos` is Perl POD (`=head1`, `=cut`, …): first non-whitespace on the line.
158 /// Otherwise `$_=foo` would misparse `=f` as POD and swallow the rest of the file.
159 fn at_line_start_for_pod(&self, eq_pos: usize) -> bool {
160 let mut i = eq_pos;
161 while i > 0 {
162 i -= 1;
163 let c = self.input[i];
164 if c == '\n' {
165 return true;
166 }
167 if !c.is_whitespace() {
168 return false;
169 }
170 }
171 true
172 }
173
174 fn advance(&mut self) -> Option<char> {
175 let ch = self.input.get(self.pos).copied();
176 if let Some(c) = ch {
177 if c == '\n' {
178 self.line += 1;
179 }
180 self.pos += 1;
181 }
182 ch
183 }
184
185 fn skip_whitespace_and_comments(&mut self) {
186 while self.pos < self.input.len() {
187 let ch = self.input[self.pos];
188 if ch == '#' {
189 // Line comment
190 while self.pos < self.input.len() && self.input[self.pos] != '\n' {
191 self.pos += 1;
192 }
193 } else if ch == '\\' && self.peek_at(1) == Some('\n') {
194 // Backslash-newline: line continuation (shell-style)
195 // Don't increment line — continued line is logically part of the same line
196 self.pos += 2;
197 } else if ch.is_whitespace() {
198 if ch == '\n' {
199 self.line += 1;
200 }
201 self.pos += 1;
202 } else {
203 break;
204 }
205 }
206 }
207
208 /// Whitespace only — used after `q`/`qq`/`qr`/… before the opening delimiter so `#` is not
209 /// mistaken for a line comment (`qr#...#`, `qw#...#`).
210 fn skip_whitespace_only(&mut self) {
211 while self.pos < self.input.len() {
212 let ch = self.input[self.pos];
213 if ch.is_whitespace() {
214 if ch == '\n' {
215 self.line += 1;
216 }
217 self.pos += 1;
218 } else {
219 break;
220 }
221 }
222 }
223
224 fn read_while(&mut self, pred: impl Fn(char) -> bool) -> String {
225 let mut s = String::new();
226 while let Some(ch) = self.peek() {
227 if pred(ch) {
228 s.push(ch);
229 self.advance();
230 } else {
231 break;
232 }
233 }
234 s
235 }
236
237 /// Peek past whitespace and check whether the next token starts a range
238 /// operator: `:`, `..`, `...`, or `~`. Used by the hex-integer lexer
239 /// to switch into range-friendly DoubleString mode so `0x00:0xFF:1` and
240 /// `0x00~0xFF~1` iterate with hex output instead of decimal. `~` here
241 /// must NOT be `~>` / `~>>` (the thread macro) — those are operators on
242 /// a value, not range starters.
243 fn next_is_range_separator(&self) -> bool {
244 let mut i = self.pos;
245 while i < self.input.len() && matches!(self.input[i], ' ' | '\t') {
246 i += 1;
247 }
248 if i >= self.input.len() {
249 return false;
250 }
251 match self.input[i] {
252 ':' => true,
253 '.' if self.input.get(i + 1) == Some(&'.') => true,
254 '~' if self.input.get(i + 1) != Some(&'>') => true,
255 _ => false,
256 }
257 }
258
259 /// `YYYY-MM-DD` / `YYYY-MM` lookahead. Called from [`Self::read_number`]
260 /// when the just-consumed integer part is exactly 4 digits followed by
261 /// `-<digit>`. Tries the longer `YYYY-MM-DD` shape first (full ISO date),
262 /// falling back to `YYYY-MM` (year-month). Both shapes require valid
263 /// month (01..=12) and, for the date form, valid day (01..=31). On match
264 /// returns the literal string and advances `self.pos` past it; on
265 /// failure restores `self.pos` so the caller falls through to the
266 /// existing arithmetic-as-`-` path. The 4-digit year requirement is the
267 /// disambiguator vs. plain subtraction (`2022-01-01` = date,
268 /// `5-2-1` = arithmetic).
269 fn try_consume_iso_date_tail(&mut self, start: usize) -> Option<String> {
270 let saved = self.pos;
271 let year: String = self.input[start..self.pos].iter().collect();
272 if year.len() != 4 || year.parse::<u16>().is_err() {
273 return None;
274 }
275 // Match `-MM`
276 if self.peek() != Some('-') {
277 return None;
278 }
279 if !self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
280 || !self.peek_at(2).is_some_and(|c| c.is_ascii_digit())
281 {
282 return None;
283 }
284 // Reject when third char after `-` is also a digit (e.g. `2022-100`)
285 // — that's arithmetic, not a month.
286 if self.peek_at(3).is_some_and(|c| c.is_ascii_digit()) {
287 return None;
288 }
289 let month_str: String = self.input[self.pos + 1..self.pos + 3].iter().collect();
290 let month: u8 = match month_str.parse() {
291 Ok(m) if (1..=12).contains(&m) => m,
292 _ => return None,
293 };
294 // Provisionally consume `-MM`
295 self.advance(); // -
296 self.advance(); // M
297 self.advance(); // M
298 // Try `-DD` extension
299 if self.peek() == Some('-')
300 && self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
301 && self.peek_at(2).is_some_and(|c| c.is_ascii_digit())
302 && !self.peek_at(3).is_some_and(|c| c.is_ascii_digit())
303 {
304 let day_str: String = self.input[self.pos + 1..self.pos + 3].iter().collect();
305 if let Ok(day) = day_str.parse::<u8>() {
306 if (1..=31).contains(&day) {
307 self.advance(); // -
308 self.advance(); // D
309 self.advance(); // D
310 let _ = month; // already validated
311 return Some(format!("{}-{}-{:02}", year, month_str, day));
312 }
313 }
314 }
315 // Year-month form `YYYY-MM`. Reject if followed by another `-DIGIT`
316 // (would be arithmetic) — caught above by the third-digit guard.
317 Some(format!("{}-{}", year, month_str)).filter(|_| {
318 // No risky trailing chars beyond what we've already consumed.
319 let _ = saved;
320 true
321 })
322 }
323
    /// IPv6 lookahead from an arbitrary starting pos. Called from
    /// [`Self::read_number`] (digit-prefix), the identifier path (hex-letter
    /// prefix `fe80::1`), and the `:` lexer arm (zero-compressed prefix
    /// `::1`). `start` is where the IPv6 candidate begins in `self.input`;
    /// `self.pos` may already be partway through but is reset here so the
    /// scanner controls consumption. Greedily consumes hex digits, `:`, and
    /// at most one `::`, then validates with Rust's [`std::net::Ipv6Addr`]
    /// parser. On success returns the literal and leaves `self.pos` past
    /// it; on failure restores `self.pos` to its pre-call value.
    /// Acts only when the candidate has at least 2 colons — single-colon
    /// `1:5` is unambiguous range syntax, and 3-segment chains of pure-digit
    /// groups (`1:5:1`) never parse as IPv6 so range-with-step is preserved.
    fn try_consume_ipv6_tail(&mut self, start: usize) -> Option<String> {
        // Cursor to restore on every rejection path below.
        let saved = self.pos;
        self.pos = start;
        let mut seen_double_colon = false;
        let mut prev_was_colon = false;
        let mut colon_count = 0usize;
        // Greedy scan: hex digits and colons only, stopping at a second `::`.
        while self.pos < self.input.len() {
            let c = self.input[self.pos];
            if c == ':' {
                // NOTE(review): the count is bumped before the `break` below,
                // so it includes one colon that is never consumed when the
                // scan stops on a second `::`. The candidate already holds
                // at least two colons in that case, so the `< 2` test further
                // down is unaffected.
                colon_count += 1;
                if prev_was_colon {
                    if seen_double_colon {
                        break;
                    }
                    seen_double_colon = true;
                }
                prev_was_colon = true;
                self.advance();
                continue;
            }
            if c.is_ascii_hexdigit() {
                prev_was_colon = false;
                self.advance();
                continue;
            }
            break;
        }
        // Strip a trailing single colon (likely a range separator the lexer
        // greedily ate); a trailing `::` is part of the address.
        if self.pos > start
            && self.input[self.pos - 1] == ':'
            && (self.pos < start + 2 || self.input[self.pos - 2] != ':')
        {
            self.pos -= 1;
            colon_count -= 1;
        }
        // Package-separator disambiguator: when the candidate ends right
        // before an ASCII letter or `_` (an identifier continuation), the
        // `::` is almost certainly a package qualifier (`B::GV::SAFENAME`)
        // rather than IPv6 zero-compression. IPv6 lookahead bails so the
        // standard package-separator path runs.
        if self.pos < self.input.len() {
            let next = self.input[self.pos];
            // Parsed as `(alphabetic && !hexdigit) || underscore`: a hex
            // letter here would already have been consumed by the scan above.
            if next.is_ascii_alphabetic() && !next.is_ascii_hexdigit() || next == '_' {
                self.pos = saved;
                return None;
            }
            // Three-segment package path: `A::B::C` greedy-matched `A::B` as
            // IPv6 zero-compressed (legal: `0:0:0:0:0:0:A:B`). Detect the
            // `::IDENT` continuation and bail so the standard PackageSep path
            // runs. Without this, `package A::B::C` lexes as
            // `Ident, DoubleString("A::B"), PackageSep, Ident("C")`.
            if next == ':'
                && self.input.get(self.pos + 1) == Some(&':')
                && self
                    .input
                    .get(self.pos + 2)
                    .is_some_and(|c| c.is_ascii_alphabetic() || *c == '_')
            {
                self.pos = saved;
                return None;
            }
        }
        let candidate: String = self.input[start..self.pos].iter().collect();
        // Require at least one hex digit. The bare `::` form is technically
        // valid IPv6 (all-zeros) but in real code it nearly always means
        // something else — array-slice default step (`@a[::]`), package
        // separator at the start of an empty list, etc. Users who want the
        // unspecified address can write `::0`.
        if colon_count < 2
            || !candidate.chars().any(|c| c.is_ascii_hexdigit())
            || candidate.parse::<std::net::Ipv6Addr>().is_err()
        {
            self.pos = saved;
            return None;
        }
        // Package-qualifier disambiguator: a 2-segment candidate
        // `LETTERS::LETTERS` with no digits anywhere parses as IPv6
        // (e.g. `d::ab` ≡ `d:0:0:0:0:0:0:ab`, `dead::beef`) but in
        // Perl/stryke source it is overwhelmingly a package qualifier
        // (`fn d::ab ($x) { ... }`, `Foo::Bar->method`). Real IPv6
        // literals in source carry at least one decimal digit (`fe80::1`,
        // `::1`, `1::dead`), which keeps them on the IPv6 path. Bare
        // pure-letter forms are routed back through PackageSep.
        if !candidate.chars().any(|c| c.is_ascii_digit()) {
            let segments: Vec<&str> = candidate.split("::").collect();
            if segments.len() == 2
                && segments
                    .iter()
                    .all(|s| !s.is_empty() && s.chars().all(|c| c.is_ascii_alphabetic()))
            {
                self.pos = saved;
                return None;
            }
        }
        Some(candidate)
    }
433
434 /// IPv4 dotted-quad lookahead. Called from [`Self::read_number`] when the
435 /// just-consumed integer part is followed by `.<digit>`. Speculatively
436 /// matches 3 more `.<digits>` segments and accepts only when every octet
437 /// parses as `u8` (0..=255). On match, returns the full dotted-quad
438 /// string (e.g. `"192.168.255.255"`) and advances `self.pos` past it; on
439 /// failure restores `self.pos` to its pre-call value so the caller falls
440 /// through to the existing float-lexing path.
441 fn try_consume_ipv4_tail(&mut self, start: usize) -> Option<String> {
442 let saved = self.pos;
443 // We've already consumed the first octet (start..self.pos).
444 let first: String = self.input[start..self.pos].iter().collect();
445 if first.parse::<u8>().is_err() {
446 return None;
447 }
448 let mut octets: Vec<String> = vec![first];
449 for _ in 0..3 {
450 if self.peek() != Some('.') {
451 self.pos = saved;
452 return None;
453 }
454 if !self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
455 self.pos = saved;
456 return None;
457 }
458 self.advance(); // consume '.'
459 let oct_start = self.pos;
460 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
461 self.advance();
462 }
463 let octet: String = self.input[oct_start..self.pos].iter().collect();
464 if octet.parse::<u8>().is_err() {
465 self.pos = saved;
466 return None;
467 }
468 octets.push(octet);
469 }
470 // Reject 5-segment chains like `1.2.3.4.5` — the trailing `.<digit>`
471 // means the user wrote something else (e.g. version number, list of
472 // floats). Falling back to float lexing is safer than half-eating it.
473 if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
474 self.pos = saved;
475 return None;
476 }
477 Some(octets.join("."))
478 }
479
480 fn read_number(&mut self) -> StrykeResult<Token> {
481 let start = self.pos;
482 let mut is_float = false;
483 let mut is_hex = false;
484 let mut is_oct = false;
485 let mut is_bin = false;
486
487 if self.peek() == Some('0') {
488 match self.peek_at(1) {
489 Some('x') | Some('X') => {
490 is_hex = true;
491 self.advance();
492 self.advance();
493 }
494 Some('b') | Some('B') => {
495 is_bin = true;
496 self.advance();
497 self.advance();
498 }
499 // `0o777` — Perl 5.34+ octal prefix (alongside the bare-`0`
500 // form). (BUG-082) Read the same digit pool as bare `0...`
501 // octals, but skip the `0o` prefix in the conversion.
502 Some('o') | Some('O') => {
503 self.advance();
504 self.advance();
505 let digits = self.read_while(|c| c.is_ascii_digit() || c == '_');
506 let clean: String = digits.chars().filter(|&c| c != '_').collect();
507 let val = i64::from_str_radix(&clean, 8)
508 .map_err(|_| self.syntax_err("Invalid octal literal", self.line))?;
509 return Ok(Token::Integer(val));
510 }
511 Some(c) if c.is_ascii_digit() => {
512 is_oct = true;
513 }
514 _ => {}
515 }
516 }
517
518 if is_hex {
519 let digits = self.read_while(|c| c.is_ascii_hexdigit() || c == '_');
520 let clean: String = digits.chars().filter(|&c| c != '_').collect();
521 let val = i64::from_str_radix(&clean, 16)
522 .map_err(|_| self.syntax_err("Invalid hex literal", self.line))?;
523 // Range-context lookahead: `0x00:0xFF:1` should iterate as hex
524 // strings (`0x00`, `0x01`, …, `0xFF`), preserving the leading
525 // `0x` and case-of-digits. When the next non-whitespace token is
526 // a range separator (`:`, `..`, `...`, or `!!!`), produce a
527 // string-typed literal so the runtime range op can detect the
528 // hex format and emit hex output. In all other contexts the
529 // hex is a normal integer (arithmetic, assignment, etc.).
530 if self.next_is_range_separator() {
531 let raw: String = self.input[start..self.pos].iter().collect();
532 return Ok(Token::DoubleString(raw));
533 }
534 return Ok(Token::Integer(val));
535 }
536 if is_bin {
537 let digits = self.read_while(|c| c == '0' || c == '1' || c == '_');
538 let clean: String = digits.chars().filter(|&c| c != '_').collect();
539 let val = i64::from_str_radix(&clean, 2)
540 .map_err(|_| self.syntax_err("Invalid binary literal", self.line))?;
541 return Ok(Token::Integer(val));
542 }
543
544 // Decimal or octal
545 let _int_part = self.read_while(|c| c.is_ascii_digit() || c == '_');
546 // IPv4 dotted-quad lookahead: `192.168.255.255` should lex as ONE
547 // string token, not as `192.168` (float) `.` `255.255` (float). Try
548 // to consume 3 more `.NUM` segments where every octet is 0..=255 AND
549 // not followed by another `.NUM` (so a 5-segment chain like
550 // `1.2.3.4.5` cleanly fails the ipv4 path and falls back to floats).
551 // Only fires when the current `.NUM` would have been a float decimal
552 // — preserves all existing float lexing.
553 if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
554 if let Some(consumed) = self.try_consume_ipv4_tail(start) {
555 return Ok(Token::DoubleString(consumed));
556 }
557 is_float = true;
558 self.advance(); // consume '.'
559 let _frac = self.read_while(|c| c.is_ascii_digit() || c == '_');
560 }
561 // ISO-date / year-month lookahead: `2022-01-01` and `2022-01`.
562 // Distinct from arithmetic `2022 - 01 - 01` only because the lexer
563 // greedily consumes the dotted form here. The 4-digit-year guard
564 // inside [`Self::try_consume_iso_date_tail`] keeps `5-2-1` parsing
565 // as arithmetic.
566 if !is_float
567 && self.peek() == Some('-')
568 && self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
569 {
570 if let Some(consumed) = self.try_consume_iso_date_tail(start) {
571 return Ok(Token::DoubleString(consumed));
572 }
573 }
574 // IPv6 lookahead: a hex-digit-only integer part followed by `:` and
575 // more hex / `:` could be IPv6. Try to parse and accept; on failure
576 // fall through to the existing range / arithmetic paths so plain
577 // numeric ranges (`1:10`) keep their meaning.
578 if !is_float && self.peek() == Some(':') {
579 if let Some(consumed) = self.try_consume_ipv6_tail(start) {
580 return Ok(Token::DoubleString(consumed));
581 }
582 }
583 // Scientific notation
584 if let Some('e') | Some('E') = self.peek() {
585 is_float = true;
586 self.advance();
587 if let Some('+') | Some('-') = self.peek() {
588 self.advance();
589 }
590 let _exp = self.read_while(|c| c.is_ascii_digit() || c == '_');
591 }
592
593 let raw: String = self.input[start..self.pos].iter().collect();
594 let clean: String = raw.chars().filter(|&c| c != '_').collect();
595
596 if is_float {
597 let val: f64 = clean
598 .parse()
599 .map_err(|_| self.syntax_err("Invalid float literal", self.line))?;
600 Ok(Token::Float(val))
601 } else if is_oct && clean.starts_with('0') && clean.len() > 1 {
602 let val = i64::from_str_radix(&clean[1..], 8)
603 .map_err(|_| self.syntax_err("Invalid octal literal", self.line))?;
604 Ok(Token::Integer(val))
605 } else {
606 let val: i64 = clean
607 .parse()
608 .map_err(|_| self.syntax_err("Invalid integer literal", self.line))?;
609 Ok(Token::Integer(val))
610 }
611 }
612
613 fn read_single_quoted_string(&mut self) -> StrykeResult<Token> {
614 self.advance(); // consume opening '
615 let mut s = String::new();
616 loop {
617 match self.advance() {
618 Some('\\') => match self.peek() {
619 Some('\\') => {
620 s.push('\\');
621 self.advance();
622 }
623 Some('\'') => {
624 s.push('\'');
625 self.advance();
626 }
627 _ => s.push('\\'),
628 },
629 Some('\'') => break,
630 Some(c) => s.push(c),
631 None => return Err(self.syntax_err("Unterminated single-quoted string", self.line)),
632 }
633 }
634 Ok(Token::SingleString(s))
635 }
636
637 fn read_double_quoted_string(&mut self) -> StrykeResult<Token> {
638 self.advance(); // consume opening "
639 // Triple-quoted form: `"""..."""` — multi-line interpolating string.
640 // The opening `"` was just consumed; if the next two chars are also
641 // `"`, we're in triple-quote mode. Read until the closing `"""`,
642 // preserving raw newlines (no indent stripping). Interpolation
643 // (`$var`, `@arr`, `#{expr}`) flows through unchanged because the
644 // resulting `Token::DoubleString` body goes through the same
645 // downstream interpolator as a normal `"..."`.
646 if self.peek() == Some('"') && self.peek_at(1) == Some('"') {
647 self.advance(); // consume 2nd "
648 self.advance(); // consume 3rd "
649 let s = self.read_triple_quoted_body(true)?;
650 return Ok(Token::DoubleString(s));
651 }
652 let s = self.read_escaped_until('"')?;
653 Ok(Token::DoubleString(s))
654 }
655
656 /// Read the body of a triple-quoted string up to and including the
657 /// closing `"""`. `interpolate=true` honors backslash escapes the same
658 /// way `read_escaped_until` does (so `\n`, `\t`, `\\`, `\$`, `\@`,
659 /// `\"` etc. work inside `"""..."""`). `interpolate=false` is "raw"
660 /// mode: every byte is copied verbatim, including backslashes; the
661 /// only way out is a literal `"""`.
662 ///
663 /// Newlines are preserved verbatim — no indent stripping. The user
664 /// chose the indentation; we don't second-guess it.
665 fn read_triple_quoted_body(&mut self, interpolate: bool) -> StrykeResult<String> {
666 let mut s = String::new();
667 loop {
668 // Check for closing `"""` at the current position.
669 if self.peek() == Some('"')
670 && self.peek_at(1) == Some('"')
671 && self.peek_at(2) == Some('"')
672 {
673 self.advance(); // 1st "
674 self.advance(); // 2nd "
675 self.advance(); // 3rd "
676 return Ok(s);
677 }
678 let c = match self.advance() {
679 Some(c) => c,
680 None => {
681 return Err(self.syntax_err(
682 "Unterminated triple-quoted string (missing closing \"\"\")",
683 self.line,
684 ))
685 }
686 };
687 if interpolate && c == '\\' {
688 // Handle escapes the same way single-line `"..."` does.
689 // Reuse the existing escape table by hand for the common
690 // cases; anything we don't recognise falls through as
691 // `\` followed by the next char (matching Perl's
692 // permissive double-quote escape behavior).
693 let next = self.advance();
694 match next {
695 Some('n') => s.push('\n'),
696 Some('t') => s.push('\t'),
697 Some('r') => s.push('\r'),
698 Some('\\') => s.push('\\'),
699 Some('"') => s.push('"'),
700 Some('$') => s.push(LITERAL_DOLLAR_IN_DQUOTE),
701 Some('@') => s.push(LITERAL_AT_IN_DQUOTE),
702 Some('0') => s.push('\0'),
703 Some('a') => s.push('\x07'),
704 Some('b') => s.push('\x08'),
705 Some('f') => s.push('\x0C'),
706 Some('e') => s.push('\x1B'),
707 Some(other) => {
708 s.push('\\');
709 s.push(other);
710 }
711 None => {
712 return Err(self.syntax_err(
713 "Unterminated escape at end of triple-quoted string",
714 self.line,
715 ))
716 }
717 }
718 continue;
719 }
720 if c == '\n' {
721 self.line += 1;
722 }
723 s.push(c);
724 }
725 }
726
    /// Same as [`Self::read_escaped_until`] but preserves `\1`..`\9` verbatim
    /// (Perl's `s/(\d+)/\1/` numbered back-reference). Multi-digit octals
    /// (`\012`, `\077`) still resolve here, matching Perl's documented
    /// "two-or-more digit escapes are octal" rule.
    ///
    /// Thin wrapper: calls [`Self::read_escaped_until_inner`] with
    /// `defer_single_digit = true`.
    fn read_substitution_replacement(&mut self, term: char) -> StrykeResult<String> {
        self.read_escaped_until_inner(term, true)
    }
734
    /// Read an escape-processed string body up to (and consuming) the
    /// terminator `term`, resolving single-digit octal escapes numerically.
    ///
    /// Thin wrapper: calls [`Self::read_escaped_until_inner`] with
    /// `defer_single_digit = false`.
    fn read_escaped_until(&mut self, term: char) -> StrykeResult<String> {
        self.read_escaped_until_inner(term, false)
    }
738
    /// Body parser for `q{…}`, `s/.../.../`, etc. When `defer_single_digit` is
    /// `true`, `\1`..`\9` (followed by a non-octal-digit) are preserved
    /// verbatim so the substitution layer can expand them as numbered
    /// back-references (Perl's documented `s///` behavior). Multi-digit
    /// octals (`\012`) still resolve numerically in both modes.
    ///
    /// Reads from the cursor until an unescaped `term`, which is consumed but
    /// not included in the result. Escapes handled: `\n \t \r \\ \a \b \f \e`,
    /// octal `\NNN` (up to three digits), `\cX`, `\o{…}`, `\u{…}`, `\N{…}`,
    /// `\x{…}` / `\xHH`, `\$` and `\@` (emitted as the literal-sigil
    /// sentinels), and `\<term>` (the bare terminator). Any other escape is
    /// kept verbatim as `\` plus the character.
    ///
    /// # Errors
    /// Returns a syntax error on end of input before `term`, and on malformed
    /// or out-of-range braced / `\c` / `\x` escapes.
    fn read_escaped_until_inner(
        &mut self,
        term: char,
        defer_single_digit: bool,
    ) -> StrykeResult<String> {
        let mut s = String::new();
        loop {
            match self.advance() {
                // Arm order matters: the named escapes are tried before the
                // `c == term` arm, which is tried before the verbatim
                // fallback.
                Some('\\') => match self.advance() {
                    Some('n') => s.push('\n'),
                    Some('t') => s.push('\t'),
                    Some('r') => s.push('\r'),
                    Some('\\') => s.push('\\'),
                    // Only `0`..`7` land here; `\8` and `\9` fall through to
                    // the later arms.
                    Some(c @ '0'..='7') => {
                        // In substitution-replacement mode, defer `\1`..`\9`
                        // (with no further octal digit) so the replacement
                        // expander can read them as numbered back-refs.
                        if defer_single_digit && c != '0' && !matches!(self.peek(), Some('0'..='7'))
                        {
                            s.push('\\');
                            s.push(c);
                            continue;
                        }
                        let mut oct = String::new();
                        oct.push(c);
                        // Up to two more octal digits (three in total).
                        for _ in 0..2 {
                            match self.peek() {
                                Some(d) if ('0'..='7').contains(&d) => {
                                    oct.push(self.advance().unwrap());
                                }
                                _ => break,
                            }
                        }
                        // `oct` holds one to three characters in `0..=7`, so
                        // the radix-8 parse cannot fail.
                        let val = u32::from_str_radix(&oct, 8).unwrap();
                        let ch = char::from_u32(val)
                            .ok_or_else(|| self.syntax_err("Invalid octal escape", self.line))?;
                        s.push(ch);
                    }
                    Some('a') => s.push('\x07'),
                    Some('b') => s.push('\x08'),
                    Some('f') => s.push('\x0C'),
                    Some('e') => s.push('\x1B'),
                    Some('$') => s.push(LITERAL_DOLLAR_IN_DQUOTE),
                    Some('@') => s.push(LITERAL_AT_IN_DQUOTE),
                    Some('c') => {
                        // Control character: upper-case the next char and
                        // flip bit 0x40 (`\cA` -> 0x01, `\c?` -> 0x7F).
                        // NOTE(review): `as u8` truncates a non-ASCII char —
                        // confirm that is acceptable.
                        let ch = self
                            .advance()
                            .ok_or_else(|| self.syntax_err("Unterminated \\c escape", self.line))?;
                        s.push(char::from(ch.to_ascii_uppercase() as u8 ^ 0x40));
                    }
                    // `\o{…}`: braced octal code point. Without the `{`, `\o`
                    // falls through to the later arms.
                    Some('o') if self.peek() == Some('{') => {
                        self.advance(); // '{'
                        let oct = self.read_while(|c| c != '}');
                        if self.peek() != Some('}') {
                            return Err(
                                self.syntax_err("Unterminated \\o{...} in string", self.line)
                            );
                        }
                        self.advance(); // '}'
                        if oct.is_empty() {
                            return Err(self.syntax_err("Empty \\o{} in string", self.line));
                        }
                        let val = u32::from_str_radix(&oct, 8).map_err(|_| {
                            self.syntax_err("Invalid octal digits in \\o{...}", self.line)
                        })?;
                        let c = char::from_u32(val).ok_or_else(|| {
                            self.syntax_err("Invalid Unicode scalar value in \\o{...}", self.line)
                        })?;
                        s.push(c);
                    }
                    // `\u{…}`: braced hex code point.
                    Some('u') if self.peek() == Some('{') => {
                        self.advance(); // '{'
                        let hex = self.read_while(|c| c != '}');
                        if self.peek() != Some('}') {
                            return Err(
                                self.syntax_err("Unterminated \\u{...} in string", self.line)
                            );
                        }
                        self.advance(); // '}'
                        if hex.is_empty() {
                            return Err(self.syntax_err("Empty \\u{} in string", self.line));
                        }
                        let val = u32::from_str_radix(&hex, 16).map_err(|_| {
                            self.syntax_err("Invalid hex digits in \\u{...}", self.line)
                        })?;
                        let c = char::from_u32(val).ok_or_else(|| {
                            self.syntax_err("Invalid Unicode scalar value in \\u{...}", self.line)
                        })?;
                        s.push(c);
                    }
                    // `\N{…}`: `U+XXXX` code point or Unicode character name,
                    // resolved by `parse_unicode_name`.
                    Some('N') if self.peek() == Some('{') => {
                        self.advance(); // '{'
                        let name = self.read_while(|c| c != '}');
                        if self.peek() != Some('}') {
                            return Err(
                                self.syntax_err("Unterminated \\N{...} in string", self.line)
                            );
                        }
                        self.advance(); // '}'
                        if name.is_empty() {
                            return Err(self.syntax_err("Empty \\N{} in string", self.line));
                        }
                        let c = parse_unicode_name(&name).ok_or_else(|| {
                            self.syntax_err(
                                format!("Unknown Unicode character name: {name}"),
                                self.line,
                            )
                        })?;
                        s.push(c);
                    }
                    Some('x') => {
                        if self.peek() == Some('{') {
                            self.advance(); // '{'
                            let hex = self.read_while(|c| c != '}');
                            if self.peek() != Some('}') {
                                return Err(
                                    self.syntax_err("Unterminated \\x{...} in string", self.line)
                                );
                            }
                            self.advance(); // '}'
                            if hex.is_empty() {
                                return Err(self.syntax_err("Empty \\x{} in string", self.line));
                            }
                            let val = u32::from_str_radix(&hex, 16).map_err(|_| {
                                self.syntax_err("Invalid hex digits in \\x{...}", self.line)
                            })?;
                            let c = char::from_u32(val).ok_or_else(|| {
                                self.syntax_err(
                                    "Invalid Unicode scalar value in \\x{...}",
                                    self.line,
                                )
                            })?;
                            s.push(c);
                        } else {
                            // Unbraced: up to two hex digits (Perl: "\\x414" is "\\x41" + "4").
                            let mut hex = String::new();
                            for _ in 0..2 {
                                match self.peek() {
                                    Some(c) if c.is_ascii_hexdigit() => {
                                        hex.push(self.advance().unwrap());
                                    }
                                    _ => break,
                                }
                            }
                            if hex.is_empty() {
                                // Perl: bare "\\x" in a string yields NUL.
                                s.push('\0');
                            } else if let Ok(val) = u32::from_str_radix(&hex, 16) {
                                if let Some(c) = char::from_u32(val) {
                                    s.push(c);
                                } else {
                                    return Err(self.syntax_err(
                                        "Invalid code point in \\x escape",
                                        self.line,
                                    ));
                                }
                            }
                        }
                    }
                    // Escaped terminator: keep the terminator, drop the backslash.
                    Some(c) if c == term => s.push(c),
                    // Unknown escape: keep both the backslash and the character.
                    Some(c) => {
                        s.push('\\');
                        s.push(c);
                    }
                    None => return Err(self.syntax_err("Unterminated string", self.line)),
                },
                // Unescaped terminator ends the body; it is consumed but not stored.
                Some(c) if c == term => break,
                Some(c) => s.push(c),
                None => return Err(self.syntax_err("Unterminated string", self.line)),
            }
        }
        Ok(s)
    }
917
918 /// `q(...)` / `qq(...)` with pairing delimiters — Perl balances nested `()`, `[]`, `{}`, `<>`
919 /// so `q(sub ($) { 1 })` does not end at the `)` in `($)` (core `Carp.pm` uses `eval(q(...))`).
920 fn read_q_qq_balanced_body(
921 &mut self,
922 open: char,
923 close: char,
924 is_qq: bool,
925 ) -> StrykeResult<String> {
926 let mut s = String::new();
927 let mut depth: usize = 1;
928 loop {
929 match self.peek() {
930 Some('\\') => {
931 self.advance();
932 if is_qq {
933 match self.advance() {
934 Some('n') => s.push('\n'),
935 Some('t') => s.push('\t'),
936 Some('r') => s.push('\r'),
937 Some('\\') => s.push('\\'),
938 Some(c @ '0'..='7') => {
939 let mut oct = String::new();
940 oct.push(c);
941 for _ in 0..2 {
942 match self.peek() {
943 Some(d) if ('0'..='7').contains(&d) => {
944 oct.push(self.advance().unwrap());
945 }
946 _ => break,
947 }
948 }
949 let val = u32::from_str_radix(&oct, 8).unwrap();
950 let ch = char::from_u32(val).ok_or_else(|| {
951 self.syntax_err("Invalid octal escape", self.line)
952 })?;
953 s.push(ch);
954 }
955 Some('a') => s.push('\x07'),
956 Some('b') => s.push('\x08'),
957 Some('f') => s.push('\x0C'),
958 Some('e') => s.push('\x1B'),
959 Some('$') => s.push(LITERAL_DOLLAR_IN_DQUOTE),
960 Some('@') => s.push(LITERAL_AT_IN_DQUOTE),
961 Some('c') => {
962 let ch = self.advance().ok_or_else(|| {
963 self.syntax_err("Unterminated \\c escape", self.line)
964 })?;
965 s.push(char::from(ch.to_ascii_uppercase() as u8 ^ 0x40));
966 }
967 Some('o') if self.peek() == Some('{') => {
968 self.advance();
969 let oct = self.read_while(|c| c != '}');
970 if self.peek() != Some('}') {
971 return Err(self.syntax_err(
972 "Unterminated \\o{...} in qq string",
973 self.line,
974 ));
975 }
976 self.advance();
977 if oct.is_empty() {
978 return Err(
979 self.syntax_err("Empty \\o{} in qq string", self.line)
980 );
981 }
982 let val = u32::from_str_radix(&oct, 8).map_err(|_| {
983 self.syntax_err("Invalid octal digits in \\o{...}", self.line)
984 })?;
985 let c = char::from_u32(val).ok_or_else(|| {
986 self.syntax_err(
987 "Invalid Unicode scalar value in \\o{...}",
988 self.line,
989 )
990 })?;
991 s.push(c);
992 }
993 Some('u') if self.peek() == Some('{') => {
994 self.advance();
995 let hex = self.read_while(|c| c != '}');
996 if self.peek() != Some('}') {
997 return Err(self.syntax_err(
998 "Unterminated \\u{...} in qq string",
999 self.line,
1000 ));
1001 }
1002 self.advance();
1003 if hex.is_empty() {
1004 return Err(
1005 self.syntax_err("Empty \\u{} in qq string", self.line)
1006 );
1007 }
1008 let val = u32::from_str_radix(&hex, 16).map_err(|_| {
1009 self.syntax_err("Invalid hex digits in \\u{...}", self.line)
1010 })?;
1011 let c = char::from_u32(val).ok_or_else(|| {
1012 self.syntax_err(
1013 "Invalid Unicode scalar value in \\u{...}",
1014 self.line,
1015 )
1016 })?;
1017 s.push(c);
1018 }
1019 Some('N') if self.peek() == Some('{') => {
1020 self.advance();
1021 let name = self.read_while(|c| c != '}');
1022 if self.peek() != Some('}') {
1023 return Err(self.syntax_err(
1024 "Unterminated \\N{...} in qq string",
1025 self.line,
1026 ));
1027 }
1028 self.advance();
1029 if name.is_empty() {
1030 return Err(
1031 self.syntax_err("Empty \\N{} in qq string", self.line)
1032 );
1033 }
1034 let c = parse_unicode_name(&name).ok_or_else(|| {
1035 self.syntax_err(
1036 format!("Unknown Unicode character name: {name}"),
1037 self.line,
1038 )
1039 })?;
1040 s.push(c);
1041 }
1042 Some('x') => {
1043 if self.peek() == Some('{') {
1044 self.advance();
1045 let hex = self.read_while(|c| c != '}');
1046 if self.peek() != Some('}') {
1047 return Err(self.syntax_err(
1048 "Unterminated \\x{...} in qq string",
1049 self.line,
1050 ));
1051 }
1052 self.advance();
1053 if hex.is_empty() {
1054 return Err(
1055 self.syntax_err("Empty \\x{} in qq string", self.line)
1056 );
1057 }
1058 let val = u32::from_str_radix(&hex, 16).map_err(|_| {
1059 self.syntax_err("Invalid hex digits in \\x{...}", self.line)
1060 })?;
1061 let c = char::from_u32(val).ok_or_else(|| {
1062 self.syntax_err(
1063 "Invalid Unicode scalar value in \\x{...}",
1064 self.line,
1065 )
1066 })?;
1067 s.push(c);
1068 } else {
1069 let mut hex = String::new();
1070 for _ in 0..2 {
1071 match self.peek() {
1072 Some(c) if c.is_ascii_hexdigit() => {
1073 hex.push(self.advance().unwrap());
1074 }
1075 _ => break,
1076 }
1077 }
1078 if hex.is_empty() {
1079 s.push('\0');
1080 } else if let Ok(val) = u32::from_str_radix(&hex, 16) {
1081 if let Some(c) = char::from_u32(val) {
1082 s.push(c);
1083 } else {
1084 return Err(self.syntax_err(
1085 "Invalid code point in \\x escape",
1086 self.line,
1087 ));
1088 }
1089 }
1090 }
1091 }
1092 Some(c) if c == close && depth == 1 => s.push(close),
1093 Some(c) => {
1094 s.push('\\');
1095 s.push(c);
1096 }
1097 None => {
1098 return Err(
1099 self.syntax_err("Unterminated qq(...) string", self.line)
1100 );
1101 }
1102 }
1103 } else {
1104 match self.advance() {
1105 Some(c) if c == close && depth == 1 => s.push(close),
1106 Some(c) => {
1107 s.push('\\');
1108 s.push(c);
1109 }
1110 None => {
1111 return Err(
1112 self.syntax_err("Unterminated q(...) string", self.line)
1113 );
1114 }
1115 }
1116 }
1117 }
1118 Some(c) if c == open => {
1119 self.advance();
1120 depth += 1;
1121 s.push(open);
1122 }
1123 Some(c) if c == close => {
1124 self.advance();
1125 if depth == 1 {
1126 break;
1127 }
1128 depth -= 1;
1129 s.push(close);
1130 }
1131 Some(c) => {
1132 self.advance();
1133 s.push(c);
1134 }
1135 None => {
1136 return Err(self.syntax_err("Unterminated q/qq bracketed string", self.line));
1137 }
1138 }
1139 }
1140 Ok(s)
1141 }
1142
1143 fn read_regex(&mut self) -> StrykeResult<Token> {
1144 self.advance(); // consume opening /
1145 let mut pattern = String::new();
1146 loop {
1147 match self.advance() {
1148 Some('\\') => {
1149 pattern.push('\\');
1150 if let Some(c) = self.advance() {
1151 pattern.push(c);
1152 }
1153 }
1154 Some('/') => break,
1155 Some(c) => pattern.push(c),
1156 None => return Err(self.syntax_err("Unterminated regex", self.line)),
1157 }
1158 }
1159 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
1160 Ok(Token::Regex(pattern, flags, '/'))
1161 }
1162
1163 fn read_qw(&mut self) -> StrykeResult<Token> {
1164 // Already consumed 'qw', now expect delimiter
1165 self.skip_whitespace_only();
1166 let open = self
1167 .advance()
1168 .ok_or_else(|| self.syntax_err("Expected delimiter after qw", self.line))?;
1169 let close = match open {
1170 '(' => ')',
1171 '[' => ']',
1172 '{' => '}',
1173 '<' => '>',
1174 c => c,
1175 };
1176 let mut words = Vec::new();
1177 if matches!(open, '(' | '[' | '{' | '<') {
1178 // Perl balances nested delimiters in `qw( ... )` / `qw[ ... ]` / … so
1179 // `qw( (SV*)pWARN_ALL )` is one word (core `B.pm` line 88).
1180 let mut depth: usize = 1;
1181 let mut buf = String::new();
1182 loop {
1183 match self.peek() {
1184 None => {
1185 return Err(self.syntax_err("Unterminated qw()", self.line));
1186 }
1187 Some(c) if depth == 1 && c.is_whitespace() => {
1188 self.advance();
1189 if !buf.is_empty() {
1190 words.push(buf.clone());
1191 buf.clear();
1192 }
1193 while self.peek().is_some_and(|c| c.is_whitespace()) {
1194 self.advance();
1195 }
1196 }
1197 Some(c) if c == close && depth == 1 => {
1198 self.advance();
1199 if !buf.is_empty() {
1200 words.push(buf);
1201 }
1202 break;
1203 }
1204 Some(c) if c == open => {
1205 depth += 1;
1206 buf.push(self.advance().unwrap());
1207 }
1208 Some(c) if c == close => {
1209 // `depth == 1 && close` is handled above (final qw delimiter).
1210 debug_assert!(depth >= 2);
1211 depth -= 1;
1212 buf.push(self.advance().unwrap());
1213 }
1214 Some(_) => {
1215 buf.push(self.advance().unwrap());
1216 }
1217 }
1218 }
1219 return Ok(Token::QW(words));
1220 }
1221 loop {
1222 // Skip whitespace inside qw
1223 while let Some(ch) = self.peek() {
1224 if ch.is_whitespace() {
1225 self.advance();
1226 } else {
1227 break;
1228 }
1229 }
1230 if self.peek() == Some(close) {
1231 self.advance();
1232 break;
1233 }
1234 if self.peek().is_none() {
1235 return Err(self.syntax_err("Unterminated qw()", self.line));
1236 }
1237 let word = self.read_while(|c| !c.is_whitespace() && c != close);
1238 if !word.is_empty() {
1239 words.push(word);
1240 }
1241 }
1242 Ok(Token::QW(words))
1243 }
1244
    /// Reads the heredoc tag that follows `<<`, starting in non-indented mode.
    ///
    /// Thin wrapper: delegates to [`Self::read_heredoc_tag_inner`] with
    /// `indented = false` and returns its tuple unchanged.
    fn read_heredoc_tag(&mut self) -> StrykeResult<(String, bool, bool)> {
        self.read_heredoc_tag_inner(false)
    }
1248
1249 fn read_heredoc_tag_inner(&mut self, indented: bool) -> StrykeResult<(String, bool, bool)> {
1250 // We've consumed '<<'. Now figure out the tag.
1251 // Returns (tag, interpolate, indented).
1252 let quoted;
1253 let tag;
1254 match self.peek() {
1255 Some('\'') => {
1256 self.advance();
1257 tag = self.read_while(|c| c != '\'');
1258 self.advance(); // closing quote
1259 quoted = false; // no interpolation
1260 }
1261 Some('"') => {
1262 self.advance();
1263 tag = self.read_while(|c| c != '"');
1264 self.advance();
1265 quoted = true;
1266 }
1267 Some('~') => {
1268 self.advance(); // indented heredoc
1269 return self.read_heredoc_tag_inner(true); // recurse with indented=true
1270 }
1271 _ => {
1272 tag = self.read_while(|c| c.is_alphanumeric() || c == '_');
1273 quoted = true;
1274 }
1275 }
1276 Ok((tag, quoted, indented))
1277 }
1278
1279 fn read_heredoc_body(&mut self, tag: &str, indented: bool) -> StrykeResult<String> {
1280 // Read until we find a line that is exactly the tag (or, for indented heredocs,
1281 // a line whose trimmed content equals the tag).
1282 let mut lines: Vec<String> = Vec::new();
1283 // First, skip to end of current line
1284 while let Some(ch) = self.peek() {
1285 if ch == '\n' {
1286 self.advance();
1287 break;
1288 }
1289 self.advance();
1290 }
1291 let mut terminator_indent: Option<usize> = None;
1292 loop {
1293 let _line_start = self.pos;
1294 let line = self.read_while(|c| c != '\n');
1295 if line.trim() == tag {
1296 // For indented heredocs, the terminator's leading whitespace determines
1297 // how much to strip from all body lines.
1298 if indented {
1299 terminator_indent = Some(line.len() - line.trim_start().len());
1300 }
1301 break;
1302 }
1303 lines.push(line);
1304 if self.peek() == Some('\n') {
1305 self.advance();
1306 } else if self.pos >= self.input.len() {
1307 return Err(self.syntax_err(
1308 format!("Unterminated heredoc (looking for '{tag}')"),
1309 self.line,
1310 ));
1311 }
1312 }
1313 if self.peek() == Some('\n') {
1314 self.advance();
1315 }
1316 // For indented heredocs (<<~), strip leading whitespace from each line,
1317 // up to the amount of indentation on the terminator line.
1318 if indented {
1319 let strip = terminator_indent.unwrap_or(0);
1320 let mut body = String::new();
1321 for line in lines {
1322 let ws_count = line.len() - line.trim_start().len();
1323 let to_strip = ws_count.min(strip);
1324 body.push_str(&line[to_strip..]);
1325 body.push('\n');
1326 }
1327 Ok(body)
1328 } else {
1329 let mut body = String::new();
1330 for line in lines {
1331 body.push_str(&line);
1332 body.push('\n');
1333 }
1334 Ok(body)
1335 }
1336 }
1337
1338 fn read_identifier(&mut self) -> String {
1339 self.read_while(|c| c.is_alphanumeric() || c == '_')
1340 }
1341
1342 /// `Foo::Bar::Baz` after the leading sigil.
1343 fn read_package_qualified_identifier(&mut self) -> String {
1344 let mut s = self.read_identifier();
1345 while self.peek() == Some(':') && self.input.get(self.pos + 1) == Some(&':') {
1346 self.advance();
1347 self.advance();
1348 s.push_str("::");
1349 s.push_str(&self.read_identifier());
1350 }
1351 s
1352 }
1353
1354 /// Body lines for `format N =` … `.` (excluding the closing `.` line).
1355 fn read_format_body(&mut self) -> StrykeResult<Vec<String>> {
1356 while self.peek().is_some_and(|c| c == ' ' || c == '\t') {
1357 self.advance();
1358 }
1359 if self.peek() == Some('\n') {
1360 self.advance();
1361 }
1362 let mut lines = Vec::new();
1363 loop {
1364 let mut line = String::new();
1365 while let Some(c) = self.peek() {
1366 if c == '\n' {
1367 self.advance();
1368 break;
1369 }
1370 if c == '\r' {
1371 self.advance();
1372 if self.peek() == Some('\n') {
1373 self.advance();
1374 }
1375 break;
1376 }
1377 line.push(c);
1378 self.advance();
1379 }
1380 if line.trim() == "." {
1381 break;
1382 }
1383 lines.push(line);
1384 if self.peek().is_none() {
1385 return Err(self.syntax_err(
1386 "Unterminated format (expected '.' on its own line before end of file)",
1387 self.line,
1388 ));
1389 }
1390 }
1391 Ok(lines)
1392 }
1393
1394 fn read_variable_name(&mut self) -> String {
1395 // Handle special vars like $_, $!, $0, $/, $^I, etc.
1396 match self.peek() {
1397 // Second `$` in `$$_{` — with leading `$` already consumed, we have `$` `_` `{` → `$_` then `{`.
1398 Some('$')
1399 if self.input.get(self.pos + 1) == Some(&'_')
1400 && self.input.get(self.pos + 2) == Some(&'{') =>
1401 {
1402 self.advance(); // second $
1403 self.advance(); // `_` of `$_`
1404 "_".to_string()
1405 }
1406 // `$::{$key}` / `$::Foo` — stash access (`%::`) and package names rooted at `::` (Perl `$::` ≡ main stash).
1407 Some(':') if self.input.get(self.pos + 1) == Some(&':') => {
1408 self.advance();
1409 self.advance();
1410 let mut s = "::".to_string();
1411 if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
1412 s.push_str(&self.read_identifier());
1413 }
1414 while self.peek() == Some(':') && self.input.get(self.pos + 1) == Some(&':') {
1415 self.advance();
1416 self.advance();
1417 s.push_str("::");
1418 s.push_str(&self.read_identifier());
1419 }
1420 s
1421 }
1422 Some(c) if c.is_alphabetic() || c == '_' => {
1423 let ident = self.read_package_qualified_identifier();
1424 // `$_<`, `$_<<`, … — outer topic chain (stryke extension). Also
1425 // applies to positional slots: `$_0<<<<<`, `$_1<<<<<`, etc. The
1426 // canonical scope key is `_<<<<<` (slot 0) or `_N<<<<<` (slot N).
1427 //
1428 // Indexed-ascent shortcut: `$_<N` ≡ `$_<<<...<` (N chevrons),
1429 // with N a positive integer. `$_<3` is much more readable than
1430 // `$_<<<` past depth 2. The lexer synthesizes the chevron form
1431 // so the rest of the system (scope keys, parse) is unchanged.
1432 let is_topic_slot = ident == "_"
1433 || (ident.len() > 1
1434 && ident.starts_with('_')
1435 && ident[1..].bytes().all(|b| b.is_ascii_digit()));
1436 if is_topic_slot {
1437 let mut lts = String::new();
1438 while self.peek() == Some('<') {
1439 self.advance();
1440 lts.push('<');
1441 }
1442 // Indexed-ascent: after a single `<`, if the next chars are
1443 // digits NOT followed by `>` or `:` (which would make it a
1444 // string slice like `$_<1:5>`), expand `<N` to N chevrons.
1445 if lts.len() == 1 && self.peek().is_some_and(|c| c.is_ascii_digit()) {
1446 let mut peek_off = 0usize;
1447 while self.peek_at(peek_off).is_some_and(|c| c.is_ascii_digit()) {
1448 peek_off += 1;
1449 }
1450 let trailing = self.peek_at(peek_off);
1451 let is_slice = matches!(trailing, Some(':') | Some('>'));
1452 if !is_slice {
1453 let mut digits = String::new();
1454 for _ in 0..peek_off {
1455 if let Some(c) = self.advance() {
1456 digits.push(c);
1457 }
1458 }
1459 if let Ok(n) = digits.parse::<usize>() {
1460 if n >= 1 {
1461 // Replace the single `<` already collected
1462 // with N chevrons (we already consumed 1).
1463 for _ in 1..n {
1464 lts.push('<');
1465 }
1466 }
1467 }
1468 }
1469 }
1470 if !lts.is_empty() {
1471 return format!("{}{}", ident, lts);
1472 }
1473 }
1474 ident
1475 }
1476 Some('^') => {
1477 self.advance();
1478 // Perl `$^I`, `$^O`, … — caret plus one letter (or `^` alone).
1479 if self.peek().is_some_and(|c| c.is_alphabetic()) {
1480 let c2 = self.advance().unwrap();
1481 format!("^{}", c2)
1482 } else {
1483 "^".to_string()
1484 }
1485 }
1486 // `${name}` — must run before the punctuation branch (`{` is also listed there).
1487 Some('{') => {
1488 self.advance(); // {
1489 let name = self.read_while(|c| c != '}');
1490 if self.peek() == Some('}') {
1491 self.advance();
1492 }
1493 name
1494 }
1495 // Perl `$#name` — last index of `@name` (scalar name stored as `#name`).
1496 Some('#') => {
1497 self.advance();
1498 if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
1499 let mut name = String::from("#");
1500 name.push_str(&self.read_package_qualified_identifier());
1501 name
1502 } else {
1503 "#".to_string()
1504 }
1505 }
1506 Some(c) if "!@$&*+;',\"\\|?/<>.0123456789~%-=()[]{}".contains(c) => {
1507 self.advance();
1508 c.to_string()
1509 }
1510 _ => String::new(),
1511 }
1512 }
1513
1514 /// `${$name}` / `${$Foo::bar}` — when the braced body is a plain scalar `$identifier`, Perl treats it
1515 /// like `$$name` (scalar deref). The naive lexer otherwise yields a bogus [`Token::ScalarVar`] name
1516 /// containing a leading `$` (e.g. Try::Tiny's `${$code_ref}`).
1517 fn braced_body_symbolic_scalar_deref_name(body: &str) -> Option<&str> {
1518 let body = body.trim();
1519 let rest = body.strip_prefix('$')?;
1520 if rest.is_empty() {
1521 return None;
1522 }
1523 let mut chars = rest.chars();
1524 let c0 = chars.next()?;
1525 if !(c0.is_alphabetic() || c0 == '_') {
1526 return None;
1527 }
1528 for c in chars {
1529 if !(c.is_alphanumeric() || c == '_' || c == ':') {
1530 return None;
1531 }
1532 }
1533 Some(rest)
1534 }
1535
1536 pub fn next_token(&mut self) -> StrykeResult<Token> {
1537 self.skip_whitespace_and_comments();
1538 // Stamp the start line for [`Self::tokenize`]. Recursive calls
1539 // through POD / heredoc skip will overwrite this with the post-
1540 // skip line, so the emitted token always reports the source line
1541 // it actually lives on.
1542 self.token_start_line = self.line;
1543
1544 if self.pos >= self.input.len() {
1545 return Ok(Token::Eof);
1546 }
1547
1548 // `last_was_arrow` is consumed at most once per token: the s/tr/y/q/qq
1549 // /qw/qr/m guards check whether the IMMEDIATELY previous token was
1550 // `->`. Reset here; the Arrow / ArrowBrace return paths re-arm it
1551 // for the next `next_token` call. We snapshot before the reset so
1552 // identifier-decoding logic below can read the previous-token state
1553 // via `self.prev_arrow` (set up via a one-shot field swap).
1554 self.prev_arrow = self.last_was_arrow;
1555 self.last_was_arrow = false;
1556 self.last_was_bare_positional = false;
1557
1558 let ch = self.input[self.pos];
1559 match ch {
1560 // Variables
1561 '$' => {
1562 self.advance();
1563 // `$$foo` — symbolic scalar deref (Perl `${$foo}`-style lookup)
1564 if self.peek() == Some('$') {
1565 // `$$_{` — Perl parses as `$_->{...}` (implicit arrow on `$_`), not `$$` PID + `_`.
1566 let is_dollar_under_brace = self.input.get(self.pos + 1) == Some(&'_')
1567 && self.input.get(self.pos + 2) == Some(&'{');
1568 if !is_dollar_under_brace {
1569 self.advance();
1570 if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
1571 let name = self.read_identifier();
1572 self.last_was_term = true;
1573 return Ok(Token::DerefScalarVar(name));
1574 }
1575 // `$$` — process id (Perl `$$`)
1576 self.last_was_term = true;
1577 return Ok(Token::ScalarVar("$$".to_string()));
1578 }
1579 }
1580 let name = self.read_variable_name();
1581 if name.is_empty() {
1582 return Err(self.syntax_err("Expected variable name after $", self.line));
1583 }
1584 // `--no-interop`: reject `$a` / `$b` (Perl's reduce/sort/pair*
1585 // comparator-bind globals). Stryke's runtime also binds `$_0`
1586 // / `$_1` for the same positions; in idiomatic-only mode users
1587 // must use those instead.
1588 if crate::no_interop_mode() && (name == "a" || name == "b") {
1589 return Err(self.syntax_err(
1590 format!(
1591 "stryke uses `_` / `_1` (bareword in code) or `$_` / `$_1` \
1592 (sigil inside string interpolation / when whitespace \
1593 would change parsing) instead of `${}` (--no-interop is active)",
1594 name
1595 ),
1596 self.line,
1597 ));
1598 }
1599 self.last_was_term = true;
1600 if let Some(tail) = Self::braced_body_symbolic_scalar_deref_name(&name) {
1601 return Ok(Token::DerefScalarVar(tail.to_string()));
1602 }
1603 Ok(Token::ScalarVar(name))
1604 }
1605 '@' => {
1606 self.advance();
1607 if self.peek() == Some('-') {
1608 self.advance();
1609 self.last_was_term = true;
1610 return Ok(Token::ArrayVar("-".to_string()));
1611 }
1612 if self.peek() == Some('+') {
1613 self.advance();
1614 self.last_was_term = true;
1615 return Ok(Token::ArrayVar("+".to_string()));
1616 }
1617 if self.peek() == Some('^')
1618 && self
1619 .input
1620 .get(self.pos + 1)
1621 .is_some_and(|c| c.is_alphabetic() || *c == '_')
1622 {
1623 self.advance();
1624 let name = format!("^{}", self.read_package_qualified_identifier());
1625 self.last_was_term = true;
1626 return Ok(Token::ArrayVar(name));
1627 }
1628 if self.peek() == Some('_') || self.peek().is_some_and(|c| c.is_alphabetic()) {
1629 let name = self.read_package_qualified_identifier();
1630 self.last_was_term = true;
1631 return Ok(Token::ArrayVar(name));
1632 }
1633 self.last_was_term = false;
1634 Ok(Token::ArrayAt)
1635 }
1636 '%' if !self.last_was_term => {
1637 self.advance();
1638 // `%+` — named regex captures (Perl special hash)
1639 if self.peek() == Some('+') {
1640 self.advance();
1641 self.last_was_term = true;
1642 return Ok(Token::HashVar("+".to_string()));
1643 }
1644 if self.peek() == Some('^')
1645 && self
1646 .input
1647 .get(self.pos + 1)
1648 .is_some_and(|c| c.is_alphabetic() || *c == '_')
1649 {
1650 self.advance();
1651 let name = format!("^{}", self.read_package_qualified_identifier());
1652 self.last_was_term = true;
1653 return Ok(Token::HashVar(name));
1654 }
1655 if self.peek().is_some_and(|c| c.is_alphabetic() || c == '_') {
1656 let name = self.read_package_qualified_identifier();
1657 self.last_was_term = true;
1658 return Ok(Token::HashVar(name));
1659 }
1660 self.last_was_term = false;
1661 Ok(Token::HashPercent)
1662 }
1663
1664 // Numbers
1665 '0'..='9' => {
1666 let tok = self.read_number()?;
1667 self.last_was_term = true;
1668 Ok(tok)
1669 }
1670
1671 // Strings
1672 '\'' => {
1673 let tok = self.read_single_quoted_string()?;
1674 self.last_was_term = true;
1675 Ok(tok)
1676 }
1677 '"' => {
1678 let tok = self.read_double_quoted_string()?;
1679 self.last_was_term = true;
1680 Ok(tok)
1681 }
1682
1683 // Backtick — Perl `` `cmd` `` (qx), not a plain double-quoted string
1684 '`' => {
1685 self.advance();
1686 let cmd = self.read_escaped_until('`')?;
1687 self.last_was_term = true;
1688 Ok(Token::BacktickString(cmd))
1689 }
1690
1691 // Regex or division
1692 '/' => {
1693 if !self.last_was_term {
1694 let tok = self.read_regex()?;
1695 self.last_was_term = true;
1696 return Ok(tok);
1697 }
1698 self.advance();
1699 if self.peek() == Some('=') {
1700 self.advance();
1701 self.last_was_term = false;
1702 return Ok(Token::DivAssign);
1703 }
1704 if self.peek() == Some('/') {
1705 self.advance();
1706 if self.peek() == Some('=') {
1707 self.advance();
1708 self.last_was_term = false;
1709 return Ok(Token::DefinedOrAssign);
1710 }
1711 self.last_was_term = false;
1712 return Ok(Token::DefinedOr);
1713 }
1714 self.last_was_term = false;
1715 Ok(Token::Slash)
1716 }
1717
1718 // Operators and punctuation
1719 '+' => {
1720 self.advance();
1721 if self.peek() == Some('+') {
1722 self.advance();
1723 // Whether it was term depends on context
1724 return Ok(Token::Increment);
1725 }
1726 if self.peek() == Some('=') {
1727 self.advance();
1728 self.last_was_term = false;
1729 return Ok(Token::PlusAssign);
1730 }
1731 self.last_was_term = false;
1732 Ok(Token::Plus)
1733 }
1734 '-' => {
1735 self.advance();
1736 // File test operators: -e, -f, -d, etc.
1737 if !self.last_was_term {
1738 if let Some(c) = self.peek() {
1739 if "efdlpSszrwxoRWXOBCTMAgut".contains(c)
1740 && self.peek_at(1).is_none_or(|n| {
1741 n.is_whitespace()
1742 || n == '$'
1743 || n == '\''
1744 || n == '"'
1745 || n == '('
1746 || n == ')'
1747 || n == '}'
1748 || n == ';'
1749 || n == ','
1750 })
1751 {
1752 self.advance();
1753 self.last_was_term = false;
1754 return Ok(Token::FileTest(c));
1755 }
1756 }
1757 }
1758 if self.peek() == Some('-') {
1759 self.advance();
1760 return Ok(Token::Decrement);
1761 }
1762 if self.peek() == Some('=') {
1763 self.advance();
1764 self.last_was_term = false;
1765 return Ok(Token::MinusAssign);
1766 }
1767 if self.peek() == Some('>') {
1768 self.advance();
1769 if self.peek() == Some('>') {
1770 self.advance();
1771 self.last_was_term = false;
1772 return Ok(Token::ThreadArrowLast);
1773 }
1774 self.last_was_term = false;
1775 // Arm the arrow flag so the next identifier (e.g. `y`,
1776 // `s`, `tr`, `m`, `q…`) decodes as a method name, not
1777 // a substitution / transliteration / quote-like body.
1778 self.last_was_arrow = true;
1779 return Ok(Token::Arrow);
1780 }
1781 self.last_was_term = false;
1782 Ok(Token::Minus)
1783 }
1784 '*' => {
1785 self.advance();
1786 if self.peek() == Some('*') {
1787 self.advance();
1788 if self.peek() == Some('=') {
1789 self.advance();
1790 self.last_was_term = false;
1791 return Ok(Token::PowAssign);
1792 }
1793 self.last_was_term = false;
1794 return Ok(Token::Power);
1795 }
1796 if self.peek() == Some('=') {
1797 self.advance();
1798 self.last_was_term = false;
1799 return Ok(Token::MulAssign);
1800 }
1801 self.last_was_term = false;
1802 Ok(Token::Star)
1803 }
1804 '%' => {
1805 // Only reached when last_was_term is true (hash sigil handled above)
1806 self.advance();
1807 if self.peek() == Some('=') {
1808 self.advance();
1809 self.last_was_term = false;
1810 return Ok(Token::ModAssign);
1811 }
1812 self.last_was_term = false;
1813 Ok(Token::Percent)
1814 }
1815 '.' => {
1816 self.advance();
1817 if self.peek() == Some('.') {
1818 self.advance();
1819 if self.peek() == Some('.') {
1820 self.advance();
1821 self.last_was_term = false;
1822 return Ok(Token::RangeExclusive);
1823 }
1824 self.last_was_term = false;
1825 return Ok(Token::Range);
1826 }
1827 if self.peek() == Some('=') {
1828 self.advance();
1829 self.last_was_term = false;
1830 return Ok(Token::DotAssign);
1831 }
1832 self.last_was_term = false;
1833 Ok(Token::Dot)
1834 }
1835 '=' => {
1836 let eq_pos = self.pos;
1837 self.advance();
1838 if self.peek() == Some('=') {
1839 self.advance();
1840 self.last_was_term = false;
1841 return Ok(Token::NumEq);
1842 }
1843 if self.peek() == Some('~') {
1844 self.advance();
1845 self.last_was_term = false;
1846 return Ok(Token::BindMatch);
1847 }
1848 if self.peek() == Some('>') {
1849 self.advance();
1850 self.last_was_term = false;
1851 return Ok(Token::FatArrow);
1852 }
1853 // POD: =head1 etc — only when `=` begins the line (after optional whitespace).
1854 if self.peek().is_some_and(|c| c.is_alphabetic())
1855 && self.at_line_start_for_pod(eq_pos)
1856 {
1857 // Skip POD
1858 loop {
1859 let line = self.read_while(|c| c != '\n');
1860 if self.peek() == Some('\n') {
1861 self.advance();
1862 }
1863 if line.starts_with("=cut") || self.pos >= self.input.len() {
1864 break;
1865 }
1866 }
1867 return self.next_token();
1868 }
1869 self.last_was_term = false;
1870 Ok(Token::Assign)
1871 }
1872 '!' => {
1873 self.advance();
1874 if self.peek() == Some('=') {
1875 self.advance();
1876 self.last_was_term = false;
1877 return Ok(Token::NumNe);
1878 }
1879 if self.peek() == Some('~') {
1880 self.advance();
1881 self.last_was_term = false;
1882 return Ok(Token::BindNotMatch);
1883 }
1884 self.last_was_term = false;
1885 Ok(Token::LogNot)
1886 }
1887 '<' => {
1888 self.advance();
1889 let after_lt = self.pos;
1890 // Readline `<$fh>` (scalar handle) — must come before `<IDENT>` / numeric `<`.
1891 if self.peek() == Some('$') {
1892 self.advance();
1893 let name = self.read_variable_name();
1894 if !name.is_empty() && self.peek() == Some('>') {
1895 self.advance();
1896 self.last_was_term = true;
1897 return Ok(Token::ReadLine(name));
1898 }
1899 self.pos = after_lt;
1900 }
1901 // Diamond operator <> or <STDIN>
1902 if self.peek() == Some('>') {
1903 self.advance();
1904 self.last_was_term = true;
1905 return Ok(Token::Diamond);
1906 }
1907 if self.peek().is_some_and(|c| c.is_uppercase()) {
1908 let name = self.read_identifier();
1909 if self.peek() == Some('>') {
1910 self.advance();
1911 self.last_was_term = true;
1912 return Ok(Token::ReadLine(name));
1913 }
1914 // Not a readline, put back — this is tricky, we'll handle as less-than
1915 // followed by ident. For simplicity, return the ident separately.
1916 self.last_was_term = false;
1917 return Ok(Token::NumLt);
1918 }
1919 if self.peek() == Some('=') {
1920 self.advance();
1921 if self.peek() == Some('>') {
1922 self.advance();
1923 self.last_was_term = false;
1924 return Ok(Token::Spaceship);
1925 }
1926 self.last_was_term = false;
1927 return Ok(Token::NumLe);
1928 }
1929 if self.peek() == Some('<') {
1930 self.advance();
1931 if self.peek() == Some('=') {
1932 self.advance();
1933 self.last_was_term = false;
1934 return Ok(Token::ShiftLeftAssign);
1935 }
1936 // `<<` — binary shift after a complete term (`1 << 4`, `"x" << 2`); heredoc when a
1937 // term is expected (`print <<EOF`, `my $x = <<EOF`, after `.` / `,` / `(` …).
1938 //
1939 // `}` always sets `last_was_term=true`, but a `}` ending
1940 // a block / fn body followed by a newline and `<<TAG` on
1941 // the next line is unambiguously heredoc, not shift —
1942 // `block << bareword` is meaningless. Disambiguate by
1943 // peeking after `<<`: if the next char looks like the
1944 // start of a heredoc tag (uppercase, `_`, `~`, `"`,
1945 // `'`), prefer heredoc even when last_was_term is set.
1946 // Numeric / sigil / lowercase still falls through to
1947 // ShiftLeft so `1 << 4` and `$x << $shift` still work.
1948 let looks_like_heredoc_tag =
1949 matches!(self.peek(), Some('~') | Some('"') | Some('\'') | Some('_'),)
1950 || self.peek().is_some_and(|c| c.is_ascii_uppercase());
1951 if self.last_was_term && !looks_like_heredoc_tag {
1952 self.last_was_term = false;
1953 return Ok(Token::ShiftLeft);
1954 }
1955 let (tag, interpolate, indented) = self.read_heredoc_tag()?;
1956 let body = self.read_heredoc_body(&tag, indented)?;
1957 self.last_was_term = true;
1958 return Ok(Token::HereDoc(tag, body, interpolate));
1959 }
1960 self.last_was_term = false;
1961 Ok(Token::NumLt)
1962 }
1963 '>' => {
1964 self.advance();
1965 if self.peek() == Some('{') {
1966 self.advance();
1967 self.last_was_term = false;
1968 return Ok(Token::ArrowBrace);
1969 }
1970 if self.peek() == Some('=') {
1971 self.advance();
1972 self.last_was_term = false;
1973 return Ok(Token::NumGe);
1974 }
1975 if self.peek() == Some('>') {
1976 self.advance();
1977 if self.peek() == Some('=') {
1978 self.advance();
1979 self.last_was_term = false;
1980 return Ok(Token::ShiftRightAssign);
1981 }
1982 self.last_was_term = false;
1983 return Ok(Token::ShiftRight);
1984 }
1985 self.last_was_term = false;
1986 Ok(Token::NumGt)
1987 }
1988 '&' => {
1989 self.advance();
1990 if self.peek() == Some('&') {
1991 self.advance();
1992 if self.peek() == Some('=') {
1993 self.advance();
1994 self.last_was_term = false;
1995 return Ok(Token::AndAssign);
1996 }
1997 self.last_was_term = false;
1998 return Ok(Token::LogAnd);
1999 }
2000 if self.peek() == Some('=') {
2001 self.advance();
2002 self.last_was_term = false;
2003 return Ok(Token::BitAndAssign);
2004 }
2005 self.last_was_term = false;
2006 Ok(Token::BitAnd)
2007 }
2008 '|' => {
2009 self.advance();
2010 if self.peek() == Some('|') {
2011 self.advance();
2012 if self.peek() == Some('=') {
2013 self.advance();
2014 self.last_was_term = false;
2015 return Ok(Token::OrAssign);
2016 }
2017 self.last_was_term = false;
2018 return Ok(Token::LogOr);
2019 }
2020 if self.peek() == Some('=') {
2021 self.advance();
2022 self.last_was_term = false;
2023 return Ok(Token::BitOrAssign);
2024 }
2025 if self.peek() == Some('>') {
2026 self.advance();
2027 self.last_was_term = false;
2028 return Ok(Token::PipeForward);
2029 }
2030 self.last_was_term = false;
2031 Ok(Token::BitOr)
2032 }
2033 '^' => {
2034 self.advance();
2035 if self.peek() == Some('=') {
2036 self.advance();
2037 self.last_was_term = false;
2038 return Ok(Token::XorAssign);
2039 }
2040 self.last_was_term = false;
2041 Ok(Token::BitXor)
2042 }
2043 '~' => {
2044 self.advance();
2045 if self.peek() == Some('>') {
2046 self.advance();
2047 if self.peek() == Some('>') {
2048 self.advance();
2049 self.last_was_term = false;
2050 return Ok(Token::ThreadArrowLast);
2051 }
2052 self.last_was_term = false;
2053 return Ok(Token::ThreadArrow);
2054 }
2055 // `~s>` (streaming thread-first) / `~s>>` (streaming thread-last)
2056 // — per-item streaming thread-macros that lower to
2057 // `par_pipeline_streaming`: each stage runs in its own worker
2058 // connected by bounded channels, items flow one-at-a-time.
2059 if self.peek() == Some('s') && self.peek_at(1) == Some('>') {
2060 self.advance(); // consume 's'
2061 self.advance(); // consume first '>'
2062 if self.peek() == Some('>') {
2063 self.advance(); // consume second '>'
2064 self.last_was_term = false;
2065 return Ok(Token::ThreadArrowStreamLast);
2066 }
2067 self.last_was_term = false;
2068 return Ok(Token::ThreadArrowStream);
2069 }
2070 // `~p>` (parallel-chunk thread-first) / `~p>>` (thread-last)
2071 // — sugar for `par_reduce { stage1 |> stage2 |> ... } SOURCE`.
2072 // `||>` or `|then|` mid-pipeline switches back to a normal
2073 // `~>` continuation operating on the merged result.
2074 if self.peek() == Some('p') && self.peek_at(1) == Some('>') {
2075 self.advance(); // consume 'p'
2076 self.advance(); // consume first '>'
2077 if self.peek() == Some('>') {
2078 self.advance(); // consume second '>'
2079 self.last_was_term = false;
2080 return Ok(Token::ThreadArrowParLast);
2081 }
2082 self.last_was_term = false;
2083 return Ok(Token::ThreadArrowPar);
2084 }
2085 // `~d>` (distributed thread-first) / `~d>>` (thread-last) —
2086 // mirrors `~p>` but each chunk is shipped to a remote worker
2087 // on a cluster. Syntax: `~d> on $cluster SOURCE stages...`.
2088 if self.peek() == Some('d') && self.peek_at(1) == Some('>') {
2089 self.advance(); // consume 'd'
2090 self.advance(); // consume first '>'
2091 if self.peek() == Some('>') {
2092 self.advance(); // consume second '>'
2093 self.last_was_term = false;
2094 return Ok(Token::ThreadArrowDistLast);
2095 }
2096 self.last_was_term = false;
2097 return Ok(Token::ThreadArrowDist);
2098 }
2099 self.last_was_term = false;
2100 Ok(Token::BitNot)
2101 }
2102 '?' => {
2103 self.advance();
2104 self.last_was_term = false;
2105 Ok(Token::Question)
2106 }
2107 ':' => {
2108 self.advance();
2109 if self.peek() == Some(':') {
2110 self.advance();
2111 // IPv6 zero-compressed prefix: `::1`, `::ffff:c000:280`.
2112 // Only fires in term position (where `Pkg::ident` is
2113 // impossible) and only when the chars after `::` form a
2114 // valid IPv6 by Rust's parser. Skip when the `::` lives
2115 // inside `[…]` — that's array-slice step syntax
2116 // (`@a[::2]`, `@a[::-1]`), not an address.
2117 let in_bracket_subscript =
2118 self.input.get(self.pos.saturating_sub(3)).copied() == Some('[');
2119 if !self.last_was_term && !in_bracket_subscript {
2120 let saved = self.pos - 2;
2121 if let Some(consumed) = self.try_consume_ipv6_tail(saved) {
2122 self.last_was_term = true;
2123 return Ok(Token::DoubleString(consumed));
2124 }
2125 }
2126 self.last_was_term = false;
2127 return Ok(Token::PackageSep);
2128 }
2129 self.last_was_term = false;
2130 Ok(Token::Colon)
2131 }
2132 '\\' => {
2133 self.advance();
2134 // Backslash-newline: line continuation (shell-style)
2135 // Don't increment line — continued line is logically part of the same line
2136 if self.peek() == Some('\n') {
2137 self.pos += 1; // skip newline without incrementing self.line
2138 return self.next_token();
2139 }
2140 self.last_was_term = false;
2141 Ok(Token::Backslash)
2142 }
2143 ',' => {
2144 self.advance();
2145 self.last_was_term = false;
2146 Ok(Token::Comma)
2147 }
2148 ';' => {
2149 self.advance();
2150 self.last_was_term = false;
2151 Ok(Token::Semicolon)
2152 }
2153 '(' => {
2154 self.advance();
2155 self.last_was_term = false;
2156 Ok(Token::LParen)
2157 }
2158 ')' => {
2159 self.advance();
2160 self.last_was_term = true;
2161 Ok(Token::RParen)
2162 }
2163 '[' => {
2164 self.advance();
2165 self.last_was_term = false;
2166 Ok(Token::LBracket)
2167 }
2168 ']' => {
2169 self.advance();
2170 self.last_was_term = true;
2171 Ok(Token::RBracket)
2172 }
2173 '{' => {
2174 self.advance();
2175 self.last_was_term = false;
2176 Ok(Token::LBrace)
2177 }
2178 '}' => {
2179 self.advance();
2180 self.last_was_term = true;
2181 Ok(Token::RBrace)
2182 }
2183
2184 // Identifiers and keywords
2185 c if c.is_alphabetic() || c == '_' => {
2186 let ident_start = self.pos;
2187 let mut ident = self.read_identifier();
2188
2189 // IPv6 lookahead for hex-letter prefixes: `fe80::1`, `abcd::ff`,
2190 // `dead:beef::1`, etc. Only fires when the just-consumed
2191 // identifier is a valid 1..=4 hex-digit group (i.e. could be
2192 // an IPv6 segment) AND the next char is `:`. Speculatively
2193 // greedily consumes hex / `:` / `::` and asks Rust's
2194 // `Ipv6Addr` parser to validate; on failure restores `pos`
2195 // so the identifier-as-bareword path runs unchanged.
2196 //
2197 // Skip when we're already in the middle of a package-qualified
2198 // path: `package A::B::C` lexes "A" then "::", and at that
2199 // point the next ident "B" must NOT be IPv6-trapped — `B::C`
2200 // is the rest of the package name, not the address
2201 // `0:0:0:0:0:0:B:C`. Same rule for `Foo::Bar::baz` mid-stream
2202 // with hex letters that happen to look like an address.
2203 let after_package_sep = ident_start >= 2
2204 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2205 && self.input.get(ident_start.saturating_sub(1)) == Some(&':');
2206 if !after_package_sep
2207 && self.peek() == Some(':')
2208 && ident.len() <= 4
2209 && ident.chars().all(|ch| ch.is_ascii_hexdigit())
2210 {
2211 if let Some(consumed) = self.try_consume_ipv6_tail(ident_start) {
2212 self.last_was_term = true;
2213 return Ok(Token::DoubleString(consumed));
2214 }
2215 }
2216
2217 // Outer-topic chain in bare form: `_<<<<` (slot 0) and
2218 // `_N<<<<` (slot N). Greedy consume `<` chevrons immediately
2219 // following `_` or `_<digits>`. This is what makes
2220 // `_<` ≡ `$_<` ≡ `_0<` ≡ `$_0<` work without a sigil.
2221 // Stryke power-user note: `_ < 5` (with whitespace) still
2222 // tokenizes as topic-then-less-than; only `_<` with no
2223 // intervening space becomes a topic-slot identifier.
2224 let is_topic_slot = ident == "_"
2225 || (ident.len() > 1
2226 && ident.starts_with('_')
2227 && ident[1..].bytes().all(|b| b.is_ascii_digit()));
2228 if is_topic_slot {
2229 // Greedy `<` chevrons for the outer-topic chain, BUT only
2230 // when the chevron run isn't followed by a slice index.
2231 // `_<1:5>` is a string slice; `_<<<<<` is the 5-deep
2232 // outer-topic. Disambiguate by peeking past the run: if
2233 // the first non-`<` char is a digit, `-`, `:`, or `>`,
2234 // we're in a slice — bail out and let the parser handle
2235 // `<...>` as postfix subscript.
2236 //
2237 // Indexed-ascent shortcut: `_<N` ≡ `_<<<...<` (N chevrons)
2238 // when N is digits NOT followed by `>` or `:`. So `_<3` is
2239 // a depth-3 reference (more readable than `_<<<`), while
2240 // `_<3>` and `_<3:5>` remain string slices.
2241 let mut peek_off = 0usize;
2242 while self.peek_at(peek_off) == Some('<') {
2243 peek_off += 1;
2244 }
2245 let trailing = self.peek_at(peek_off);
2246 // Single `<` followed by digits: try indexed-ascent first.
2247 // Only triggers for one-chevron runs because `_<<3` would
2248 // mean "depth 2 of position 3" (which is not how the
2249 // grammar works) — we only allow `_<digits` at depth 1.
2250 let mut indexed_ascent: Option<usize> = None;
2251 if peek_off == 1 && trailing.is_some_and(|c: char| c.is_ascii_digit()) {
2252 let mut off = 1usize;
2253 while self.peek_at(off).is_some_and(|c| c.is_ascii_digit()) {
2254 off += 1;
2255 }
2256 let after_digits = self.peek_at(off);
2257 let still_a_slice = matches!(after_digits, Some(':') | Some('>'));
2258 if !still_a_slice {
2259 // Parse the digit run.
2260 let mut digits = String::new();
2261 for k in 1..off {
2262 if let Some(c) = self.peek_at(k) {
2263 digits.push(c);
2264 }
2265 }
2266 if let Ok(n) = digits.parse::<usize>() {
2267 if n >= 1 {
2268 indexed_ascent = Some(n);
2269 // Consume `<` + the digits.
2270 self.advance();
2271 for _ in 1..off {
2272 self.advance();
2273 }
2274 }
2275 }
2276 }
2277 }
2278 if let Some(n) = indexed_ascent {
2279 for _ in 0..n {
2280 ident.push('<');
2281 }
2282 } else {
2283 let is_slice = peek_off > 0
2284 && matches!(trailing, Some(c) if c.is_ascii_digit() || c == '-' || c == ':' || c == '>');
2285 if !is_slice {
2286 for _ in 0..peek_off {
2287 self.advance();
2288 ident.push('<');
2289 }
2290 }
2291 }
2292 // `_N` (underscore + digits, ≥ 1 digit) is a reserved
2293 // positional-alias name — never a function name. Emit
2294 // ScalarVar directly so bareword `_1`, `_2`, ... in
2295 // expression position resolves to the scalar slot
2296 // instead of being looked up as a sub call. Bare `_`
2297 // alone (without digits) keeps Ident shape so the
2298 // existing topic/bareword machinery still runs.
2299 if ident.len() > 1
2300 && ident.starts_with('_')
2301 && ident.chars().nth(1).is_some_and(|c| c.is_ascii_digit())
2302 {
2303 self.last_was_term = true;
2304 self.last_was_bare_positional = true;
2305 return Ok(Token::ScalarVar(ident));
2306 }
2307 // Also reserve bare `_<+` (chevron-only topic ascent on
2308 // slot 0) — these are never sub names.
2309 if ident.starts_with('_') && ident.contains('<') {
2310 self.last_was_term = true;
2311 self.last_was_bare_positional = true;
2312 return Ok(Token::ScalarVar(ident));
2313 }
2314 }
2315
2316 // Special multi-char constructs
2317 match ident.as_str() {
2318 "format" => {
2319 // `$obj->format` — method call, not format declaration.
2320 if self.prev_arrow {
2321 self.last_was_term = true;
2322 return Ok(Token::Ident(ident));
2323 }
2324 // `Foo::format` — namespaced identifier tail.
2325 if ident_start >= 2
2326 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2327 && self.input.get(ident_start.saturating_sub(1)) == Some(&':')
2328 {
2329 self.last_was_term = true;
2330 return Ok(Token::Ident(ident));
2331 }
2332 // Hash-key bareword contexts: `$h{format}`, `{format => ...}`,
2333 // `$h{format,...}`. The char immediately before `format`
2334 // (modulo whitespace) being `{` means we're inside a
2335 // hash-subscript or hash-literal — `format` is the key,
2336 // not the FORMAT keyword.
2337 {
2338 let mut p = ident_start;
2339 while p > 0 {
2340 match self.input.get(p - 1) {
2341 Some(&' ') | Some(&'\t') => p -= 1,
2342 _ => break,
2343 }
2344 }
2345 if p > 0 && self.input.get(p - 1) == Some(&'{') {
2346 self.last_was_term = true;
2347 return Ok(Token::Ident(ident));
2348 }
2349 }
2350 // Lookahead-disambiguation: shapes that prove
2351 // `format` is a bareword, not a declaration keyword.
2352 // `}` / `,` / `;` / `)` / `]` — list/hash/expr context.
2353 // `=>` — autoquoted hash key.
2354 // `(` — function call (format() / format($x)).
2355 // EOF — bare ident at end of input.
2356 // A real `format NAME = ... .` declaration always has
2357 // an identifier between `format` and `=`; if no
2358 // identifier is found at the expected slot, it's
2359 // a bareword.
2360 {
2361 let saved_pos = self.pos;
2362 let saved_line = self.line;
2363 self.skip_whitespace_only();
2364 let bare = match self.peek() {
2365 None => true,
2366 Some(',' | ';' | ')' | ']' | '}' | '(') => true,
2367 Some('=') if self.peek_at(1) == Some('>') => true,
2368 Some(c) if !c.is_alphabetic() && c != '_' => true,
2369 _ => false,
2370 };
2371 self.pos = saved_pos;
2372 self.line = saved_line;
2373 if bare {
2374 self.last_was_term = true;
2375 return Ok(Token::Ident(ident));
2376 }
2377 }
2378 self.skip_whitespace_and_comments();
2379 let fname = self.read_package_qualified_identifier();
2380 self.skip_whitespace_and_comments();
2381 if self.peek() != Some('=') {
2382 return Err(
2383 self.syntax_err("Expected '=' after format name", self.line)
2384 );
2385 }
2386 self.advance();
2387 let lines = self.read_format_body()?;
2388 self.last_was_term = false;
2389 return Ok(Token::FormatDecl { name: fname, lines });
2390 }
2391 "r" if self.peek() == Some('"')
2392 && self.peek_at(1) == Some('"')
2393 && self.peek_at(2) == Some('"') =>
2394 {
2395 // `r"""..."""` — raw triple-quoted string. No
2396 // interpolation, no backslash escapes; every byte
2397 // is copied verbatim until the closing `"""`.
2398 // Only triggers when `r` is followed IMMEDIATELY
2399 // by three quotes — `r 5`, `r->foo`, `r(...)` etc.
2400 // still hit the generic identifier path below.
2401 self.advance(); // 1st "
2402 self.advance(); // 2nd "
2403 self.advance(); // 3rd "
2404 let s = self.read_triple_quoted_body(false)?;
2405 self.last_was_term = true;
2406 return Ok(Token::SingleString(s));
2407 }
2408 "qw" => {
2409 // After `->`, `qw` is a method name, not a quote-word list.
2410 if self.prev_arrow {
2411 self.last_was_term = true;
2412 return Ok(Token::Ident(ident));
2413 }
2414 // `qw` followed by `=>` is an autoquoted hash key, not qw().
2415 let start_pos = self.pos;
2416 let start_line = self.line;
2417 self.skip_whitespace_only();
2418 if let Some(c) = self.peek() {
2419 if c == '=' && self.peek_at(1) == Some('>') {
2420 self.pos = start_pos;
2421 self.line = start_line;
2422 self.last_was_term = true;
2423 return Ok(Token::Ident(ident));
2424 }
2425 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
2426 self.pos = start_pos;
2427 self.line = start_line;
2428 self.last_was_term = true;
2429 return Ok(Token::Ident(ident));
2430 }
2431 }
2432 self.pos = start_pos; // restore for read_qw
2433 self.line = start_line;
2434 let tok = self.read_qw()?;
2435 self.last_was_term = true;
2436 return Ok(tok);
2437 }
2438 "qq" | "q" => {
2439 // After `->`, `q` / `qq` are method names, not quote operators.
2440 if self.prev_arrow {
2441 self.last_was_term = true;
2442 return Ok(Token::Ident(ident));
2443 }
2444 // After `::`, treat as namespaced identifier (`Foo::q`, `Foo::qq`).
2445 if ident_start >= 2
2446 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2447 && self.input.get(ident_start.saturating_sub(1)) == Some(&':')
2448 {
2449 self.last_was_term = true;
2450 return Ok(Token::Ident(ident));
2451 }
2452 // `q` / `qq` followed by `=>` is an autoquoted hash key, not a quote operator.
2453 // Also treat as identifier if followed by terminators like `;`, `,`, `)`, etc.
2454 // Must check AFTER skipping whitespace to handle `q => 5`.
2455 let start_pos = self.pos;
2456 let start_line = self.line;
2457 self.skip_whitespace_only();
2458 if let Some(c) = self.peek() {
2459 // `=` followed by `>` is fat comma — `q` is a bareword key
2460 if c == '=' && self.peek_at(1) == Some('>') {
2461 self.pos = start_pos; // restore position
2462 self.line = start_line;
2463 self.last_was_term = true;
2464 return Ok(Token::Ident(ident));
2465 }
2466 // Other terminators: `q` is an identifier
2467 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
2468 self.pos = start_pos;
2469 self.line = start_line;
2470 self.last_was_term = true;
2471 return Ok(Token::Ident(ident));
2472 }
2473 }
2474 let delim = self.advance().ok_or_else(|| {
2475 self.syntax_err("Expected delimiter after q/qq", self.line)
2476 })?;
2477 let close = match delim {
2478 '(' => ')',
2479 '[' => ']',
2480 '{' => '}',
2481 '<' => '>',
2482 c => c,
2483 };
2484 let s = if matches!(delim, '(' | '[' | '{' | '<') {
2485 self.read_q_qq_balanced_body(delim, close, ident == "qq")?
2486 } else {
2487 self.read_escaped_until(close)?
2488 };
2489 self.last_was_term = true;
2490 if ident == "qq" {
2491 return Ok(Token::DoubleString(s));
2492 }
2493 return Ok(Token::SingleString(s));
2494 }
2495 "qx" => {
2496 // After `->`, `qx` is a method name, not a backtick command.
2497 if self.prev_arrow {
2498 self.last_was_term = true;
2499 return Ok(Token::Ident(ident));
2500 }
2501 // After `::`, treat as namespaced identifier (`Foo::qx`).
2502 if ident_start >= 2
2503 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2504 && self.input.get(ident_start.saturating_sub(1)) == Some(&':')
2505 {
2506 self.last_was_term = true;
2507 return Ok(Token::Ident(ident));
2508 }
2509 // `qx` followed by `=>` is an autoquoted hash key.
2510 let start_pos = self.pos;
2511 let start_line = self.line;
2512 self.skip_whitespace_only();
2513 if let Some(c) = self.peek() {
2514 if c == '=' && self.peek_at(1) == Some('>') {
2515 self.pos = start_pos;
2516 self.line = start_line;
2517 self.last_was_term = true;
2518 return Ok(Token::Ident(ident));
2519 }
2520 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
2521 self.pos = start_pos;
2522 self.line = start_line;
2523 self.last_was_term = true;
2524 return Ok(Token::Ident(ident));
2525 }
2526 }
2527 let delim = self.advance().ok_or_else(|| {
2528 self.syntax_err("Expected delimiter after qx", self.line)
2529 })?;
2530 let close = match delim {
2531 '(' => ')',
2532 '[' => ']',
2533 '{' => '}',
2534 '<' => '>',
2535 c => c,
2536 };
2537 let s = self.read_escaped_until(close)?;
2538 self.last_was_term = true;
2539 return Ok(Token::BacktickString(s));
2540 }
2541 "qr" => {
2542 // After `->`, `qr` is a method name, not a quoted regex.
2543 if self.prev_arrow {
2544 self.last_was_term = true;
2545 return Ok(Token::Ident(ident));
2546 }
2547 // After `::`, treat as namespaced identifier (`Foo::qr`).
2548 if ident_start >= 2
2549 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2550 && self.input.get(ident_start.saturating_sub(1)) == Some(&':')
2551 {
2552 self.last_was_term = true;
2553 return Ok(Token::Ident(ident));
2554 }
2555 // `qr` followed by `=>` is an autoquoted hash key.
2556 let start_pos = self.pos;
2557 let start_line = self.line;
2558 self.skip_whitespace_only();
2559 if let Some(c) = self.peek() {
2560 if c == '=' && self.peek_at(1) == Some('>') {
2561 self.pos = start_pos;
2562 self.line = start_line;
2563 self.last_was_term = true;
2564 return Ok(Token::Ident(ident));
2565 }
2566 if matches!(c, ';' | ',' | ')' | ']' | '}' | '\n') {
2567 self.pos = start_pos;
2568 self.line = start_line;
2569 self.last_was_term = true;
2570 return Ok(Token::Ident(ident));
2571 }
2572 }
2573 let delim = self.advance().ok_or_else(|| {
2574 self.syntax_err("Expected delimiter after qr", self.line)
2575 })?;
2576 let close = match delim {
2577 '(' => ')',
2578 '[' => ']',
2579 '{' => '}',
2580 '<' => '>',
2581 c => c,
2582 };
2583 // Regex pattern: preserve backslash escapes raw so the
2584 // regex engine sees `\$`, `\@`, `\d`, etc. as written.
2585 // Do NOT route through `read_escaped_until` — that's
2586 // for double-quoted strings and rewrites `\$` to a
2587 // private-use sentinel that the regex compiler can't
2588 // decode (would silently strip the `$`).
2589 let mut pattern = String::new();
2590 loop {
2591 match self.advance() {
2592 Some('\\') => {
2593 pattern.push('\\');
2594 if let Some(c) = self.advance() {
2595 pattern.push(c);
2596 }
2597 }
2598 Some(c) if c == close => break,
2599 Some(c) => pattern.push(c),
2600 None => {
2601 return Err(self.syntax_err("Unterminated qr regex", self.line))
2602 }
2603 }
2604 }
2605 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
2606 self.last_was_term = true;
2607 return Ok(Token::Regex(pattern, flags, delim));
2608 }
2609 "m" => {
2610 // After `->`, `m` is a method name, not a regex match.
2611 if self.prev_arrow {
2612 self.last_was_term = true;
2613 return Ok(Token::Ident(ident));
2614 }
2615 // `Foo::m` — after `::`, `m` is the tail of a namespaced
2616 // identifier, never a regex-match operator.
2617 if ident_start >= 2
2618 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2619 && self.input.get(ident_start.saturating_sub(1)) == Some(&':')
2620 {
2621 self.last_was_term = true;
2622 return Ok(Token::Ident(ident));
2623 }
2624 // `m` followed by terminators is a bareword, not match operator.
2625 // Must check AFTER skipping whitespace to handle `m => "val"`.
2626 let start_pos = self.pos;
2627 let start_line = self.line;
2628 self.skip_whitespace_only();
2629 if let Some(d) = self.peek() {
2630 if d == '=' && self.peek_at(1) == Some('>') {
2631 self.pos = start_pos;
2632 self.line = start_line;
2633 self.last_was_term = true;
2634 return Ok(Token::Ident(ident));
2635 }
2636 if matches!(d, ';' | ',' | ')' | ']' | '}' | '>' | ':' | '\n') {
2637 self.pos = start_pos;
2638 self.line = start_line;
2639 self.last_was_term = true;
2640 return Ok(Token::Ident(ident));
2641 }
2642 }
2643 self.pos = start_pos;
2644 self.line = start_line;
2645 // m/pattern/flags — try parsing as regex, but backtrack if
2646 // unterminated (handles thread stages where `/m/` is a grep filter)
2647 if self.suppress_m_regex == 0 {
2648 if let Some(delim) = self.peek() {
2649 if !delim.is_alphanumeric() && delim != '_' {
2650 // Save state for backtracking
2651 let saved_pos = self.pos;
2652 let saved_line = self.line;
2653 self.advance(); // consume delimiter
2654 let close = match delim {
2655 '(' => ')',
2656 '[' => ']',
2657 '{' => '}',
2658 '<' => '>',
2659 c => c,
2660 };
2661 let mut pattern = String::new();
2662 let mut terminated = true;
2663 loop {
2664 match self.advance() {
2665 Some('\\') => {
2666 pattern.push('\\');
2667 if let Some(c) = self.advance() {
2668 pattern.push(c);
2669 }
2670 }
2671 Some(c) if c == close => break,
2672 Some(c) if c == '\n' && close == '/' => {
2673 // Newline before closing / — not a valid m//
2674 terminated = false;
2675 break;
2676 }
2677 Some(c) => pattern.push(c),
2678 None => {
2679 return Err(self.syntax_err(
2680 "Search pattern not terminated",
2681 saved_line,
2682 ));
2683 }
2684 }
2685 }
2686 if terminated {
2687 let flags =
2688 self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
2689 self.last_was_term = true;
2690 return Ok(Token::Regex(pattern, flags, delim));
2691 }
2692 // Newline before closing / — backtrack and treat `m` as identifier
2693 self.pos = saved_pos;
2694 self.line = saved_line;
2695 }
2696 }
2697 }
2698 // Just the identifier 'm'
2699 self.last_was_term = true;
2700 return Ok(Token::Ident(ident));
2701 }
2702 "s" => {
2703 // `$obj->s` / `$obj->s(...)` — after `->`, `s` is a method name.
2704 if self.prev_arrow {
2705 self.last_was_term = true;
2706 return Ok(Token::Ident(ident));
2707 }
2708 // `Foo::s` / `Foo::Bar::s` — after `::`, `s` is the tail
2709 // segment of a namespaced identifier, never substitution.
2710 // Same check shape as the `after_package_sep` guard computed for the IPv6
2711 // lookahead at the top of this identifier arm: previous two chars are `::`.
2712 if ident_start >= 2
2713 && self.input.get(ident_start.saturating_sub(2)) == Some(&':')
2714 && self.input.get(ident_start.saturating_sub(1)) == Some(&':')
2715 {
2716 self.last_was_term = true;
2717 return Ok(Token::Ident(ident));
2718 }
2719 // `s` followed by terminators is a bareword, not substitution.
2720 // Must check AFTER skipping whitespace to handle `s => "val"`.
2721 // `,` is treated as a terminator UNLESS the lookahead shows the
2722 // full `s,PAT,REPL,FLAGS` shape (≥ 2 more commas before the
2723 // statement ends) — that gates the comma-delim case to genuine
2724 // substitutions like `perl -pe 's,\bt\b,b,g'` while leaving
2725 // bareword `s` alone in struct fields, list literals, and
2726 // function args.
2727 let start_pos = self.pos;
2728 let start_line = self.line;
2729 self.skip_whitespace_only();
2730 if let Some(d) = self.peek() {
2731 if d == '=' && self.peek_at(1) == Some('>') {
2732 self.pos = start_pos;
2733 self.line = start_line;
2734 self.last_was_term = true;
2735 return Ok(Token::Ident(ident));
2736 }
2737 if matches!(d, ';' | ')' | ']' | '}' | '>' | ':' | '\n') {
2738 self.pos = start_pos;
2739 self.line = start_line;
2740 self.last_was_term = true;
2741 return Ok(Token::Ident(ident));
2742 }
2743 if d == ',' && !self.lookahead_is_comma_delim_subst() {
2744 self.pos = start_pos;
2745 self.line = start_line;
2746 self.last_was_term = true;
2747 return Ok(Token::Ident(ident));
2748 }
2749 }
2750 self.pos = start_pos;
2751 self.line = start_line;
2752 // s/pattern/replacement/flags
2753 if let Some(delim) = self.peek() {
2754 if !delim.is_alphanumeric() && delim != '_' && delim != ' ' {
2755 self.advance();
2756 let close = match delim {
2757 '(' => ')',
2758 '[' => ']',
2759 '{' => '}',
2760 '<' => '>',
2761 c => c,
2762 };
2763 let mut pattern = String::new();
2764 loop {
2765 match self.advance() {
2766 Some('\\') => {
2767 pattern.push('\\');
2768 if let Some(c) = self.advance() {
2769 pattern.push(c);
2770 }
2771 }
2772 Some(c) if c == close => break,
2773 Some(c) => pattern.push(c),
2774 None => {
2775 return Err(self.syntax_err(
2776 "Unterminated s/// pattern",
2777 self.line,
2778 ))
2779 }
2780 }
2781 }
2782 // For paired delimiters, read the opening of the replacement part
2783 if "([{<".contains(delim) {
2784 self.skip_whitespace_only();
2785 let open2 = self.advance().unwrap_or(delim);
2786 let close = match open2 {
2787 '(' => ')',
2788 '[' => ']',
2789 '{' => '}',
2790 '<' => '>',
2791 c => c,
2792 };
2793 let replacement = self.read_substitution_replacement(close)?;
2794 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
2795 self.last_was_term = true;
2796 // Encode as special token — parser will decode
2797 // Format: \x00s\x00pattern\x00replacement\x00flags\x00delim
2798 return Ok(Token::Ident(format!(
2799 "\x00s\x00{}\x00{}\x00{}\x00{}",
2800 pattern, replacement, flags, delim
2801 )));
2802 }
2803 let replacement = self.read_substitution_replacement(close)?;
2804 let flags = self.read_while(|c| REGEX_FLAG_CHARS.contains(c));
2805 self.last_was_term = true;
2806 return Ok(Token::Ident(format!(
2807 "\x00s\x00{}\x00{}\x00{}\x00{}",
2808 pattern, replacement, flags, delim
2809 )));
2810 }
2811 }
2812 self.last_was_term = true;
2813 return Ok(Token::Ident(ident));
2814 }
2815 "tr" | "y" => {
2816 // `$obj->tr` / `$obj->y` — after `->`, this is a method name,
2817 // not transliteration.
2818 if self.prev_arrow {
2819 self.last_was_term = true;
2820 return Ok(Token::Ident(ident));
2821 }
2822 // After `::`, treat as package-qualified identifier, not transliteration.
2823 // e.g. `Foo::y(...)` is a function call, not `y///`.
2824 if self.pos >= ident.len() + 2 {
2825 let prev_start = self.pos - ident.len() - 2;
2826 if self.input.get(prev_start) == Some(&':')
2827 && self.input.get(prev_start + 1) == Some(&':')
2828 {
2829 self.last_was_term = true;
2830 return Ok(Token::Ident(ident));
2831 }
2832 }
2833 // `tr` / `y` followed by terminators is a bareword, not transliteration.
2834 // Check BEFORE skipping whitespace to catch newlines (implicit semicolon).
2835 // `,` is treated as a terminator UNLESS the lookahead shows the
2836 // full `tr,FROM,TO,FLAGS` shape — same gating as `s` above so
2837 // `y` / `tr` can still appear as struct field names, list elements,
2838 // and arg names without being eaten as transliteration bodies.
2839 if let Some(d) = self.peek() {
2840 if matches!(d, ';' | ')' | ']' | '}' | '>' | ':' | '\n') {
2841 self.last_was_term = true;
2842 return Ok(Token::Ident(ident));
2843 }
2844 if d == ',' && !self.lookahead_is_comma_delim_subst() {
2845 self.last_was_term = true;
2846 return Ok(Token::Ident(ident));
2847 }
2848 } else {
2849 self.last_was_term = true;
2850 return Ok(Token::Ident(ident));
2851 }
2852 // Now skip whitespace to check for `=>` or `=`
2853 let start_pos = self.pos;
2854 let start_line = self.line;
2855 self.skip_whitespace_only();
2856 if let Some(d) = self.peek() {
2857 // `=` alone (not `==` comparison) means assignment — y is an identifier
2858 if d == '=' && self.peek_at(1) != Some('=') {
2859 self.pos = start_pos;
2860 self.line = start_line;
2861 self.last_was_term = true;
2862 return Ok(Token::Ident(ident));
2863 }
2864 }
2865 self.pos = start_pos;
2866 self.line = start_line;
2867 // Check for function signature pattern: y(...) { — this is `fn y`, not tr
2868 if self.peek() == Some('(') {
2869 // Scan ahead to see if there's ) followed by {
2870 let scan_pos = self.pos;
2871 let scan_line = self.line;
2872 self.advance(); // skip (
2873 let mut depth = 1;
2874 while depth > 0 {
2875 match self.peek() {
2876 Some('(') => {
2877 self.advance();
2878 depth += 1;
2879 }
2880 Some(')') => {
2881 self.advance();
2882 depth -= 1;
2883 }
2884 Some(_) => {
2885 self.advance();
2886 }
2887 None => break,
2888 }
2889 }
2890 self.skip_whitespace_only();
2891 let is_func_def = self.peek() == Some('{');
2892 self.pos = scan_pos;
2893 self.line = scan_line;
2894 if is_func_def {
2895 self.last_was_term = true;
2896 return Ok(Token::Ident(ident));
2897 }
2898 }
2899 // tr/from/to/flags
2900 if let Some(delim) = self.peek() {
2901 if !delim.is_alphanumeric() && delim != '_' && delim != ' ' {
2902 self.advance();
2903 let close = match delim {
2904 '(' => ')',
2905 '[' => ']',
2906 '{' => '}',
2907 '<' => '>',
2908 c => c,
2909 };
2910 let from = self.read_escaped_until(close)?;
2911 // For paired delimiters
2912 if "([{<".contains(delim) {
2913 self.skip_whitespace_only();
2914 self.advance(); // open second pair
2915 }
2916 let to = self.read_escaped_until(close)?;
2917 let flags = self.read_while(|c| "cdsr".contains(c));
2918 self.last_was_term = true;
2919 return Ok(Token::Ident(format!(
2920 "\x00tr\x00{}\x00{}\x00{}\x00{}",
2921 from, to, flags, delim
2922 )));
2923 }
2924 }
2925 self.last_was_term = true;
2926 return Ok(Token::Ident(ident));
2927 }
2928 _ => {}
2929 }
2930
2931 // Fat arrow lookahead: ident followed by => is a string.
2932 // CRITICAL: save AND restore `self.line` too. `skip_whitespace_and_comments`
2933 // increments `self.line` for each `\n` it skips, and restoring only
2934 // `self.pos` leaves the line counter drifted by N. The drift bleeds into
2935 // every subsequent token (every `class { field\n field\n }` line shift
2936 // appearing on `my` / `p` / etc. after the class, and bytecode `lines[]`
2937 // metadata that no longer matches the source — breaking the DAP debugger
2938 // since BPs key off original line numbers).
2939 let saved_pos2 = self.pos;
2940 let saved_line2 = self.line;
2941 self.skip_whitespace_and_comments();
2942 if self.peek() == Some('=') && self.peek_at(1) == Some('>') {
2943 self.pos = saved_pos2;
2944 self.line = saved_line2;
2945 self.last_was_term = true;
2946 return Ok(Token::Ident(ident));
2947 }
2948 self.pos = saved_pos2;
2949 self.line = saved_line2;
2950
2951 // Perl: `x` is the string-repetition infix operator only after a complete term.
2952 // After `sub`, `package`, `(`, etc. a term is expected — bare `x` must be an
2953 // identifier (`sub x {`, `x::Foo`, leading `x` in `(x)`).
2954 let tok = if ident == "x" && !self.last_was_term {
2955 Token::Ident("x".to_string())
2956 } else {
2957 keyword_or_ident(&ident)
2958 };
2959 // `x=` is the string-repetition compound assignment. The
2960 // identifier `x` has already been consumed; peek for the
2961 // trailing `=` and merge into `XAssign`. Skip whitespace
2962 // between `x` and `=` to mirror how Perl's lexer treats
2963 // `$s x= 3` and `$s x =3` identically.
2964 let tok = if matches!(tok, Token::X) {
2965 let saved_pos = self.pos;
2966 let saved_line = self.line;
2967 while matches!(self.peek(), Some(' ') | Some('\t')) {
2968 self.advance();
2969 }
2970 if self.peek() == Some('=') && self.peek_at(1) != Some('=') {
2971 self.advance();
2972 self.last_was_term = false;
2973 Token::XAssign
2974 } else {
2975 self.pos = saved_pos;
2976 self.line = saved_line;
2977 tok
2978 }
2979 } else {
2980 tok
2981 };
2982 if matches!(tok, Token::Ident(ref s) if s == "_") {
2983 self.last_was_bare_positional = true;
2984 }
2985 // Keywords that expect a variable next should not set last_was_term
2986 // so that % is parsed as hash sigil, not modulo
2987 self.last_was_term = match ident.as_str() {
2988 // Keywords/builtins that always expect arguments — never a term,
2989 // so the next `/` is always a regex start.
2990 "my"
2991 | "mysync"
2992 | "frozen"
2993 | "const"
2994 | "typed"
2995 | "our"
2996 | "oursync"
2997 | "local"
2998 | "state"
2999 | "return"
3000 | "print"
3001 | "pr"
3002 | "say"
3003 | "p"
3004 | "die"
3005 | "warn"
3006 | "push"
3007 | "pop"
3008 | "shift"
3009 | "shuffle"
3010 | "chunked"
3011 | "windowed"
3012 | "unshift"
3013 | "splice"
3014 | "delete"
3015 | "exists"
3016 | "chomp"
3017 | "chop"
3018 | "defined"
3019 | "keys"
3020 | "values"
3021 | "each"
3022 | "sub"
3023 | "struct"
3024 | "if"
3025 | "unless"
3026 | "while"
3027 | "until"
3028 | "for"
3029 | "foreach"
3030 | "elsif"
3031 | "use"
3032 | "no"
3033 | "require"
3034 | "eval"
3035 | "do"
3036 | "map"
3037 | "maps"
3038 | "flat_maps"
3039 | "grep"
3040 | "greps"
3041 | "sort"
3042 | "all"
3043 | "any"
3044 | "none"
3045 | "take_while"
3046 | "drop_while"
3047 | "skip_while"
3048 | "skip"
3049 | "first_or"
3050 | "tap"
3051 | "peek"
3052 | "with_index"
3053 | "pmap"
3054 | "pflat_map"
3055 | "puniq"
3056 | "pfirst"
3057 | "pany"
3058 | "pmap_chunked"
3059 | "pipeline"
3060 | "pgrep"
3061 | "pfor"
3062 | "par_lines"
3063 | "par_walk"
3064 | "pwatch"
3065 | "watch"
3066 | "psort"
3067 | "reduce"
3068 | "fold"
3069 | "inject"
3070 | "first"
3071 | "detect"
3072 | "find"
3073 | "find_all"
3074 | "preduce"
3075 | "preduce_init"
3076 | "pmap_reduce"
3077 | "pcache"
3078 | "fan"
3079 | "fan_cap"
3080 | "pchannel"
3081 | "pselect"
3082 | "uniq"
3083 | "distinct"
3084 | "flatten"
3085 | "set"
3086 | "list_count"
3087 | "list_size"
3088 | "count"
3089 | "len"
3090 | "size"
3091 | "cnt"
3092 | "zip"
3093 | "async"
3094 | "trace"
3095 | "timer"
3096 | "await"
3097 | "slurp"
3098 | "capture"
3099 | "fetch_url"
3100 | "fetch"
3101 | "fetch_json"
3102 | "fetch_async"
3103 | "fetch_async_json"
3104 | "par_fetch"
3105 | "par_csv_read"
3106 | "par_pipeline"
3107 | "par_pipeline_stream"
3108 | "par_sed"
3109 | "join"
3110 | "json_encode"
3111 | "json_decode"
3112 | "json_jq"
3113 | "jwt_encode"
3114 | "jwt_decode"
3115 | "jwt_decode_unsafe"
3116 | "log_info"
3117 | "log_warn"
3118 | "log_error"
3119 | "log_debug"
3120 | "log_trace"
3121 | "log_json"
3122 | "log_level"
3123 | "sha256"
3124 | "sha1"
3125 | "md5"
3126 | "hmac_sha256"
3127 | "hmac"
3128 | "uuid"
3129 | "base64_encode"
3130 | "base64_decode"
3131 | "hex_encode"
3132 | "hex_decode"
3133 | "gzip"
3134 | "gunzip"
3135 | "zstd"
3136 | "zstd_decode"
3137 | "datetime_utc"
3138 | "datetime_from_epoch"
3139 | "datetime_parse_rfc3339"
3140 | "datetime_strftime"
3141 | "toml_decode"
3142 | "toml_encode"
3143 | "yaml_decode"
3144 | "yaml_encode"
3145 | "url_encode"
3146 | "url_decode"
3147 | "uri_escape"
3148 | "uri_unescape"
3149 | "split"
3150 | "reverse"
3151 | "reversed"
3152 | "not"
3153 | "ref"
3154 | "scalar"
3155 | "try"
3156 | "catch"
3157 | "finally"
3158 | "given"
3159 | "when"
3160 | "default"
3161 | "eval_timeout"
3162 | "tie"
3163 | "retry"
3164 | "rate_limit"
3165 | "every"
3166 | "gen"
3167 | "yield"
3168 | "match"
3169 | "filter"
3170 | "f"
3171 | "reject"
3172 | "grepv"
3173 | "collect"
3174 | "compact"
3175 | "concat"
3176 | "chain"
3177 | "min_by"
3178 | "max_by"
3179 | "sort_by"
3180 | "tally"
3181 | "find_index"
3182 | "each_with_index"
3183 | "fore"
3184 | "e"
3185 | "ep"
3186 | "flat_map"
3187 | "group_by"
3188 | "chunk_by"
3189 | "bench" => false,
3190 // `thread`/`t` are ambiguous: at statement start they're the
3191 // thread keyword (expect args → false), but after an operator
3192 // they could be variable names (e.g., `$x / t / 2` → true).
3193 "thread" | "t" => !self.last_was_term,
3194 _ => matches!(tok, Token::Ident(_)),
3195 };
3196 Ok(tok)
3197 }
3198
3199 c => Err(self.syntax_err(format!("Unexpected character '{c}'"), self.line)),
3200 }
3201 }
3202
3203 /// Tokenize entire input.
3204 pub fn tokenize(&mut self) -> StrykeResult<Vec<(Token, usize)>> {
3205 let mut tokens = Vec::new();
3206 loop {
3207 // `next_token` internally skips whitespace + POD / heredoc and
3208 // stamps `self.token_start_line` to the line where the emitted
3209 // token actually starts. Use that, not a pre-call snapshot of
3210 // `self.line` — POD blocks can advance the line counter by
3211 // many lines before the real token is produced (see
3212 // `token_start_line` doc).
3213 let tok = self.next_token()?;
3214 let line = self.token_start_line;
3215 if self.last_was_bare_positional {
3216 self.bare_positional_indices.insert(tokens.len());
3217 }
3218 if tok == Token::Eof {
3219 tokens.push((Token::Eof, line));
3220 break;
3221 }
3222 tokens.push((tok, line));
3223 }
3224 Ok(tokens)
3225 }
3226}
3227
3228#[cfg(test)]
3229mod tests {
3230 use super::*;
3231 use crate::token::Token;
3232
3233 #[test]
3234 fn tokenize_empty_yields_eof() {
3235 let mut l = Lexer::new("");
3236 let t = l.tokenize().expect("tokenize");
3237 assert_eq!(t.len(), 1);
3238 assert!(matches!(t[0].0, Token::Eof));
3239 }
3240
3241 #[test]
3242 fn tokenize_integer_literal() {
3243 let mut l = Lexer::new("42");
3244 let t = l.tokenize().expect("tokenize");
3245 assert!(matches!(t[0].0, Token::Integer(42)));
3246 }
3247
3248 #[test]
3249 fn tokenize_keyword_my_and_semicolon() {
3250 let mut l = Lexer::new("my;");
3251 let t = l.tokenize().expect("tokenize");
3252 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "my"));
3253 assert!(matches!(t[1].0, Token::Semicolon));
3254 }
3255
3256 #[test]
3257 fn tokenize_skips_hash_line_comment() {
3258 let mut l = Lexer::new("1#comment\n2");
3259 let t = l.tokenize().expect("tokenize");
3260 assert!(matches!(t[0].0, Token::Integer(1)));
3261 assert!(matches!(t[1].0, Token::Integer(2)));
3262 assert!(matches!(t[2].0, Token::Eof));
3263 }
3264
3265 #[test]
3266 fn tokenize_double_quoted_string_literal() {
3267 let mut l = Lexer::new(r#""hi""#);
3268 let t = l.tokenize().expect("tokenize");
3269 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "hi"));
3270 }
3271
3272 #[test]
3273 fn tokenize_triple_quoted_interpolating_multiline() {
3274 // `"""..."""` — interpolating triple-quote. Newlines preserved
3275 // raw; interpolation flag stays on (Token::DoubleString flows
3276 // through the regular string-interp pipeline downstream).
3277 let mut l = Lexer::new("\"\"\"hello\nworld\nline\"\"\"");
3278 let t = l.tokenize().expect("tokenize");
3279 assert!(
3280 matches!(t[0].0, Token::DoubleString(ref s) if s == "hello\nworld\nline"),
3281 "got: {:?}",
3282 t[0].0
3283 );
3284 }
3285
3286 #[test]
3287 fn tokenize_triple_quoted_empty() {
3288 let mut l = Lexer::new("\"\"\"\"\"\"");
3289 let t = l.tokenize().expect("tokenize");
3290 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s.is_empty()));
3291 }
3292
3293 #[test]
3294 fn tokenize_triple_quoted_with_embedded_quotes() {
3295 // Two consecutive `""` inside a `"""..."""` body should not
3296 // close — only three `"""` in a row terminates.
3297 let mut l = Lexer::new("\"\"\"a \"\" b\"\"\"");
3298 let t = l.tokenize().expect("tokenize");
3299 assert!(
3300 matches!(t[0].0, Token::DoubleString(ref s) if s == "a \"\" b"),
3301 "got: {:?}",
3302 t[0].0
3303 );
3304 }
3305
3306 #[test]
3307 fn tokenize_raw_triple_quoted() {
3308 // `r"""..."""` — non-interpolating raw triple-quote. No escape
3309 // processing: `\n` stays as the two literal chars `\` and `n`.
3310 let mut l = Lexer::new("r\"\"\"raw \\n and $no_interp\"\"\"");
3311 let t = l.tokenize().expect("tokenize");
3312 assert!(
3313 matches!(t[0].0, Token::SingleString(ref s) if s == "raw \\n and $no_interp"),
3314 "got: {:?}",
3315 t[0].0
3316 );
3317 }
3318
3319 #[test]
3320 fn tokenize_r_bareword_not_triple_quote() {
3321 // Lone `r` (not followed by `"""`) is still a plain identifier.
3322 let mut l = Lexer::new("r => 5");
3323 let t = l.tokenize().expect("tokenize");
3324 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "r"));
3325 }
3326
3327 #[test]
3328 fn tokenize_unterminated_triple_quote_errors() {
3329 let mut l = Lexer::new("\"\"\"never closes");
3330 assert!(l.tokenize().is_err());
3331 }
3332
3333 #[test]
3334 fn tokenize_double_string_escaped_sigils_are_literal() {
3335 // `\$` in source becomes a sentinel + parser emits literal `$` (not outer interpolation).
3336 let mut l = Lexer::new(r#""my \$x""#);
3337 let t = l.tokenize().expect("tokenize");
3338 let want = format!("my {}x", LITERAL_DOLLAR_IN_DQUOTE);
3339 assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
3340 }
3341
3342 #[test]
3343 fn tokenize_double_string_braced_hex_unicode_escape() {
3344 let mut l = Lexer::new(r#""\x{1215}""#);
3345 let t = l.tokenize().expect("tokenize");
3346 let want: String = ['\u{1215}'].into_iter().collect();
3347 assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
3348 }
3349
3350 #[test]
3351 fn tokenize_double_string_braced_unicode_u_escape() {
3352 let mut l = Lexer::new(r#""\u{0301}""#);
3353 let t = l.tokenize().expect("tokenize");
3354 let want: String = ['\u{0301}'].into_iter().collect();
3355 assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
3356 }
3357
3358 #[test]
3359 fn tokenize_double_string_braced_unicode_u_escape_multi() {
3360 // \u{0041} = 'A', \u{00E9} = 'é', \u{1F600} = '😀'
3361 let mut l = Lexer::new(r#""\u{0041}\u{00E9}\u{1F600}""#);
3362 let t = l.tokenize().expect("tokenize");
3363 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "Aé😀"));
3364 }
3365
3366 #[test]
3367 fn tokenize_double_string_octal_escape() {
3368 let mut l = Lexer::new(r#""\101""#);
3369 let t = l.tokenize().expect("tokenize");
3370 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "A"));
3371 }
3372
3373 #[test]
3374 fn tokenize_double_string_braced_octal_escape() {
3375 let mut l = Lexer::new(r#""\o{101}""#);
3376 let t = l.tokenize().expect("tokenize");
3377 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "A"));
3378 }
3379
3380 #[test]
3381 fn tokenize_double_string_control_char_escape() {
3382 let mut l = Lexer::new(r#""\cA""#);
3383 let t = l.tokenize().expect("tokenize");
3384 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "\x01"));
3385 }
3386
3387 #[test]
3388 fn tokenize_double_string_named_unicode_escape() {
3389 let mut l = Lexer::new(r#""\N{SNOWMAN}""#);
3390 let t = l.tokenize().expect("tokenize");
3391 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "☃"));
3392 }
3393
3394 #[test]
3395 fn tokenize_double_string_named_unicode_u_plus() {
3396 let mut l = Lexer::new(r#""\N{U+2603}""#);
3397 let t = l.tokenize().expect("tokenize");
3398 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "☃"));
3399 }
3400
3401 #[test]
3402 fn tokenize_double_string_unbraced_hex_two_digits() {
3403 let mut l = Lexer::new(r#""\x41""#);
3404 let t = l.tokenize().expect("tokenize");
3405 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "A"));
3406 }
3407
3408 #[test]
3409 fn tokenize_single_quoted_string_literal() {
3410 let mut l = Lexer::new("'x'");
3411 let t = l.tokenize().expect("tokenize");
3412 assert!(matches!(t[0].0, Token::SingleString(ref s) if s == "x"));
3413 }
3414
3415 #[test]
3416 fn tokenize_spaceship_operator() {
3417 let mut l = Lexer::new("1 <=> 2");
3418 let t = l.tokenize().expect("tokenize");
3419 assert!(matches!(t[0].0, Token::Integer(1)));
3420 assert!(matches!(t[1].0, Token::Spaceship));
3421 assert!(matches!(t[2].0, Token::Integer(2)));
3422 }
3423
3424 #[test]
3425 fn tokenize_m_regex_literal() {
3426 let mut l = Lexer::new("m/abc/");
3427 let t = l.tokenize().expect("tokenize");
3428 assert!(matches!(t[0].0, Token::Regex(ref p, ref f, _) if p == "abc" && f.is_empty()));
3429 }
3430
3431 #[test]
3432 fn tokenize_q_brace_constructor() {
3433 let mut l = Lexer::new("q{lit}");
3434 let t = l.tokenize().expect("tokenize");
3435 assert!(matches!(t[0].0, Token::SingleString(ref s) if s == "lit"));
3436 }
3437
3438 /// `q(sub ($) { 1 })` — nested `()` must not end at the `)` in `($)` (core `Carp.pm`).
3439 #[test]
3440 fn tokenize_q_paren_balances_nested_parens_in_prototype() {
3441 let mut l = Lexer::new("q(fn ($) { 1 })");
3442 let t = l.tokenize().expect("tokenize");
3443 assert!(matches!(t[0].0, Token::SingleString(ref s) if s == "fn ($) { 1 }"));
3444 }
3445
3446 /// `qw( (SV*)x )` — nested `()` inside `qw(...)` (core `B.pm`).
3447 #[test]
3448 fn tokenize_qw_paren_balances_nested_parens() {
3449 let mut l = Lexer::new("qw( (SV*)pWARN_ALL )");
3450 let t = l.tokenize().expect("tokenize");
3451 assert!(matches!(t[0].0, Token::QW(ref w) if w.len() == 1 && w[0] == "(SV*)pWARN_ALL"));
3452 }
3453
3454 #[test]
3455 fn tokenize_float_literal() {
3456 let mut l = Lexer::new("3.25");
3457 let t = l.tokenize().expect("tokenize");
3458 assert!(matches!(t[0].0, Token::Float(f) if (f - 3.25).abs() < f64::EPSILON));
3459 }
3460
3461 #[test]
3462 fn tokenize_scientific_float() {
3463 let mut l = Lexer::new("1e2");
3464 let t = l.tokenize().expect("tokenize");
3465 assert!(matches!(t[0].0, Token::Float(f) if (f - 100.0).abs() < 1e-9));
3466 }
3467
3468 #[test]
3469 fn tokenize_hex_with_underscore_separators() {
3470 let mut l = Lexer::new("0x_FF");
3471 let t = l.tokenize().expect("tokenize");
3472 assert!(matches!(t[0].0, Token::Integer(255)));
3473 }
3474
3475 #[test]
3476 fn tokenize_qr_regex_with_flags() {
3477 let mut l = Lexer::new("qr/pat/i");
3478 let t = l.tokenize().expect("tokenize");
3479 assert!(matches!(t[0].0, Token::Regex(ref p, ref f, _) if p == "pat" && f == "i"));
3480 }
3481
3482 #[test]
3483 fn tokenize_m_slash_includes_gc_flags() {
3484 let mut l = Lexer::new("m/./gc");
3485 let t = l.tokenize().expect("tokenize");
3486 assert!(matches!(&t[0].0, Token::Regex(p, f, _) if p == "." && f == "gc"));
3487 }
3488
3489 #[test]
3490 fn tokenize_m_hash_delimiter_includes_gc_flags() {
3491 let mut l = Lexer::new("m#\\w#gc");
3492 let t = l.tokenize().expect("tokenize");
3493 assert!(matches!(&t[0].0, Token::Regex(p, f, _) if p == r"\w" && f == "gc"));
3494 }
3495
3496 #[test]
3497 fn tokenize_qr_slash_includes_gco_flags() {
3498 let mut l = Lexer::new("qr/x/gco");
3499 let t = l.tokenize().expect("tokenize");
3500 assert!(matches!(&t[0].0, Token::Regex(p, f, _) if p == "x" && f == "gco"));
3501 }
3502
3503 #[test]
3504 fn tokenize_qw_hash_delimiter_not_line_comment() {
3505 // `#` after `qw` must be the opener, not `skip_whitespace_and_comments` eating the line.
3506 let mut l = Lexer::new("qw# a b #;");
3507 let t = l.tokenize().expect("tokenize");
3508 assert!(
3509 matches!(&t[0].0, Token::QW(w) if w == &["a", "b"]),
3510 "first={:?}",
3511 t.first()
3512 );
3513 }
3514
3515 #[test]
3516 fn tokenize_qq_hash_delimiter_single_line() {
3517 let mut l = Lexer::new("qq#x#;");
3518 let t = l.tokenize().expect("tokenize");
3519 assert!(matches!(&t[0].0, Token::DoubleString(s) if s == "x"));
3520 }
3521
3522 #[test]
3523 fn tokenize_qr_hash_delimiter_text_balanced_preamble() {
3524 let src = "qr#(\n [!=]~\n | split|grep|map\n | not|and|or|xor\n)#x";
3525 let mut l = Lexer::new(src);
3526 let t = l.tokenize().expect("tokenize");
3527 let Token::Regex(p, f, _) = &t[0].0 else {
3528 panic!("expected Regex, got {:?}", t[0].0);
3529 };
3530 let rest: Vec<_> = t.iter().skip(1).take(8).map(|x| &x.0).collect();
3531 assert!(f.contains('x'), "flags={f:?} pattern={p:?} rest={rest:?}");
3532 assert!(p.contains("[!=]~"), "{p:?}");
3533 assert!(p.contains("split|grep|map"), "{p:?}");
3534 }
3535
3536 #[test]
3537 fn tokenize_octal_integer_literal() {
3538 let mut l = Lexer::new("010");
3539 let t = l.tokenize().expect("tokenize");
3540 assert!(matches!(t[0].0, Token::Integer(8)));
3541 }
3542
3543 #[test]
3544 fn tokenize_binary_integer_literal() {
3545 let mut l = Lexer::new("0b1010");
3546 let t = l.tokenize().expect("tokenize");
3547 assert!(matches!(t[0].0, Token::Integer(10)));
3548 }
3549
3550 #[test]
3551 fn tokenize_filetest_exists() {
3552 let mut l = Lexer::new("-e '.'");
3553 let t = l.tokenize().expect("tokenize");
3554 assert!(matches!(t[0].0, Token::FileTest('e')));
3555 assert!(matches!(t[1].0, Token::SingleString(ref s) if s == "."));
3556 }
3557
3558 #[test]
3559 fn tokenize_filetest_tty() {
3560 let mut l = Lexer::new("-t 'STDIN'");
3561 let t = l.tokenize().expect("tokenize");
3562 assert!(matches!(t[0].0, Token::FileTest('t')));
3563 assert!(matches!(t[1].0, Token::SingleString(ref s) if s == "STDIN"));
3564 }
3565
3566 #[test]
3567 fn tokenize_power_and_range_operators() {
3568 let mut l = Lexer::new("2 ** 3");
3569 let t = l.tokenize().expect("tokenize");
3570 assert!(matches!(t[0].0, Token::Integer(2)));
3571 assert!(matches!(t[1].0, Token::Power));
3572 assert!(matches!(t[2].0, Token::Integer(3)));
3573
3574 let mut l = Lexer::new("1..4");
3575 let t = l.tokenize().expect("tokenize");
3576 assert!(matches!(t[0].0, Token::Integer(1)));
3577 assert!(matches!(t[1].0, Token::Range));
3578 assert!(matches!(t[2].0, Token::Integer(4)));
3579 }
3580
3581 #[test]
3582 fn tokenize_numeric_equality_operators() {
3583 let mut l = Lexer::new("1 == 2");
3584 let t = l.tokenize().expect("tokenize");
3585 assert!(matches!(t[0].0, Token::Integer(1)));
3586 assert!(matches!(t[1].0, Token::NumEq));
3587 assert!(matches!(t[2].0, Token::Integer(2)));
3588
3589 let mut l = Lexer::new("3 != 4");
3590 let t = l.tokenize().expect("tokenize");
3591 assert!(matches!(t[0].0, Token::Integer(3)));
3592 assert!(matches!(t[1].0, Token::NumNe));
3593 assert!(matches!(t[2].0, Token::Integer(4)));
3594 }
3595
3596 #[test]
3597 fn tokenize_logical_and_or_plus_assign() {
3598 let mut l = Lexer::new("1 && 0");
3599 let t = l.tokenize().expect("tokenize");
3600 assert!(matches!(t[0].0, Token::Integer(1)));
3601 assert!(matches!(t[1].0, Token::LogAnd));
3602 assert!(matches!(t[2].0, Token::Integer(0)));
3603
3604 let mut l = Lexer::new("0 || 9");
3605 let t = l.tokenize().expect("tokenize");
3606 assert!(matches!(t[0].0, Token::Integer(0)));
3607 assert!(matches!(t[1].0, Token::LogOr));
3608 assert!(matches!(t[2].0, Token::Integer(9)));
3609
3610 let mut l = Lexer::new("n += 1");
3611 let t = l.tokenize().expect("tokenize");
3612 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "n"));
3613 assert!(matches!(t[1].0, Token::PlusAssign));
3614 assert!(matches!(t[2].0, Token::Integer(1)));
3615 }
3616
3617 #[test]
3618 fn tokenize_bitwise_and_operator() {
3619 let mut l = Lexer::new("3 & 5");
3620 let t = l.tokenize().expect("tokenize");
3621 assert!(matches!(t[0].0, Token::Integer(3)));
3622 assert!(matches!(t[1].0, Token::BitAnd));
3623 assert!(matches!(t[2].0, Token::Integer(5)));
3624 }
3625
3626 #[test]
3627 fn tokenize_braced_caret_scalar_global_phase() {
3628 let mut l = Lexer::new(r#"print ${^GLOBAL_PHASE}, "\n";"#);
3629 let t = l.tokenize().expect("tokenize");
3630 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "print"));
3631 assert!(matches!(t[1].0, Token::ScalarVar(ref s) if s == "^GLOBAL_PHASE"));
3632 assert!(matches!(t[2].0, Token::Comma));
3633 assert!(matches!(t[3].0, Token::DoubleString(ref s) if s == "\n"));
3634 assert!(matches!(t[4].0, Token::Semicolon));
3635 }
3636
3637 #[test]
3638 fn tokenize_bitwise_or_and_assign() {
3639 let mut l = Lexer::new("$a |= $b");
3640 let t = l.tokenize().expect("tokenize");
3641 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "a"));
3642 assert!(matches!(t[1].0, Token::BitOrAssign));
3643 assert!(matches!(t[2].0, Token::ScalarVar(ref s) if s == "b"));
3644
3645 let mut l = Lexer::new("$a &= $b");
3646 let t = l.tokenize().expect("tokenize");
3647 assert!(matches!(t[1].0, Token::BitAndAssign));
3648 }
3649
3650 #[test]
3651 fn tokenize_division_and_modulo() {
3652 let mut l = Lexer::new("7 / 2");
3653 let t = l.tokenize().expect("tokenize");
3654 assert!(matches!(t[1].0, Token::Slash));
3655
3656 let mut l = Lexer::new("7 % 3");
3657 let t = l.tokenize().expect("tokenize");
3658 assert!(matches!(t[1].0, Token::Percent));
3659 }
3660
3661 #[test]
3662 fn tokenize_comma_fat_arrow_and_semicolon() {
3663 let mut l = Lexer::new("a => 1;");
3664 let t = l.tokenize().expect("tokenize");
3665 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "a"));
3666 assert!(matches!(t[1].0, Token::FatArrow));
3667 assert!(matches!(t[2].0, Token::Integer(1)));
3668 assert!(matches!(t[3].0, Token::Semicolon));
3669 }
3670
3671 #[test]
3672 fn tokenize_minus_unary_vs_binary() {
3673 let mut l = Lexer::new("- 5");
3674 let t = l.tokenize().expect("tokenize");
3675 assert!(matches!(t[0].0, Token::Minus));
3676 assert!(matches!(t[1].0, Token::Integer(5)));
3677 }
3678
3679 #[test]
3680 fn tokenize_dollar_scalar_sigil() {
3681 let mut l = Lexer::new("$foo");
3682 let t = l.tokenize().expect("tokenize");
3683 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "foo"));
3684 }
3685
3686 /// `=` + letter is assignment unless `=` starts the line (POD). `$_=foo` must not skip POD.
3687 #[test]
3688 fn tokenize_assign_not_pod_when_eq_not_line_start() {
3689 let mut l = Lexer::new("$_=foo;");
3690 let t = l.tokenize().expect("tokenize");
3691 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "_"));
3692 assert!(matches!(t[1].0, Token::Assign));
3693 assert!(matches!(t[2].0, Token::Ident(ref s) if s == "foo"));
3694 assert!(matches!(t[3].0, Token::Semicolon));
3695 }
3696
3697 #[test]
3698 fn tokenize_pod_equals_still_skipped_at_line_start() {
3699 let mut l = Lexer::new("=head1 NAME\ncode\n=cut\n$x;");
3700 let t = l.tokenize().expect("tokenize");
3701 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "x"));
3702 assert!(matches!(t[1].0, Token::Semicolon));
3703 }
3704
3705 #[test]
3706 fn tokenize_at_array_sigil() {
3707 let mut l = Lexer::new("@arr");
3708 let t = l.tokenize().expect("tokenize");
3709 assert!(matches!(t[0].0, Token::ArrayVar(ref s) if s == "arr"));
3710 }
3711
3712 #[test]
3713 fn tokenize_at_caret_capture_array() {
3714 let mut l = Lexer::new("@^CAPTURE");
3715 let t = l.tokenize().expect("tokenize");
3716 assert!(matches!(t[0].0, Token::ArrayVar(ref s) if s == "^CAPTURE"));
3717 }
3718
3719 #[test]
3720 fn tokenize_percent_caret_hook_hash() {
3721 let mut l = Lexer::new("%^HOOK");
3722 let t = l.tokenize().expect("tokenize");
3723 assert!(matches!(t[0].0, Token::HashVar(ref s) if s == "^HOOK"));
3724 }
3725
3726 #[test]
3727 fn tokenize_caret_letter_and_at_minus_plus() {
3728 let mut l = Lexer::new("$^I@-@+");
3729 let t = l.tokenize().expect("tokenize");
3730 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "^I"));
3731 assert!(matches!(t[1].0, Token::ArrayVar(ref s) if s == "-"));
3732 assert!(matches!(t[2].0, Token::ArrayVar(ref s) if s == "+"));
3733 }
3734
3735 #[test]
3736 fn tokenize_percent_hash_sigil() {
3737 let mut l = Lexer::new("%h");
3738 let t = l.tokenize().expect("tokenize");
3739 assert!(matches!(t[0].0, Token::HashVar(ref s) if s == "h"));
3740 }
3741
3742 #[test]
3743 fn tokenize_percent_plus_named_capture_hash() {
3744 let mut l = Lexer::new("%+");
3745 let t = l.tokenize().expect("tokenize");
3746 assert!(matches!(t[0].0, Token::HashVar(ref s) if s == "+"));
3747 }
3748
3749 #[test]
3750 fn tokenize_dollar_dollar_under_brace_is_not_pid() {
3751 // `$$_{$k}` — second `$$` is not PID; tokenizes as `$_` then `{` (Perl `$_->{$k}`).
3752 let mut l = Lexer::new("$$_{$k}");
3753 let t = l.tokenize().expect("tokenize");
3754 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "_"));
3755 assert!(matches!(t[1].0, Token::LBrace));
3756 }
3757
3758 #[test]
3759 fn tokenize_braced_scalar_deref_try_tiny() {
3760 // `${$code_ref}` ≡ `$$code_ref` (Try::Tiny blesses scalar refs to coderefs).
3761 let mut l = Lexer::new("${$code_ref}");
3762 let t = l.tokenize().expect("tokenize");
3763 assert!(matches!(t[0].0, Token::DerefScalarVar(ref s) if s == "code_ref"));
3764 }
3765
3766 #[test]
3767 fn tokenize_braced_scalar_deref_package_qualified() {
3768 let mut l = Lexer::new("${$Foo::bar}");
3769 let t = l.tokenize().expect("tokenize");
3770 assert!(matches!(t[0].0, Token::DerefScalarVar(ref s) if s == "Foo::bar"));
3771 }
3772
3773 #[test]
3774 fn tokenize_dollar_colon_stash_brace() {
3775 // `$::{$k}` — `%::` main stash (core Carp.pm line 32).
3776 let mut l = Lexer::new("$::{$pack}");
3777 let t = l.tokenize().expect("tokenize");
3778 assert!(matches!(t[0].0, Token::ScalarVar(ref s) if s == "::"));
3779 assert!(matches!(t[1].0, Token::LBrace));
3780 }
3781
3782 #[test]
3783 fn tokenize_ampersand_then_ident_is_bitand_not_coderef() {
3784 // Subroutine coderef `&name` is not a distinct token; lexer emits `&` then ident.
3785 let mut l = Lexer::new("&f");
3786 let t = l.tokenize().expect("tokenize");
3787 assert!(matches!(t[0].0, Token::BitAnd));
3788 assert!(matches!(t[1].0, Token::Ident(ref s) if s == "f"));
3789 }
3790
3791 #[test]
3792 fn tokenize_qq_paren_constructor() {
3793 let mut l = Lexer::new("qq(x y)");
3794 let t = l.tokenize().expect("tokenize");
3795 assert!(matches!(t[0].0, Token::DoubleString(ref s) if s == "x y"));
3796 }
3797
3798 #[test]
3799 fn tokenize_qq_slash_escaped_dollar_is_literal() {
3800 let mut l = Lexer::new(r#"qq/my \$y/"#);
3801 let t = l.tokenize().expect("tokenize");
3802 let want = format!("my {}y", LITERAL_DOLLAR_IN_DQUOTE);
3803 assert!(matches!(t[0].0, Token::DoubleString(ref s) if *s == want));
3804 }
3805
3806 #[test]
3807 fn tokenize_s_substitution_alternate_delimiter() {
3808 let mut l = Lexer::new("s#a#b#");
3809 let t = l.tokenize().expect("tokenize");
3810 assert!(matches!(t[0].0, Token::Ident(ref s) if s.starts_with("\x00s\x00")));
3811 }
3812
3813 #[test]
3814 fn tokenize_tr_slash_delimiter() {
3815 let mut l = Lexer::new("tr/a/b/");
3816 let t = l.tokenize().expect("tokenize");
3817 assert!(matches!(t[0].0, Token::Ident(ref s) if s.starts_with("\x00tr\x00")));
3818 }
3819
3820 #[test]
3821 fn tokenize_y_synonym_for_tr() {
3822 let mut l = Lexer::new("y/x/y/");
3823 let t = l.tokenize().expect("tokenize");
3824 assert!(matches!(t[0].0, Token::Ident(ref s) if s.starts_with("\x00tr\x00")));
3825 }
3826
3827 #[test]
3828 fn tokenize_less_equal_greater_relops() {
3829 let mut l = Lexer::new("1 <= 2");
3830 let t = l.tokenize().expect("tokenize");
3831 assert!(matches!(t[1].0, Token::NumLe));
3832
3833 let mut l = Lexer::new("3 >= 2");
3834 let t = l.tokenize().expect("tokenize");
3835 assert!(matches!(t[1].0, Token::NumGe));
3836
3837 let mut l = Lexer::new("1 < 2");
3838 let t = l.tokenize().expect("tokenize");
3839 assert!(matches!(t[1].0, Token::NumLt));
3840
3841 let mut l = Lexer::new("3 > 2");
3842 let t = l.tokenize().expect("tokenize");
3843 assert!(matches!(t[1].0, Token::NumGt));
3844 }
3845
3846 #[test]
3847 fn tokenize_readline_scalar_handle() {
3848 let mut l = Lexer::new("<$fh>");
3849 let t = l.tokenize().expect("tokenize");
3850 assert!(matches!(t[0].0, Token::ReadLine(ref s) if s == "fh"));
3851 }
3852
3853 #[test]
3854 fn tokenize_shift_right_and_shift_left_assign() {
3855 let mut l = Lexer::new("8 >> 1");
3856 let t = l.tokenize().expect("tokenize");
3857 assert!(matches!(t[1].0, Token::ShiftRight));
3858
3859 let mut l = Lexer::new("8 << 1");
3860 let t = l.tokenize().expect("tokenize");
3861 assert!(matches!(t[1].0, Token::ShiftLeft));
3862
3863 let mut l = Lexer::new("x <<= 3");
3864 let t = l.tokenize().expect("tokenize");
3865 assert!(matches!(t[1].0, Token::ShiftLeftAssign));
3866 }
3867
3868 #[test]
3869 fn tokenize_heredoc_after_print_not_shift() {
3870 let src = "print <<EOT\nhi\nEOT\n";
3871 let mut l = Lexer::new(src);
3872 let t = l.tokenize().expect("tokenize");
3873 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "print"));
3874 assert!(
3875 matches!(&t[1].0, Token::HereDoc(tag, body, interpolate) if tag == "EOT" && body == "hi\n" && *interpolate),
3876 "got {:?}",
3877 t[1].0
3878 );
3879 }
3880
3881 #[test]
3882 fn tokenize_bitwise_or_xor() {
3883 let mut l = Lexer::new("3 | 1");
3884 let t = l.tokenize().expect("tokenize");
3885 assert!(matches!(t[1].0, Token::BitOr));
3886
3887 let mut l = Lexer::new("3 ^ 1");
3888 let t = l.tokenize().expect("tokenize");
3889 assert!(matches!(t[1].0, Token::BitXor));
3890 }
3891
3892 #[test]
3893 fn tokenize_pipe_forward_vs_bitor_vs_logor() {
3894 // `|>` must lex as a distinct token (not `|` followed by `>`).
3895 let mut l = Lexer::new("1 |> f");
3896 let t = l.tokenize().expect("tokenize");
3897 assert!(matches!(t[1].0, Token::PipeForward), "got {:?}", t[1].0);
3898
3899 // Make sure `|` and `||` still work alongside `|>`.
3900 let mut l = Lexer::new("a | b || c |> d");
3901 let t = l.tokenize().expect("tokenize");
3902 let kinds: Vec<_> = t.iter().map(|(k, _)| k.clone()).collect();
3903 assert!(kinds.iter().any(|k| matches!(k, Token::BitOr)));
3904 assert!(kinds.iter().any(|k| matches!(k, Token::LogOr)));
3905 assert!(kinds.iter().any(|k| matches!(k, Token::PipeForward)));
3906 }
3907
3908 #[test]
3909 fn tokenize_compare_and_three_way_string_ops() {
3910 let mut l = Lexer::new("\"a\" cmp \"b\"");
3911 let t = l.tokenize().expect("tokenize");
3912 assert!(matches!(t[1].0, Token::StrCmp));
3913 }
3914
3915 #[test]
3916 fn tokenize_package_double_colon_splits_qualified_name() {
3917 let mut l = Lexer::new("Foo::Bar::baz");
3918 let t = l.tokenize().expect("tokenize");
3919 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "Foo"));
3920 assert!(matches!(t[1].0, Token::PackageSep));
3921 assert!(matches!(t[2].0, Token::Ident(ref s) if s == "Bar"));
3922 assert!(matches!(t[3].0, Token::PackageSep));
3923 assert!(matches!(t[4].0, Token::Ident(ref s) if s == "baz"));
3924 }
3925
3926 #[test]
3927 fn tokenize_pod_line_skipped_like_comment_prefix() {
3928 // `=head1` at line start starts POD; lexer should skip until =cut
3929 let mut l = Lexer::new("=pod\n=cut\n42");
3930 let t = l.tokenize().expect("tokenize");
3931 assert!(matches!(t[0].0, Token::Integer(42)));
3932 }
3933
3934 #[test]
3935 fn tokenize_underscore_in_identifier() {
3936 let mut l = Lexer::new("__PACKAGE__");
3937 let t = l.tokenize().expect("tokenize");
3938 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "__PACKAGE__"));
3939 }
3940
3941 /// `x` is the repetition operator only in infix position; after `sub` it is a sub name (Perl).
3942 #[test]
3943 fn tokenize_x_repeat_vs_sub_name() {
3944 let mut l = Lexer::new("3 x 4");
3945 let t = l.tokenize().expect("tokenize");
3946 assert!(matches!(t[1].0, Token::X));
3947
3948 let mut l = Lexer::new("sub x { 1 }");
3949 let t = l.tokenize().expect("tokenize");
3950 assert!(matches!(t[0].0, Token::Ident(ref s) if s == "sub"));
3951 assert!(matches!(t[1].0, Token::Ident(ref s) if s == "x"));
3952 }
3953
/// Regression guard for the fat-arrow lookahead drift. After every
/// identifier the lexer speculatively calls `skip_whitespace_and_comments`
/// to peek for `=>`, and that peek bumps `self.line` for each newline it
/// crosses. The original bug rewound only `self.pos`, leaving the line
/// counter +1 per crossed newline, so later statements reported the wrong
/// source line and breakpoints on them were silently dropped. With the
/// save/restore fix the reported lines agree with the source.
#[test]
fn fat_arrow_lookahead_does_not_drift_line() {
    let mut lexer = Lexer::new("x\ny\nz\n");
    let tokens = lexer.tokenize().expect("tokenize");
    // One bareword per source line: (identifier, expected line, failure note).
    let expected = [
        ("x", 1, "x is on line 1"),
        ("y", 2, "y is on line 2"),
        ("z", 3, "z is on line 3"),
    ];
    for (idx, &(name, line, why)) in expected.iter().enumerate() {
        assert!(matches!(&tokens[idx].0, Token::Ident(found) if found == name));
        assert_eq!(tokens[idx].1, line, "{}", why);
    }
}
3974
/// The exact shape from the bug report: a class body holding a typed field,
/// followed by ordinary statements. The `=>` lookahead used to swallow the
/// `\n` after `Int`, and the recursive next_token / outer tokenize then
/// processed that same `\n` again, incrementing `self.line` twice. With the
/// fix, statements after the class body carry their true source lines.
#[test]
fn class_field_does_not_drift_subsequent_statement_line() {
    let source = "class Foo {\n x: Int\n}\nmy $y = 1\np $y\n";
    let mut lexer = Lexer::new(source);
    let tokens = lexer.tokenize().expect("tokenize");
    // Locate the first `my` identifier, which follows the class body.
    let my_line = tokens
        .iter()
        .find(|(tok, _)| matches!(tok, Token::Ident(word) if word == "my"))
        .map(|(_, line)| *line)
        .expect("expected `my` token");
    assert_eq!(my_line, 4, "my $y = 1 lives on source line 4");
}
3995
/// Companion to `fat_arrow_lookahead_does_not_drift_line`, covering the `qw`
/// bareword-vs-quote-operator lookahead. A `qw` followed by terminators
/// (here a newline and a closing paren) lexes as an Ident rather than a
/// `qw(...)` list. The lookahead's `skip_whitespace_only` call bumps
/// `self.line` for the `\n` it consumes; unless `self.line` is restored,
/// the next token's line is off by one.
#[test]
fn qw_bareword_lookahead_does_not_drift_line() {
    // `qw` is treated as a bareword because the next significant character
    // is `)`, which sits on its own line ahead of a fresh statement. The
    // newline between `qw` and that statement must not bump the counter.
    let source = "(qw\n)\nmy $x = 1\n";
    let mut lexer = Lexer::new(source);
    let tokens = lexer.tokenize().expect("tokenize");
    let my_index = tokens
        .iter()
        .position(|(tok, _)| matches!(tok, Token::Ident(word) if word == "my"))
        .expect("expected `my` token");
    let my_line = tokens[my_index].1;
    assert_eq!(my_line, 3, "my $x = 1 lives on source line 3");
}
4019
/// A bareword `m` (the parser deals with `$obj->m`, sort `m`-prefix, and so
/// on) takes the same skip-whitespace-only lookahead path as `qw`. Newlines
/// between `m` and the next non-terminator must not leave the lexer's
/// `self.line` drifted once the lookahead is undone.
#[test]
fn m_bareword_lookahead_does_not_drift_line() {
    // In `$obj->m` the `m` after `->` is a method name; the token on the
    // following line has to keep its own source line.
    let source = "$obj->m\nmy $y = 2\n";
    let mut lexer = Lexer::new(source);
    let tokens = lexer.tokenize().expect("tokenize");
    let my_line = tokens
        .iter()
        .find_map(|(tok, line)| {
            if matches!(tok, Token::Ident(word) if word == "my") {
                Some(*line)
            } else {
                None
            }
        })
        .expect("expected `my` token");
    assert_eq!(my_line, 2, "my $y = 2 lives on source line 2");
}
4040
/// A bareword `_N` (underscore followed by at least one digit) lexes to
/// `ScalarVar(_N)`, the implicit-closure-positional name that needs no
/// sigil. The lexer special-cases these so the parser treats them as
/// variables rather than as undefined sub names.
#[test]
fn bareword_underscore_n_lexes_as_scalar_var() {
    let mut lexer = Lexer::new("_1");
    let tokens = lexer.tokenize().expect("tokenize");
    let first = &tokens[0].0;
    assert!(
        matches!(first, Token::ScalarVar(name) if name == "_1"),
        "_1 → ScalarVar(_1): got {:?}",
        first
    );
}
4055
#[test]
fn bareword_underscore_n_two_digits_lexes_as_scalar_var() {
    // Multi-digit suffixes follow the same rule as single-digit `_N`.
    let mut lexer = Lexer::new("_42");
    let tokens = lexer.tokenize().expect("tokenize");
    let first = &tokens[0].0;
    assert!(
        matches!(first, Token::ScalarVar(name) if name == "_42"),
        "_42 → ScalarVar(_42): got {:?}",
        first
    );
}
4066
/// A lone `_` with no digit after it remains an Ident, so the existing
/// topic / bareword-call handling is unaffected: depending on context `_`
/// may be a sub call, a bareword filename, and so on.
#[test]
fn bare_underscore_alone_lexes_as_ident() {
    let mut lexer = Lexer::new("_");
    let tokens = lexer.tokenize().expect("tokenize");
    let first = &tokens[0].0;
    assert!(
        matches!(first, Token::Ident(name) if name == "_"),
        "_ → Ident(_): got {:?}",
        first
    );
}
4080
/// `_foo` is an ordinary user-defined identifier, not a topic alias: the
/// positional slot family reserves only the `_<digits>` spelling.
#[test]
fn underscore_prefix_word_is_plain_ident() {
    let mut lexer = Lexer::new("_foo");
    let tokens = lexer.tokenize().expect("tokenize");
    let first = &tokens[0].0;
    assert!(
        matches!(first, Token::Ident(name) if name == "_foo"),
        "_foo → Ident(_foo): got {:?}",
        first
    );
}
4093
/// Skipping a POD block (`=pod ... =cut`) advances `self.line` by many
/// lines. The original tokenize loop captured `line` *before* calling
/// next_token and pushed (token, captured_line), so the first token after
/// the POD was stamped with the pre-POD line instead of its real post-skip
/// line. next_token now records `self.token_start_line` after the skip and
/// tokenize reads that value.
#[test]
fn token_after_pod_block_uses_post_skip_line() {
    // Four POD lines, then the statement under test on source line 5.
    let source = concat!(
        "=pod\n",
        "big POD\n",
        "block here\n",
        "=cut\n",
        "my $x = 1\n",
    );
    let mut lexer = Lexer::new(source);
    let tokens = lexer.tokenize().expect("tokenize");
    // The first token that is not Eof should be `my`, reported on line 5.
    let first = tokens
        .iter()
        .skip_while(|(tok, _)| matches!(tok, Token::Eof))
        .next()
        .expect("non-empty tokens");
    assert!(matches!(&first.0, Token::Ident(word) if word == "my"));
    assert_eq!(
        first.1, 5,
        "my $x lives on source line 5 (after POD ends)"
    );
}
4122}